opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py
@@ -0,0 +1,181 @@
+from __future__ import annotations
+
+from typing import Optional, Union
+
+from opik.evaluation.metrics.llm_judges.g_eval import metric as g_eval_metric
+from opik.evaluation.models import base_model
+
+
+class DemographicBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Score demographic stereotyping or bias in a response.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import DemographicBiasJudge
+        >>> judge = DemographicBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="People from X group are always late.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.95
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_demographic",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="demographic_bias_judge",
+        )
+
+
+class PoliticalBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Detect partisan or ideological bias in a response.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import PoliticalBiasJudge
+        >>> judge = PoliticalBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="Vote for candidate X because Y is corrupt") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.87
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_political",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="political_bias_judge",
+        )
+
+
+class GenderBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Detect gender stereotyping or exclusion in generated text.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import GenderBiasJudge
+        >>> judge = GenderBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="Women are naturally worse at math.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.93
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_gender",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="gender_bias_judge",
+        )
+
+
+class ReligiousBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Evaluate responses for religious bias or disrespectful language.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import ReligiousBiasJudge
+        >>> judge = ReligiousBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="Believers of X are all foolish.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.9
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_religion",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="religious_bias_judge",
+        )
+
+
+class RegionalBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Assess geographic or cultural bias in responses.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import RegionalBiasJudge
+        >>> judge = RegionalBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="People from region Z are lazy.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.88
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_regional",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="regional_bias_judge",
+        )
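All five bias presets above share the same constructor, so they can be instantiated and scored interchangeably. A minimal usage sketch based on the docstrings in this hunk; the model id and the sample output string are illustrative placeholders, not values required by the package:

    from opik.evaluation.metrics import DemographicBiasJudge, GenderBiasJudge

    # Any model identifier string or an OpikBaseModel instance should be accepted,
    # per the constructor signature shown above.
    judges = [
        DemographicBiasJudge(model="gpt-4", track=False),
        GenderBiasJudge(model="gpt-4", track=False),
    ]

    output = "People from X group are always late."
    for judge in judges:
        result = judge.score(output=output)  # returns a ScoreResult
        print(judge.name, result.value)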
opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from typing import Optional, Union
+
+from opik.evaluation.metrics.llm_judges.g_eval import metric as g_eval_metric
+from opik.evaluation.models import base_model
+
+
+class ComplianceRiskJudge(g_eval_metric.GEvalPreset):
+    """
+    Evaluate responses for non-compliant or misleading claims in regulated sectors.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import ComplianceRiskJudge
+        >>> judge = ComplianceRiskJudge(model="gpt-4")
+        >>> result = judge.score(output="This pill cures diabetes in a week.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.97
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="compliance_regulated_truthfulness",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="compliance_risk_judge",
+        )
opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from typing import Optional, Union
+
+from opik.evaluation.metrics.llm_judges.g_eval import metric as g_eval_metric
+from opik.evaluation.models import base_model
+
+
+class PromptUncertaintyJudge(g_eval_metric.GEvalPreset):
+    """
+    Rate how ambiguous or underspecified a prompt feels to the model.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import PromptUncertaintyJudge
+        >>> judge = PromptUncertaintyJudge(model="gpt-4")
+        >>> result = judge.score(output="Do the right thing in the best way possible.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.8
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="prompt_uncertainty",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="prompt_uncertainty_judge",
+        )
opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+from typing import Optional, Union
+
+from opik.evaluation.metrics.llm_judges.g_eval import metric as g_eval_metric
+from opik.evaluation.models import base_model
+
+
+class SummarizationConsistencyJudge(g_eval_metric.GEvalPreset):
+    """
+    Score how faithful a summary is to its source content.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import SummarizationConsistencyJudge
+        >>> judge = SummarizationConsistencyJudge(model="gpt-4")
+        >>> result = judge.score(output="Summary omits key fact.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.4
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="summarization_consistency",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="summarization_consistency_judge",
+        )
+
+
+class SummarizationCoherenceJudge(g_eval_metric.GEvalPreset):
+    """
+    Evaluate the coherence and structure of generated summaries.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import SummarizationCoherenceJudge
+        >>> judge = SummarizationCoherenceJudge(model="gpt-4")
+        >>> result = judge.score(output="Summary jumps between unrelated topics.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.5
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="summarization_coherence",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="summarization_coherence_judge",
+        )
+
+
+class DialogueHelpfulnessJudge(g_eval_metric.GEvalPreset):
+    """
+    Judge how helpful an assistant reply is within a dialogue.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import DialogueHelpfulnessJudge
+        >>> judge = DialogueHelpfulnessJudge(model="gpt-4")
+        >>> result = judge.score(output="Assistant politely refuses without help.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.3
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="dialogue_helpfulness",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="dialogue_helpfulness_judge",
+        )
+
+
+class QARelevanceJudge(g_eval_metric.GEvalPreset):
+    """
+    Check whether an answer directly addresses the user question.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import QARelevanceJudge
+        >>> judge = QARelevanceJudge(model="gpt-4")
+        >>> result = judge.score(output="Answer rambles without addressing the ask.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.2
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="qa_relevance",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="qa_relevance_judge",
+        )
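Because these presets subclass GEvalPreset (and so BaseMetric), they can presumably be passed wherever existing metrics are accepted, for example as scoring_metrics in the evaluate() entry point. A hedged sketch only; the dataset name "qa-samples", the task function, and the "expected_output" item key are hypothetical, and the model id is a placeholder:

    from opik import Opik
    from opik.evaluation import evaluate
    from opik.evaluation.metrics import QARelevanceJudge, SummarizationConsistencyJudge

    client = Opik()
    dataset = client.get_dataset(name="qa-samples")  # hypothetical dataset name

    def task(item):
        # Stand-in application under test; real code would call an LLM here.
        return {"output": item["expected_output"]}

    evaluate(
        dataset=dataset,
        task=task,
        scoring_metrics=[
            QARelevanceJudge(model="gpt-4"),
            SummarizationConsistencyJudge(model="gpt-4"),
        ],
    )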
opik/evaluation/metrics/llm_judges/hallucination/metric.py
@@ -27,6 +27,8 @@ class Hallucination(base_metric.BaseMetric):
         track: Whether to track the metric. Defaults to True.
         project_name: Optional project name to track the metric in for the cases when
             there are no parent span/trace to inherit project name from.
+        seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
+        temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.

     Example:
         >>> from opik.evaluation.metrics import Hallucination
@@ -49,18 +51,29 @@ class Hallucination(base_metric.BaseMetric):
         few_shot_examples: Optional[List[template.FewShotExampleHallucination]] = None,
         track: bool = True,
         project_name: Optional[str] = None,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
     ):
         super().__init__(name=name, track=track, project_name=project_name)
-        self._init_model(model)
+        self._seed = seed
+        self._init_model(model, temperature=temperature)
         self.few_shot_examples = few_shot_examples

     def _init_model(
-        self, model: Optional[Union[str, base_model.OpikBaseModel]]
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]],
+        temperature: Optional[float],
     ) -> None:
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            model_kwargs = {}
+            if temperature is not None:
+                model_kwargs["temperature"] = temperature
+            if self._seed is not None:
+                model_kwargs["seed"] = self._seed
+
+            self._model = models_factory.get(model_name=model, **model_kwargs)

     def score(
         self,
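The new seed and temperature arguments only take effect when a model name string is supplied (they are forwarded to models_factory.get); an explicit OpikBaseModel instance is used as-is. A small sketch of pinning both for more reproducible judging; the model id and the question/answer strings are placeholders:

    from opik.evaluation.metrics import Hallucination

    # seed/temperature are passed through to the factory-built judge model.
    metric = Hallucination(model="gpt-4", seed=42, temperature=0.0)
    result = metric.score(
        input="What is the capital of France?",
        output="The capital of France is Lyon.",
    )
    print(result.value, result.reason)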
opik/evaluation/metrics/llm_judges/llm_juries/__init__.py
@@ -0,0 +1,3 @@
+from .metric import LLMJuriesJudge
+
+__all__ = ["LLMJuriesJudge"]

opik/evaluation/metrics/llm_judges/llm_juries/metric.py
@@ -0,0 +1,76 @@
+"""LLM Juries: aggregate heterogeneous judges into a consensus score."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, List, Optional
+
+from opik.evaluation.metrics.base_metric import BaseMetric
+from opik.evaluation.metrics.score_result import ScoreResult
+import opik.exceptions as exceptions
+
+
+class LLMJuriesJudge(BaseMetric):
+    """
+    Aggregate multiple judge metrics into a consensus score.
+
+    Args:
+        judges: Iterable of judge metrics to execute for consensus.
+        name: Display name for the aggregated result. Defaults to
+            ``"llm_juries_judge"``.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name. Defaults to ``None``.
+
+    Example:
+        >>> from opik.evaluation.metrics import LLMJuriesJudge, ComplianceRiskJudge
+        >>> juries = LLMJuriesJudge(judges=[ComplianceRiskJudge(model="gpt-4")])
+        >>> result = juries.score(output="Financial guarantees provided.") # doctest: +SKIP
+        >>> result.value # doctest: +SKIP
+        0.6
+    """
+
+    def __init__(
+        self,
+        judges: Iterable[BaseMetric],
+        name: str = "llm_juries_judge",
+        track: bool = True,
+        project_name: Optional[str] = None,
+    ) -> None:
+        super().__init__(name=name, track=track, project_name=project_name)
+        self._judges = list(judges)
+        if not self._judges:
+            raise ValueError("LLMJuriesJudge requires at least one judge metric.")
+
+    def score(self, *args: Any, **kwargs: Any) -> ScoreResult:
+        precomputed: Optional[Dict[BaseMetric, ScoreResult]] = kwargs.pop(
+            "precomputed", None
+        )
+        scores: List[ScoreResult] = []
+        for judge in self._judges:
+            if precomputed is not None and judge in precomputed:
+                raw_result: Any = precomputed[judge]
+            else:
+                raw_result = judge.score(*args, **kwargs)
+            judge_results = raw_result if isinstance(raw_result, list) else [raw_result]
+
+            for result in judge_results:
+                if not isinstance(result, ScoreResult):
+                    raise exceptions.MetricComputationError(
+                        f"Judge {judge.name} returned unexpected result type {type(result)!r}"
+                    )
+                if result.value < 0 or result.value > 1:
+                    raise exceptions.MetricComputationError(
+                        f"Judge {judge.name} returned out-of-range score {result.value}"
+                    )
+                scores.append(result)
+
+        if not scores:
+            raise exceptions.MetricComputationError("No judge scores produced")
+
+        average = sum(res.value for res in scores) / len(scores)
+        metadata = {
+            "judge_scores": {res.name: res.value for res in scores},
+        }
+        reason = f"Averaged {len(scores)} judge scores"
+        return ScoreResult(
+            value=average, name=self.name, reason=reason, metadata=metadata
+        )
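LLMJuriesJudge fans the same score() arguments out to every configured judge, validates that each result is a ScoreResult in [0, 1], and returns the plain average, keeping the per-judge values in metadata["judge_scores"]. A hedged usage sketch; the model id and the sample output are placeholders:

    from opik.evaluation.metrics import (
        ComplianceRiskJudge,
        LLMJuriesJudge,
        PromptUncertaintyJudge,
    )

    jury = LLMJuriesJudge(
        judges=[
            ComplianceRiskJudge(model="gpt-4"),
            PromptUncertaintyJudge(model="gpt-4"),
        ]
    )
    verdict = jury.score(output="This investment is guaranteed to double your money.")
    print(verdict.value)                     # consensus average
    print(verdict.metadata["judge_scores"])  # per-judge breakdown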
opik/evaluation/metrics/llm_judges/moderation/metric.py
@@ -26,6 +26,8 @@ class Moderation(base_metric.BaseMetric):
         track: Whether to track the metric. Defaults to True.
         project_name: Optional project name to track the metric in for the cases when
             there are no parent span/trace to inherit project name from.
+        seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
+        temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.

     Example:
         >>> from opik.evaluation.metrics import Moderation
@@ -42,23 +44,33 @@ class Moderation(base_metric.BaseMetric):
         few_shot_examples: Optional[List[template.FewShotExampleModeration]] = None,
         track: bool = True,
         project_name: Optional[str] = None,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
     ):
         super().__init__(
             name=name,
             track=track,
             project_name=project_name,
         )
-
-        self._init_model(model)
+        self._seed = seed
+        self._init_model(model, temperature=temperature)
         self.few_shot_examples = [] if few_shot_examples is None else few_shot_examples

     def _init_model(
-        self, model: Optional[Union[str, base_model.OpikBaseModel]]
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]],
+        temperature: Optional[float],
     ) -> None:
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            model_kwargs = {}
+            if temperature is not None:
+                model_kwargs["temperature"] = temperature
+            if self._seed is not None:
+                model_kwargs["seed"] = self._seed
+
+            self._model = models_factory.get(model_name=model, **model_kwargs)

     def score(self, output: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
         """