opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py
@@ -0,0 +1,442 @@
+ """Conversation-level adapters for GEval-based judges."""
+
+ from __future__ import annotations
+
+ from typing import Any, Optional
+
+ import opik.exceptions as exceptions
+
+ from opik.evaluation.metrics import score_result
+ from opik.evaluation.metrics.conversation import types as conversation_types
+ from opik.evaluation.metrics.base_metric import BaseMetric
+ from opik.evaluation.metrics.conversation.conversation_thread_metric import (
+     ConversationThreadMetric,
+ )
+ from opik.evaluation.metrics.llm_judges.g_eval_presets import (
+     compliance_risk as compliance_presets,
+     prompt_uncertainty as prompt_presets,
+     qa_suite as qa_presets,
+ )
+
+
+ class GEvalConversationMetric(ConversationThreadMetric):
+     """
+     Wrap a GEval-style judge so it can evaluate an entire conversation transcript.
+
+     The wrapper extracts the latest assistant turn from the conversation and sends
+     it to the provided judge. Results are normalised into a ``ScoreResult`` so they
+     can plug into the wider Opik evaluation pipeline. Any errors raised by the
+     underlying judge are captured and reported as a failed score computation.
+
+     The conversation input should match :class:`ConversationThreadMetric`
+     semantics—a list of dicts containing ``role`` and ``content`` keys ordered by
+     time.
+
+     Args:
+         judge: A GEval-compatible metric instance that accepts ``output`` as its
+             primary argument.
+         name: Optional override for the metric name used in results. When ``None``
+             the name is derived from the wrapped judge.
+
+     Returns:
+         ScoreResult: Mirrors the wrapped judge's score/value/metadata fields. When
+             the judge fails, ``scoring_failed`` is set and ``value`` is ``0.0``.
+
+     Example:
+         >>> from opik.evaluation.metrics.conversation.llm_judges.g_eval_wrappers import (
+         ...     GEvalConversationMetric,
+         ... )
+         >>> from opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite import DialogueHelpfulnessJudge
+         >>> conversation = [
+         ...     {"role": "user", "content": "Summarise these notes."},
+         ...     {"role": "assistant", "content": "Here is a concise summary..."},
+         ... ]
+         >>> metric = GEvalConversationMetric(judge=DialogueHelpfulnessJudge(model="gpt-4"))
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.83
+     """
+
+     def __init__(
+         self,
+         judge: BaseMetric,
+         name: Optional[str] = None,
+     ) -> None:
+         super().__init__(
+             name=name or f"conversation_{judge.name}",
+             track=getattr(judge, "track", True),
+             project_name=getattr(judge, "project_name", None),
+         )
+         self._judge = judge
+
+     def _normalize_result(self, raw_result: Any) -> score_result.ScoreResult:
+         if isinstance(raw_result, score_result.ScoreResult):
+             return raw_result
+         if isinstance(raw_result, list):
+             if not raw_result:
+                 raise exceptions.MetricComputationError(
+                     "Judge returned an empty list of results."
+                 )
+             first = raw_result[0]
+             if isinstance(first, score_result.ScoreResult):
+                 return first
+         raise exceptions.MetricComputationError(
+             f"Judge {self._judge.name} returned unsupported result type {type(raw_result)!r}"
+         )
+
+     def score(
+         self,
+         conversation: conversation_types.Conversation,
+         **_: Any,
+     ) -> score_result.ScoreResult:
+         """
+         Evaluate the final assistant turn in a conversation.
+
+         Args:
+             conversation: Sequence of dict-like turns containing ``role`` and
+                 ``content`` keys. Only assistant turns with non-empty ``content``
+                 are considered.
+
+         Returns:
+             ScoreResult: Normalised output from the wrapped judge. If no assistant
+                 message is present, the result is marked as failed with ``value=0.0``.
+         """
+         last_assistant = next(
+             (
+                 turn.get("content", "")
+                 for turn in reversed(conversation)
+                 if turn.get("role") == "assistant"
+             ),
+             "",
+         )
+         if not last_assistant.strip():
+             return score_result.ScoreResult(
+                 name=self.name,
+                 value=0.0,
+                 reason="Conversation contains no assistant messages to evaluate.",
+                 scoring_failed=True,
+             )
+
+         try:
+             raw_result = self._judge.score(output=last_assistant)
+         except exceptions.MetricComputationError as error:
+             reason = str(error)
+         except Exception as error:
+             reason = (
+                 f"Judge {self._judge.name} raised {error.__class__.__name__}: {error}"
+             )
+         else:
+             judge_result = self._normalize_result(raw_result)
+             return score_result.ScoreResult(
+                 name=self.name,
+                 value=judge_result.value,
+                 reason=judge_result.reason,
+                 metadata=judge_result.metadata,
+                 scoring_failed=judge_result.scoring_failed,
+             )
+
+         return score_result.ScoreResult(
+             name=self.name,
+             value=0.0,
+             reason=reason,
+             scoring_failed=True,
+         )
+
+
+ class ConversationComplianceRiskMetric(GEvalConversationMetric):
+     """
+     Evaluate the latest assistant response for compliance and risk exposure.
+
+     This metric forwards the final assistant turn to
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.compliance_risk.ComplianceRiskJudge`
+     and returns its assessment as a conversation-level ``ScoreResult``.
+
+     Args:
+         model: Optional model name or identifier understood by the judge.
+         track: Whether to automatically track metric results. Defaults to ``True``.
+         project_name: Optional tracking project name. Defaults to ``None``.
+         temperature: Sampling temperature supplied to the underlying judge model.
+
+     Returns:
+         ScoreResult: Compliance score emitted by the wrapped judge; failed
+             evaluations set ``scoring_failed`` and ``value=0.0``.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationComplianceRiskMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Generate an employment contract."},
+         ...     {"role": "assistant", "content": "Here is a standard contract template..."},
+         ... ]
+         >>> metric = ConversationComplianceRiskMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.12
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=compliance_presets.ComplianceRiskJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_compliance_risk",
+         )
+
+
+ class ConversationDialogueHelpfulnessMetric(GEvalConversationMetric):
+     """
+     Score how helpful the closing assistant message is within the dialogue.
+
+     The metric expects the same conversation shape as
+     :class:`ConversationThreadMetric`. It uses
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite.DialogueHelpfulnessJudge`
+     to evaluate usefulness and responsiveness of the final assistant turn.
+
+     Args:
+         model: Optional model name passed to the judge.
+         track: Whether to automatically track results. Defaults to ``True``.
+         project_name: Optional tracking project. Defaults to ``None``.
+         temperature: Temperature fed into the judge's underlying model.
+
+     Returns:
+         ScoreResult: Helpfulness score from the wrapped judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationDialogueHelpfulnessMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "How do I reset my password?"},
+         ...     {"role": "assistant", "content": "Click the reset link and follow the steps."},
+         ... ]
+         >>> metric = ConversationDialogueHelpfulnessMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.88
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=qa_presets.DialogueHelpfulnessJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_dialogue_helpfulness",
+         )
+
+
+ class ConversationQARelevanceMetric(GEvalConversationMetric):
+     """
+     Quantify how relevant the assistant's final answer is to the preceding query.
+
+     This metric expects a conversation sequence compatible with
+     :class:`ConversationThreadMetric` and wraps
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite.QARelevanceJudge`;
+     it is useful when the conversation emulates a Q&A exchange.
+
+     Args:
+         model: Optional model name used by the judge backend.
+         track: Whether to automatically track outcomes. Defaults to ``True``.
+         project_name: Optional project for tracked scores. Defaults to ``None``.
+         temperature: Judge sampling temperature.
+
+     Returns:
+         ScoreResult: Relevance score from the judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationQARelevanceMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Who wrote Dune?"},
+         ...     {"role": "assistant", "content": "Frank Herbert wrote Dune."},
+         ... ]
+         >>> metric = ConversationQARelevanceMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         1.0
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=qa_presets.QARelevanceJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_qa_relevance",
+         )
+
+
+ class ConversationSummarizationCoherenceMetric(GEvalConversationMetric):
+     """
+     Assess the coherence of a summary-style assistant response.
+
+     The metric expects the conversation schema defined by
+     :class:`ConversationThreadMetric` and invokes
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite.SummarizationCoherenceJudge`
+     to rate whether the summary flows naturally and captures the conversation
+     structure.
+
+     Args:
+         model: Optional model name or identifier for the judge.
+         track: Whether to track metric results automatically. Defaults to ``True``.
+         project_name: Optional project name for tracked scores. Defaults to ``None``.
+         temperature: Sampling temperature passed to the judge model.
+
+     Returns:
+         ScoreResult: Coherence score from the judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationSummarizationCoherenceMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Summarise this chat."},
+         ...     {"role": "assistant", "content": "Summary: we discussed timelines and budgets."},
+         ... ]
+         >>> metric = ConversationSummarizationCoherenceMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.91
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=qa_presets.SummarizationCoherenceJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_summarization_coherence",
+         )
+
+
+ class ConversationSummarizationConsistencyMetric(GEvalConversationMetric):
+     """
+     Check whether a dialogue summary stays faithful to the source turns.
+
+     The metric assumes the standard conversation schema and delegates scoring to
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite.SummarizationConsistencyJudge`
+     and reports the result at the conversation level.
+
+     Args:
+         model: Optional model name passed through to the judge.
+         track: Whether to automatically track results. Defaults to ``True``.
+         project_name: Optional tracking project. Defaults to ``None``.
+         temperature: Temperature parameter supplied to the judge model.
+
+     Returns:
+         ScoreResult: Consistency score from the judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationSummarizationConsistencyMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Give me a summary."},
+         ...     {"role": "assistant", "content": "Summary: project ships next week."},
+         ... ]
+         >>> metric = ConversationSummarizationConsistencyMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.95
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=qa_presets.SummarizationConsistencyJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_summarization_consistency",
+         )
+
+
+ class ConversationPromptUncertaintyMetric(GEvalConversationMetric):
+     """
+     Measure how uncertain the assistant appears about executing the prompt.
+
+     The metric expects the standard conversation schema and pipes the latest
+     assistant reply into
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.prompt_uncertainty.PromptUncertaintyJudge`
+     and returns the judge's score in a conversation-friendly format.
+
+     Args:
+         model: Optional model name for the judge to use.
+         track: Whether to automatically track the metric. Defaults to ``True``.
+         project_name: Optional tracking project. Defaults to ``None``.
+         temperature: Sampling temperature for the judge model.
+
+     Returns:
+         ScoreResult: Uncertainty score from the judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationPromptUncertaintyMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Follow the brief precisely."},
+         ...     {"role": "assistant", "content": "I'm not fully certain which part to prioritise."},
+         ... ]
+         >>> metric = ConversationPromptUncertaintyMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.42
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=prompt_presets.PromptUncertaintyJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_prompt_uncertainty",
+         )
+
+
+ __all__ = [
+     "GEvalConversationMetric",
+     "ConversationComplianceRiskMetric",
+     "ConversationDialogueHelpfulnessMetric",
+     "ConversationQARelevanceMetric",
+     "ConversationSummarizationCoherenceMetric",
+     "ConversationSummarizationConsistencyMetric",
+     "ConversationPromptUncertaintyMetric",
+ ]
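
The wrappers above compose with the preset judges added under g_eval_presets. A minimal usage sketch, assuming only the imports shown in the module's own docstrings and a configured LLM provider (the model name is illustrative):

from opik.evaluation.metrics.conversation.llm_judges.g_eval_wrappers import (
    GEvalConversationMetric,
)
from opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite import (
    QARelevanceJudge,
)

conversation = [
    {"role": "user", "content": "Who wrote Dune?"},
    {"role": "assistant", "content": "Frank Herbert wrote Dune."},
]

# Wrap a single-output judge so it scores only the last assistant turn.
metric = GEvalConversationMetric(judge=QARelevanceJudge(model="gpt-4o"))
result = metric.score(conversation)
print(result.value, result.reason)

The same pattern is what the named subclasses (ConversationComplianceRiskMetric and friends) do internally, each with a fixed judge and metric name.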
opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py
@@ -3,17 +3,20 @@ import logging
  from typing import Optional, Any, Union, List
  import pydantic

+ from opik.evaluation.metrics.conversation import types as conversation_types
  import opik.exceptions as exceptions
+ from opik.evaluation.metrics import score_result
+ from opik.evaluation.metrics.conversation.conversation_thread_metric import (
+     ConversationThreadMetric,
+ )
+ from opik.evaluation.metrics.llm_judges import parsing_helpers
  from opik.evaluation.models import base_model, models_factory
  from . import schema, templates
- from .. import conversation_thread_metric, types as conversation_types
- from ... import score_result
- from ...llm_judges import parsing_helpers

  LOGGER = logging.getLogger(__name__)


- class SessionCompletenessQuality(conversation_thread_metric.ConversationThreadMetric):
+ class SessionCompletenessQuality(ConversationThreadMetric):
      """
      Represents the Session Completeness Quality metric for a conversation thread.

@@ -34,6 +37,7 @@ class SessionCompletenessQuality(conversation_thread_metric.ConversationThreadMe
          include_reason: Whether to include a reason for the score.
          track: Whether to track the metric. Default is True.
          project_name: The project name to track the metric in.
+         temperature: The temperature to use for the model. Defaults to 1e-8.

      Example:
          >>> from opik.evaluation.metrics import SessionCompletenessQuality
@@ -59,22 +63,24 @@ class SessionCompletenessQuality(conversation_thread_metric.ConversationThreadMe
          include_reason: bool = True,
          track: bool = True,
          project_name: Optional[str] = None,
+         temperature: float = 1e-8,
      ):
          super().__init__(
              name=name,
              track=track,
              project_name=project_name,
          )
-         self._init_model(model)
          self._include_reason = include_reason

+         self._init_model(model, temperature=temperature)
+
      def _init_model(
-         self, model: Optional[Union[str, base_model.OpikBaseModel]]
+         self, model: Optional[Union[str, base_model.OpikBaseModel]], temperature: float
      ) -> None:
          if isinstance(model, base_model.OpikBaseModel):
              self._model = model
          else:
-             self._model = models_factory.get(model_name=model)
+             self._model = models_factory.get(model_name=model, temperature=temperature)

      def score(
          self,
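
The relocated thread metrics gain an explicit temperature argument that is forwarded to models_factory.get. A minimal sketch of the new knob (model name illustrative; the near-zero default mirrors the previous behaviour):

from opik.evaluation.metrics import SessionCompletenessQuality

# Default stays near-deterministic (1e-8); pass a value to override.
metric = SessionCompletenessQuality(model="gpt-4o", temperature=0.3)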
opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py
@@ -1,6 +1,6 @@
  from typing import List

- from .. import types as conversation_types
+ from opik.evaluation.metrics.conversation import types as conversation_types


  def extract_user_goals(conversation: conversation_types.Conversation) -> str:
opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py
@@ -4,21 +4,21 @@ from typing import Optional, Union, Any, List, Dict

  import pydantic

+ from opik.evaluation.metrics.conversation import helpers as conversation_helpers
+ from opik.evaluation.metrics.conversation import types as conversation_types
  import opik.exceptions as exceptions
  from opik.evaluation.metrics import score_result
+ from opik.evaluation.metrics.conversation.conversation_thread_metric import (
+     ConversationThreadMetric,
+ )
+ from opik.evaluation.metrics.llm_judges import parsing_helpers
  from opik.evaluation.models import base_model, models_factory
  from . import schema, templates
- from .. import (
-     types as conversation_types,
-     conversation_thread_metric,
-     helpers,
- )
- from ...llm_judges import parsing_helpers

  LOGGER = logging.getLogger(__name__)


- class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric):
+ class UserFrustrationMetric(ConversationThreadMetric):
      """
      A heuristic score estimating the likelihood that the user experienced confusion, annoyance,
      or disengagement during the session — due to repetition, lack of adaptation, ignored
@@ -47,6 +47,7 @@ class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric)
          window_size: The window size to use for calculating the score. It defines the
              maximal number of historical turns to include in each window when assessing
              the frustration of the current turn in the conversation. Default is 10.
+         temperature: The temperature to use for the model. Defaults to 1e-8.

      Example:
          >>> from opik.evaluation.metrics import UserFrustrationMetric
@@ -73,23 +74,25 @@ class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric)
          track: bool = True,
          project_name: Optional[str] = None,
          window_size: int = 10,
+         temperature: float = 1e-8,
      ):
          super().__init__(
              name=name,
              track=track,
              project_name=project_name,
          )
-         self._init_model(model)
          self._include_reason = include_reason
          self._window_size = window_size

+         self._init_model(model, temperature=temperature)
+
      def _init_model(
-         self, model: Optional[Union[str, base_model.OpikBaseModel]]
+         self, model: Optional[Union[str, base_model.OpikBaseModel]], temperature: float
      ) -> None:
          if isinstance(model, base_model.OpikBaseModel):
              self._model = model
          else:
-             self._model = models_factory.get(model_name=model)
+             self._model = models_factory.get(model_name=model, temperature=temperature)

      def score(
          self,
@@ -110,8 +113,10 @@ class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric)
          conversation: conversation_types.Conversation,
      ) -> score_result.ScoreResult:
          try:
-             turns_windows = helpers.extract_turns_windows_from_conversation(
-                 conversation=conversation, window_size=self._window_size
+             turns_windows = (
+                 conversation_helpers.extract_turns_windows_from_conversation(
+                     conversation=conversation, window_size=self._window_size
+                 )
              )

              verdicts = [
@@ -141,8 +146,10 @@ class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric)
          conversation: conversation_types.Conversation,
      ) -> score_result.ScoreResult:
          try:
-             turns_windows = helpers.extract_turns_windows_from_conversation(
-                 conversation=conversation, window_size=self._window_size
+             turns_windows = (
+                 conversation_helpers.extract_turns_windows_from_conversation(
+                     conversation=conversation, window_size=self._window_size
+                 )
              )

              verdicts = await asyncio.gather(
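
The frustration metric keeps its sliding-window design: window_size caps how many historical turns feed each per-turn verdict, and the new temperature argument flows through to the judge model. A minimal sketch (model name illustrative):

from opik.evaluation.metrics import UserFrustrationMetric

# Assess each turn using a window of at most 5 historical turns.
metric = UserFrustrationMetric(model="gpt-4o", window_size=5, temperature=1e-8)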
opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py
@@ -1,6 +1,6 @@
  from typing import List, Dict

- from .. import types as conversation_types
+ from opik.evaluation.metrics.conversation import types as conversation_types


  def evaluate_conversation(sliding_window: conversation_types.Conversation) -> str:
opik/evaluation/metrics/conversation/types.py
@@ -1,4 +1,4 @@
- from typing import Dict, Literal, List, Optional
+ from typing import Dict, List, Literal, Optional

  import pydantic

@@ -9,15 +9,12 @@ Conversation = List[ConversationDict]
  class ConversationTurn(pydantic.BaseModel):
      """
      Representation of a single turn in a conversation.
-
      This class defines a model for encapsulating a single conversational
      turn consisting of an input user's message and an output LLM message. It is
      designed to handle the exchange of messages in a structured format.
-
      Args:
          input: The input message of the conversation turn.
          output: The output message of the conversation turn.
-
      Example:
          >>> conversation_turn = ConversationTurn(
          >>>     input={"role": "user", "content": "Hello!"},
@@ -31,5 +28,7 @@ class ConversationTurn(pydantic.BaseModel):
      def as_list(self) -> List[ConversationDict]:
          if self.output is None:
              return [self.input]
-
          return [self.input, self.output]
+
+
+ __all__ = ["ConversationDict", "Conversation", "ConversationTurn"]
opik/evaluation/metrics/conversation_types.py
@@ -0,0 +1,9 @@
+ """Public aliases for conversation type helpers."""
+
+ from .conversation.types import (
+     Conversation,
+     ConversationDict,
+     ConversationTurn,
+ )
+
+ __all__ = ["Conversation", "ConversationDict", "ConversationTurn"]
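
This alias module gives the conversation types a stable import path at the metrics package root. A short sketch of the resulting usage, mirroring the ConversationTurn doctest above:

from opik.evaluation.metrics.conversation_types import ConversationTurn

turn = ConversationTurn(
    input={"role": "user", "content": "Hello!"},
    output={"role": "assistant", "content": "Hi, how can I help?"},
)
print(turn.as_list())  # [input dict, output dict]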