opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,81 @@
1
1
  from .aggregated_metric import AggregatedMetric
2
- from .conversation.session_completeness.metric import SessionCompletenessQuality
3
- from .conversation.conversational_coherence.metric import ConversationalCoherenceMetric
4
- from .conversation.user_frustration.metric import UserFrustrationMetric
2
+
3
+ # Keep the canonical import first for the new layout while still tolerating
4
+ # older packaging artefacts (some environments import this module before the
5
+ # conversation package is available). If the eager import fails we fall back
6
+ # to the lazy getter below, letting legacy entry-points keep working.
7
+ from .conversation.conversation_thread_metric import ConversationThreadMetric
8
+
9
+ from .conversation import types as conversation_types
10
+ from .conversation.heuristics.degeneration.metric import ConversationDegenerationMetric
11
+ from .conversation.heuristics.knowledge_retention.metric import (
12
+ KnowledgeRetentionMetric,
13
+ )
14
+ from .conversation.llm_judges.conversational_coherence.metric import (
15
+ ConversationalCoherenceMetric,
16
+ )
17
+ from .conversation.llm_judges.g_eval_wrappers import (
18
+ GEvalConversationMetric,
19
+ ConversationComplianceRiskMetric,
20
+ ConversationDialogueHelpfulnessMetric,
21
+ ConversationQARelevanceMetric,
22
+ ConversationSummarizationCoherenceMetric,
23
+ ConversationSummarizationConsistencyMetric,
24
+ ConversationPromptUncertaintyMetric,
25
+ )
26
+ from .conversation.llm_judges.session_completeness.metric import (
27
+ SessionCompletenessQuality,
28
+ )
29
+ from .conversation.llm_judges.user_frustration.metric import UserFrustrationMetric
5
30
  from .heuristics.contains import Contains
6
31
  from .heuristics.equals import Equals
32
+ from .heuristics.gleu import GLEU
33
+ from .heuristics.chrf import ChrF
7
34
  from .heuristics.is_json import IsJson
35
+ from .heuristics.distribution_metrics import (
36
+ JSDivergence,
37
+ JSDistance,
38
+ KLDivergence,
39
+ )
8
40
  from .heuristics.levenshtein_ratio import LevenshteinRatio
41
+ from .heuristics.meteor import METEOR
42
+ from .heuristics.bertscore import BERTScore
43
+ from .heuristics.spearman import SpearmanRanking
44
+ from .heuristics.readability import Readability
45
+ from .heuristics.tone import Tone
46
+ from .heuristics.prompt_injection import PromptInjection
47
+ from .heuristics.language_adherence import LanguageAdherenceMetric
9
48
  from .heuristics.regex_match import RegexMatch
10
49
  from .heuristics.bleu import SentenceBLEU, CorpusBLEU
11
50
  from .heuristics.rouge import ROUGE
12
51
  from .heuristics.sentiment import Sentiment
52
+ from .heuristics.vader_sentiment import VADERSentiment
13
53
  from .llm_judges.answer_relevance.metric import AnswerRelevance
54
+ from .llm_judges.g_eval_presets import (
55
+ AgentTaskCompletionJudge,
56
+ AgentToolCorrectnessJudge,
57
+ ComplianceRiskJudge,
58
+ DemographicBiasJudge,
59
+ DialogueHelpfulnessJudge,
60
+ GenderBiasJudge,
61
+ PoliticalBiasJudge,
62
+ PromptUncertaintyJudge,
63
+ QARelevanceJudge,
64
+ RegionalBiasJudge,
65
+ ReligiousBiasJudge,
66
+ SummarizationCoherenceJudge,
67
+ SummarizationConsistencyJudge,
68
+ )
14
69
  from .llm_judges.context_precision.metric import ContextPrecision
15
70
  from .llm_judges.context_recall.metric import ContextRecall
16
- from .llm_judges.g_eval.metric import GEval
71
+ from .llm_judges.g_eval.metric import GEval, GEvalPreset
17
72
  from .llm_judges.hallucination.metric import Hallucination
18
73
  from .llm_judges.moderation.metric import Moderation
74
+ from .llm_judges.llm_juries.metric import LLMJuriesJudge
19
75
  from .llm_judges.trajectory_accuracy import TrajectoryAccuracy
76
+ from .llm_judges.syc_eval.metric import SycEval
20
77
  from .llm_judges.usefulness.metric import Usefulness
78
+ from .llm_judges.structure_output_compliance.metric import StructuredOutputCompliance
21
79
  from .base_metric import BaseMetric
22
80
  from .ragas_metric import RagasMetricWrapper
23
81
  from opik.exceptions import MetricComputationError
@@ -27,17 +85,51 @@ from opik.exceptions import MetricComputationError
27
85
  __all__ = [
28
86
  "AggregatedMetric",
29
87
  "AnswerRelevance",
88
+ "AgentTaskCompletionJudge",
89
+ "AgentToolCorrectnessJudge",
30
90
  "BaseMetric",
91
+ "ConversationDegenerationMetric",
92
+ "KnowledgeRetentionMetric",
93
+ "GEvalConversationMetric",
94
+ "ConversationComplianceRiskMetric",
95
+ "ConversationDialogueHelpfulnessMetric",
96
+ "ConversationQARelevanceMetric",
97
+ "ConversationSummarizationCoherenceMetric",
98
+ "ConversationSummarizationConsistencyMetric",
99
+ "ConversationPromptUncertaintyMetric",
100
+ "conversation_types",
101
+ "ComplianceRiskJudge",
31
102
  "Contains",
32
103
  "ContextPrecision",
33
104
  "ContextRecall",
34
105
  "ConversationalCoherenceMetric",
35
106
  "CorpusBLEU",
107
+ "DemographicBiasJudge",
36
108
  "Equals",
37
109
  "GEval",
110
+ "GEvalPreset",
111
+ "GLEU",
112
+ "GenderBiasJudge",
38
113
  "Hallucination",
39
114
  "IsJson",
115
+ "JSDivergence",
116
+ "JSDistance",
117
+ "KLDivergence",
40
118
  "LevenshteinRatio",
119
+ "BERTScore",
120
+ "METEOR",
121
+ "ChrF",
122
+ "Readability",
123
+ "PromptInjection",
124
+ "LanguageAdherenceMetric",
125
+ "PoliticalBiasJudge",
126
+ "PromptUncertaintyJudge",
127
+ "SpearmanRanking",
128
+ "ReligiousBiasJudge",
129
+ "RegionalBiasJudge",
130
+ "VADERSentiment",
131
+ "Tone",
132
+ "StructuredOutputCompliance",
41
133
  "MetricComputationError",
42
134
  "Moderation",
43
135
  "RagasMetricWrapper",
@@ -46,8 +138,15 @@ __all__ = [
46
138
  "SentenceBLEU",
47
139
  "Sentiment",
48
140
  "SessionCompletenessQuality",
141
+ "SycEval",
49
142
  "Usefulness",
50
143
  "UserFrustrationMetric",
51
144
  "TrajectoryAccuracy",
145
+ "DialogueHelpfulnessJudge",
146
+ "QARelevanceJudge",
147
+ "SummarizationCoherenceJudge",
148
+ "SummarizationConsistencyJudge",
149
+ "LLMJuriesJudge",
150
+ "ConversationThreadMetric",
52
151
  # "Factuality",
53
152
  ]
@@ -8,14 +8,43 @@ from . import arguments_helpers, arguments_validator, base_metric, score_result
8
8
  class AggregatedMetric(
9
9
  base_metric.BaseMetric, arguments_validator.ScoreArgumentsValidator
10
10
  ):
11
- """A metric that aggregates results obtained from a list of provided metrics using specified aggregation function.
11
+ """
12
+ Combine the output of multiple metrics into a single aggregated ``ScoreResult``.
13
+
14
+ Each metric in ``metrics`` is executed with the provided scoring kwargs, then the
15
+ ``aggregator`` callback decides how to merge the individual results. This is
16
+ handy for building ensembles such as min/max, weighted averages, or custom
17
+ pass/fail checks without re-implementing the metrics themselves.
12
18
 
13
19
  Args:
14
- name: The name of the metric.
15
- metrics: A list of concrete metric instances that inherit the `opik.evaluation.base_metric.BaseMetric`.
16
- aggregator: The aggregation function to use for evaluation.
17
- track: Whether to track the metric. Defaults to True.
18
- project_name: Optional project name to track the metric in for the cases when there are no parent span/trace to inherit project name from.
20
+ name: Display name for the aggregated metric result.
21
+ metrics: Ordered list of metric instances that should be executed.
22
+ aggregator: Callable receiving the list of ``ScoreResult`` objects and
23
+ returning the final aggregated ``ScoreResult``.
24
+ track: Whether to automatically track the metric in Opik. Defaults to
25
+ ``True``.
26
+ project_name: Optional tracking project used when no parent context exists.
27
+
28
+ Example:
29
+ >>> from opik.evaluation.metrics import AggregatedMetric, Contains, RegexMatch
30
+ >>> metrics = [Contains(track=False), RegexMatch(pattern=r"\\d+", track=False)]
31
+ >>> from opik.evaluation.metrics import score_result
32
+ >>> def combine(results):
33
+ ... score = sum(result.value for result in results) / len(results)
34
+ ... return score_result.ScoreResult(
35
+ ... name="combined_contains_regex",
36
+ ... value=score,
37
+ ... reason="Average of contains and regex checks",
38
+ ... )
39
+ >>> metric = AggregatedMetric(
40
+ ... name="combined_contains_regex",
41
+ ... metrics=metrics,
42
+ ... aggregator=combine,
43
+ ... )
44
+ >>> response = "Order number 12345 confirmed"
45
+ >>> result = metric.score(output=response, reference="order")
46
+ >>> float(result.value) # doctest: +SKIP
47
+ 1.0
19
48
  """
20
49
 
21
50
  def __init__(
@@ -1,5 +1,5 @@
1
1
  import abc
2
- from typing import Any, List, Union, Optional
2
+ from typing import Any, List, Optional, Union
3
3
 
4
4
  import opik
5
5
  import opik.config as opik_config
@@ -0,0 +1,48 @@
1
+ """Public conversation metrics API."""
2
+
3
+ from .conversation_thread_metric import ConversationThreadMetric
4
+ from .conversation_turns_factory import build_conversation_turns
5
+ from .helpers import (
6
+ extract_turns_windows_from_conversation,
7
+ get_turns_in_sliding_window,
8
+ merge_turns,
9
+ )
10
+ from .types import Conversation, ConversationDict, ConversationTurn
11
+
12
+ __all__ = [
13
+ "ConversationThreadMetric",
14
+ "Conversation",
15
+ "ConversationDict",
16
+ "ConversationTurn",
17
+ "build_conversation_turns",
18
+ "extract_turns_windows_from_conversation",
19
+ "get_turns_in_sliding_window",
20
+ "merge_turns",
21
+ "ConversationDegenerationMetric",
22
+ "KnowledgeRetentionMetric",
23
+ "ConversationalCoherenceMetric",
24
+ "SessionCompletenessQuality",
25
+ "UserFrustrationMetric",
26
+ "ConversationComplianceRiskMetric",
27
+ "ConversationDialogueHelpfulnessMetric",
28
+ "ConversationPromptUncertaintyMetric",
29
+ "ConversationQARelevanceMetric",
30
+ "ConversationSummarizationCoherenceMetric",
31
+ "ConversationSummarizationConsistencyMetric",
32
+ "GEvalConversationMetric",
33
+ ]
34
+
35
+ from .heuristics.degeneration.metric import ConversationDegenerationMetric
36
+ from .heuristics.knowledge_retention.metric import KnowledgeRetentionMetric
37
+ from .llm_judges.conversational_coherence.metric import ConversationalCoherenceMetric
38
+ from .llm_judges.g_eval_wrappers import (
39
+ GEvalConversationMetric,
40
+ ConversationComplianceRiskMetric,
41
+ ConversationDialogueHelpfulnessMetric,
42
+ ConversationQARelevanceMetric,
43
+ ConversationSummarizationCoherenceMetric,
44
+ ConversationSummarizationConsistencyMetric,
45
+ ConversationPromptUncertaintyMetric,
46
+ )
47
+ from .llm_judges.session_completeness.metric import SessionCompletenessQuality
48
+ from .llm_judges.user_frustration.metric import UserFrustrationMetric
@@ -5,11 +5,53 @@ from .. import base_metric, score_result
5
5
 
6
6
 
7
7
  class ConversationThreadMetric(base_metric.BaseMetric):
8
- """Abstract base class for all conversation thread metrics."""
8
+ """
9
+ Abstract base class for all conversation thread metrics. When creating a custom
10
+ conversation metric, you should inherit from this class and implement the abstract methods.
11
+
12
+ Conversation metrics are designed to evaluate multi-turn conversations rather than
13
+ single input-output pairs. They accept a conversation as a list of message dictionaries,
14
+ where each message has a 'role' (either 'user' or 'assistant') and 'content'.
15
+
16
+ Args:
17
+ name: The name of the metric. If not provided, uses the class name as default.
18
+ track: Whether to track the metric. Defaults to True.
19
+ project_name: Optional project name to track the metric in for the cases when
20
+ there is no parent span/trace to inherit project name from.
21
+
22
+ Example:
23
+ >>> from opik.evaluation.metrics.conversation import conversation_thread_metric, types
24
+ >>> from opik.evaluation.metrics import score_result
25
+ >>> from typing import Any
26
+ >>>
27
+ >>> class ConversationLengthMetric(conversation_thread_metric.ConversationThreadMetric):
28
+ >>> def __init__(self, name: str = "conversation_length_score"):
29
+ >>> super().__init__(name)
30
+ >>>
31
+ >>> def score(self, conversation: types.Conversation, **kwargs: Any):
32
+ >>> num_turns = sum(1 for msg in conversation if msg["role"] == "assistant")
33
+ >>> return score_result.ScoreResult(
34
+ >>> name=self.name,
35
+ >>> value=num_turns,
36
+ >>> reason=f"Conversation has {num_turns} turns"
37
+ >>> )
38
+ """
9
39
 
10
40
  def score(
11
41
  self, conversation: types.Conversation, **kwargs: Any
12
42
  ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
43
+ """
44
+ Evaluate a conversation and return a score.
45
+
46
+ Args:
47
+ conversation: A list of conversation messages. Each message is a dictionary
48
+ with 'role' (either 'user' or 'assistant') and 'content' (the message text).
49
+ **kwargs: Additional keyword arguments that may be used by specific metric implementations.
50
+
51
+ Returns:
52
+ A ScoreResult object or list of ScoreResult objects containing the evaluation score,
53
+ metric name, and optional reasoning.
54
+ """
13
55
  raise NotImplementedError(
14
56
  "Please use concrete metric classes instead of this one."
15
57
  )
@@ -18,7 +60,19 @@ class ConversationThreadMetric(base_metric.BaseMetric):
18
60
  self, conversation: types.Conversation, **kwargs: Any
19
61
  ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
20
62
  """
21
- Async public method that can be called independently.
63
+ Asynchronously evaluate a conversation and return a score.
64
+
65
+ This is the async version of the score method. By default, it calls the
66
+ synchronous score method, but can be overridden for true async implementations.
67
+
68
+ Args:
69
+ conversation: A list of conversation messages. Each message is a dictionary
70
+ with 'role' (either 'user' or 'assistant') and 'content' (the message text).
71
+ **kwargs: Additional keyword arguments that may be used by specific metric implementations.
72
+
73
+ Returns:
74
+ A ScoreResult object or list of ScoreResult objects containing the evaluation score,
75
+ metric name, and optional reasoning.
22
76
  """
23
77
  raise NotImplementedError(
24
78
  "Please use concrete metric classes instead of this one."
@@ -0,0 +1,19 @@
1
+ from .llm_judges.g_eval_wrappers import (
2
+ ConversationComplianceRiskMetric,
3
+ ConversationDialogueHelpfulnessMetric,
4
+ ConversationPromptUncertaintyMetric,
5
+ ConversationQARelevanceMetric,
6
+ ConversationSummarizationCoherenceMetric,
7
+ ConversationSummarizationConsistencyMetric,
8
+ GEvalConversationMetric,
9
+ )
10
+
11
+ __all__ = [
12
+ "GEvalConversationMetric",
13
+ "ConversationComplianceRiskMetric",
14
+ "ConversationDialogueHelpfulnessMetric",
15
+ "ConversationPromptUncertaintyMetric",
16
+ "ConversationQARelevanceMetric",
17
+ "ConversationSummarizationCoherenceMetric",
18
+ "ConversationSummarizationConsistencyMetric",
19
+ ]
@@ -1,6 +1,7 @@
1
1
  from typing import Any, Generator, List
2
2
 
3
- from . import types, conversation_turns_factory
3
+ from . import types
4
+ from .conversation_turns_factory import build_conversation_turns
4
5
 
5
6
 
6
7
  def get_turns_in_sliding_window(
@@ -8,22 +9,20 @@ def get_turns_in_sliding_window(
8
9
  ) -> Generator[List[types.ConversationTurn], Any, None]:
9
10
  """
10
11
  Generates windows of conversation turns of a fixed size from a list of turns.
11
-
12
12
  This function creates a sliding window over the list of conversation turns.
13
13
  Each window includes the current turn and up to `window_size - 1` previous
14
14
  conversation turns. If there are fewer turns available than the `window_size`,
15
15
  the window will consist of all available turns up to the current turn.
16
-
17
16
  Args:
18
17
  turns: List of conversation turn objects representing the interactions
19
18
  in a conversation.
20
19
  window_size: Integer specifying the maximum number of turns to include
21
20
  in each window.
22
-
23
21
  Yields:
24
22
  A generator that produces lists of conversation turns, where each list
25
23
  represents a sliding window of turns.
26
24
  """
25
+
27
26
  for i in range(len(turns)):
28
27
  yield turns[max(0, i - window_size + 1) : i + 1]
29
28
 
@@ -31,19 +30,17 @@ def get_turns_in_sliding_window(
31
30
  def merge_turns(turns: List[types.ConversationTurn]) -> types.Conversation:
32
31
  """
33
32
  Merges a list of conversation turns into a single conversation.
34
-
35
33
  This function takes a list of conversation turns and combines them
36
34
  into a single conversation by extending the output list with the data
37
35
  from each turn.
38
-
39
36
  Args:
40
37
  turns: A list of conversation turn objects to be combined.
41
-
42
38
  Returns:
43
39
  types.Conversation: A combined conversation object containing all
44
40
  the turns from the input list.
45
41
  """
46
- output = []
42
+
43
+ output: types.Conversation = []
47
44
  for turn in turns:
48
45
  output.extend(turn.as_list())
49
46
  return output
@@ -56,24 +53,20 @@ def extract_turns_windows_from_conversation(
56
53
  Extracts a list of conversation windows based on turns using a sliding window
57
54
  approach. This function divides a conversation into consecutive overlapping
58
55
  windows, where each window contains a specified number of turns.
59
-
60
56
  Args:
61
57
  conversation: The input conversation from which turns will be processed.
62
58
  window_size: The number of turns to include in each sliding window.
63
-
64
59
  Returns:
65
60
  A list of conversations, each representing a window of turns specified
66
61
  by the given window size.
67
-
68
62
  Raises:
69
63
  ValueError: If the conversation is empty or if it has no turns.
70
64
  """
65
+
71
66
  if len(conversation) == 0:
72
67
  raise ValueError("Conversation is empty")
73
68
 
74
- turns = conversation_turns_factory.build_conversation_turns(
75
- conversation=conversation
76
- )
69
+ turns = build_conversation_turns(conversation=conversation)
77
70
  if len(turns) == 0:
78
71
  raise ValueError("Conversation has no turns")
79
72
 
@@ -81,5 +74,11 @@ def extract_turns_windows_from_conversation(
81
74
  merge_turns(turns_window)
82
75
  for turns_window in get_turns_in_sliding_window(turns, window_size)
83
76
  ]
84
-
85
77
  return turns_windows
78
+
79
+
80
+ __all__ = [
81
+ "get_turns_in_sliding_window",
82
+ "merge_turns",
83
+ "extract_turns_windows_from_conversation",
84
+ ]
@@ -0,0 +1,14 @@
1
+ """Heuristic conversation-level metrics.
2
+
3
+ Exposes the reusable conversation-level heuristics under the public namespace
4
+ ``opik.evaluation.metrics.conversation.heuristics.*`` so documentation and downstream
5
+ code can import them directly.
6
+ """
7
+
8
+ from .degeneration.metric import ConversationDegenerationMetric
9
+ from .knowledge_retention.metric import KnowledgeRetentionMetric
10
+
11
+ __all__ = [
12
+ "ConversationDegenerationMetric",
13
+ "KnowledgeRetentionMetric",
14
+ ]
@@ -0,0 +1,3 @@
1
+ from .metric import ConversationDegenerationMetric
2
+
3
+ __all__ = ["ConversationDegenerationMetric"]
@@ -0,0 +1,189 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ import re
5
+ from collections import Counter
6
+ from typing import Dict, List, Optional
7
+
8
+ from opik.evaluation.metrics.conversation import types as conversation_types
9
+ from opik.evaluation.metrics.conversation.conversation_thread_metric import (
10
+ ConversationThreadMetric,
11
+ )
12
+ from opik.evaluation.metrics.score_result import ScoreResult
13
+ from opik.exceptions import MetricComputationError
14
+ from .phrases import DEFAULT_FALLBACK_PHRASES
15
+
16
+
17
+ def _tokenize(text: str) -> List[str]:
18
+ return re.findall(r"\b\w+\b", text.lower())
19
+
20
+
21
+ def _ngram_counts(tokens: List[str], n: int) -> Counter:
22
+ if len(tokens) < n:
23
+ return Counter()
24
+ return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
25
+
26
+
27
+ class ConversationDegenerationMetric(ConversationThreadMetric):
28
+ """
29
+ Score how strongly an assistant conversation shows degeneration or repetition.
30
+
31
+ The metric inspects each assistant turn, measuring repeated n-grams, overlap with
32
+ the previous reply, low lexical diversity, and presence of known fallback
33
+ phrases (for example, "as an AI language model..."). Each turn receives a
34
+ degeneration score between `0.0` and `1.0`; the overall metric reports the peak
35
+ risk observed so you can quickly flag sections where the assistant got stuck or
36
+ stopped being helpful. Detailed per-turn diagnostics are returned in the
37
+ ``ScoreResult.metadata`` payload.
38
+
39
+ Args:
40
+ name: Display name for the metric result. Defaults to
41
+ ``"conversation_degeneration_metric"``.
42
+ track: Whether the metric should automatically track to an Opik project.
43
+ Defaults to ``True``.
44
+ project_name: Optional project to store tracked results in. Defaults to
45
+ ``None`` (inherit global setting).
46
+ ngram_size: Size of the n-grams used to detect repetition within a single
47
+ response. Must be at least ``2``. Defaults to ``3``.
48
+ fallback_phrases: Custom list of phrases that should be treated as
49
+ degeneration signatures. If ``None``, a sensible default list is used.
50
+
51
+ Example:
52
+ >>> from opik.evaluation.metrics import ConversationDegenerationMetric
53
+ >>> conversation = [
54
+ ... {"role": "user", "content": "Can you draft a short bio for Ada?"},
55
+ ... {"role": "assistant", "content": "Sure, here is a short bio for Ada."},
56
+ ... {"role": "user", "content": "Could you add more detail?"},
57
+ ... {"role": "assistant", "content": "Sure, here is a short bio for Ada."},
58
+ ... ]
59
+ >>> metric = ConversationDegenerationMetric(ngram_size=3)
60
+ >>> result = metric.score(conversation)
61
+ >>> float(result.value) # doctest: +SKIP
62
+ 0.75
63
+ """
64
+
65
+ def __init__(
66
+ self,
67
+ name: str = "conversation_degeneration_metric",
68
+ track: bool = True,
69
+ project_name: Optional[str] = None,
70
+ ngram_size: int = 3,
71
+ fallback_phrases: Optional[List[str]] = None,
72
+ ) -> None:
73
+ super().__init__(name=name, track=track, project_name=project_name)
74
+ if ngram_size < 2:
75
+ raise MetricComputationError("ngram_size must be >= 2")
76
+ self._ngram_size = ngram_size
77
+ phrases = (
78
+ fallback_phrases
79
+ if fallback_phrases is not None
80
+ else DEFAULT_FALLBACK_PHRASES
81
+ )
82
+ self._fallback_phrases = [phrase.lower() for phrase in phrases]
83
+
84
+ def score(
85
+ self,
86
+ conversation: conversation_types.Conversation,
87
+ **ignored_kwargs: object,
88
+ ) -> ScoreResult:
89
+ assistant_turns = [
90
+ turn["content"]
91
+ for turn in conversation
92
+ if turn.get("role") == "assistant" and turn.get("content")
93
+ ]
94
+ if not assistant_turns:
95
+ raise MetricComputationError("Conversation contains no assistant messages")
96
+
97
+ per_turn_metadata: List[Dict[str, float]] = []
98
+ degeneracy_scores: List[float] = []
99
+
100
+ prev_tokens: Optional[List[str]] = None
101
+ for content in assistant_turns:
102
+ tokens = _tokenize(content)
103
+ if not tokens:
104
+ continue
105
+
106
+ entropy_norm = self._token_entropy(tokens)
107
+ repetition_ratio = self._repetition_ratio(tokens)
108
+ prev_overlap = self._overlap_with_previous(tokens, prev_tokens)
109
+ fallback_score = 1.0 if self._contains_fallback_phrase(content) else 0.0
110
+
111
+ normalized_entropy = 1.0 - entropy_norm
112
+ # Combine all four risk factors with equal weight; this keeps the
113
+ # heuristic interpretable and matches the legacy scoring behaviour.
114
+ deg_score = min(
115
+ 1.0,
116
+ (repetition_ratio + prev_overlap + fallback_score + normalized_entropy)
117
+ / 4.0,
118
+ )
119
+
120
+ per_turn_metadata.append(
121
+ {
122
+ "repetition_ratio": repetition_ratio,
123
+ "overlap_previous": prev_overlap,
124
+ "fallback_hit": fallback_score,
125
+ "normalized_entropy": normalized_entropy,
126
+ "degeneration_score": deg_score,
127
+ }
128
+ )
129
+ degeneracy_scores.append(deg_score)
130
+ prev_tokens = tokens
131
+
132
+ if not degeneracy_scores:
133
+ raise MetricComputationError(
134
+ "Assistant messages were empty after tokenization"
135
+ )
136
+
137
+ average_score = sum(degeneracy_scores) / len(degeneracy_scores)
138
+ peak_score = max(degeneracy_scores)
139
+
140
+ return ScoreResult(
141
+ value=peak_score,
142
+ name=self.name,
143
+ reason=(
144
+ f"Peak degeneration risk ({len(degeneracy_scores)} turns):"
145
+ f" {peak_score:.3f}"
146
+ ),
147
+ metadata={
148
+ "per_turn": per_turn_metadata,
149
+ "average_score": average_score,
150
+ "peak_score": peak_score,
151
+ },
152
+ )
153
+
154
+ def _token_entropy(self, tokens: List[str]) -> float:
155
+ counts = Counter(tokens)
156
+ total = float(len(tokens))
157
+ entropy = 0.0
158
+ for count in counts.values():
159
+ prob = count / total
160
+ entropy -= prob * math.log(prob, 2)
161
+ max_entropy = math.log(len(counts), 2) if counts else 1.0
162
+ if max_entropy == 0:
163
+ return 0.0
164
+ return min(1.0, entropy / max_entropy)
165
+
166
+ def _repetition_ratio(self, tokens: List[str]) -> float:
167
+ ngram_counts = _ngram_counts(tokens, self._ngram_size)
168
+ total = sum(ngram_counts.values())
169
+ if total == 0:
170
+ return 0.0
171
+ repeated = sum(count for count in ngram_counts.values() if count > 1)
172
+ return repeated / total
173
+
174
+ def _overlap_with_previous(
175
+ self, tokens: List[str], prev_tokens: Optional[List[str]]
176
+ ) -> float:
177
+ if not prev_tokens:
178
+ return 0.0
179
+ current_set = set(tokens)
180
+ prev_set = set(prev_tokens)
181
+ if not current_set or not prev_set:
182
+ return 0.0
183
+ intersection = len(current_set & prev_set)
184
+ union = len(current_set | prev_set)
185
+ return intersection / union
186
+
187
+ def _contains_fallback_phrase(self, content: str) -> bool:
188
+ lowered = content.lower()
189
+ return any(phrase in lowered for phrase in self._fallback_phrases)
@@ -0,0 +1,12 @@
1
+ """Phrase lists used by the conversation degeneration metric."""
2
+
3
+ DEFAULT_FALLBACK_PHRASES = [
4
+ "i'm sorry",
5
+ "as an ai language model",
6
+ "i cannot",
7
+ "i'm unable",
8
+ "please provide",
9
+ "i don't have access",
10
+ "i don't understand",
11
+ "could you please clarify",
12
+ ]
@@ -0,0 +1,3 @@
1
+ from .metric import KnowledgeRetentionMetric
2
+
3
+ __all__ = ["KnowledgeRetentionMetric"]