opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,331 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from collections import Counter
5
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Protocol, Sequence
6
+
7
+ from opik.exceptions import MetricComputationError
8
+ from opik.evaluation.metrics import base_metric, score_result
9
+
10
+ TokenizeFn = Callable[[str], Iterable[str]]
11
+
12
+
13
+ class _JSDistanceFn(Protocol):
14
+ def __call__(
15
+ self,
16
+ p: Sequence[float],
17
+ q: Sequence[float],
18
+ base: Optional[
19
+ float
20
+ ] = ..., # matches scipy signature allowing positional or keyword use
21
+ ) -> float: ...
22
+
23
+
24
+ def _load_jensen_shannon_distance() -> _JSDistanceFn:
25
+ try:
26
+ from scipy.spatial.distance import jensenshannon
27
+ except ImportError as error: # pragma: no cover - optional dependency
28
+ raise ImportError(
29
+ "Install scipy via `pip install scipy` to use Jensen-Shannon metrics."
30
+ ) from error
31
+
32
+ return jensenshannon
33
+
34
+
35
+ def _default_tokenizer(text: str) -> Iterable[str]:
36
+ return text.lower().split()
37
+
38
+
39
class _DistributionMetricBase(base_metric.BaseMetric):
    """
    Shared base for metrics that compare token frequency distributions.

    Args:
        tokenizer: Optional tokenizer returning an iterable of tokens given text.
        name: Display name for the metric.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project.
        normalize: When ``True`` the histogram is converted to probabilities.
        smoothing: Optional additive constant applied during KL-like computations.
    """

    def __init__(
        self,
        tokenizer: Optional[TokenizeFn],
        name: str,
        track: bool,
        project_name: Optional[str],
        normalize: bool,
        smoothing: float = 0.0,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)
        self._tokenizer = tokenizer or _default_tokenizer
        self._normalize = normalize
        # Negative smoothing makes no sense; clamp at zero.
        self._smoothing = max(0.0, smoothing)

    def _build_distribution(self, text: str) -> Dict[str, float]:
        """Turn *text* into a token histogram, or a probability table when normalizing.

        Raises:
            MetricComputationError: If tokenization yields no tokens.
        """
        tokens = list(self._tokenizer(text))
        if not tokens:
            raise MetricComputationError(
                "Tokenized text is empty (distribution-based metric)."
            )

        histogram = Counter(tokens)
        if not self._normalize:
            return {token: float(frequency) for token, frequency in histogram.items()}

        total = float(sum(histogram.values()))
        return {token: frequency / total for token, frequency in histogram.items()}

    def _smooth(self, value: float) -> float:
        """Add the configured smoothing constant (a no-op when smoothing is zero)."""
        return value if self._smoothing == 0.0 else value + self._smoothing
84
+
85
+
86
class JSDivergence(_DistributionMetricBase):
    """
    Compute Jensen–Shannon similarity (``1 - JSD``) between two texts.

    Args:
        tokenizer: Optional tokenizer function. Defaults to whitespace split.
        base: Logarithm base used when computing divergence (> ``1.0``).
        normalize: Whether to normalise token counts to probabilities first.
        name: Display name for the metric result.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project name.

    Note:
        Requires :mod:`scipy` to be installed.

    Example:
        >>> from opik.evaluation.metrics import JSDivergence
        >>> metric = JSDivergence()
        >>> result = metric.score(
        ...     output="cat cat sat",
        ...     reference="cat sat on mat",
        ... )
        >>> round(result.value, 3)  # doctest: +SKIP
        0.812
    """

    def __init__(
        self,
        tokenizer: Optional[TokenizeFn] = None,
        base: float = 2.0,
        normalize: bool = True,
        name: str = "js_divergence_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        if base <= 1.0:
            raise ValueError("base must be greater than 1.0")
        super().__init__(
            tokenizer=tokenizer,
            name=name,
            track=track,
            project_name=project_name,
            normalize=normalize,
        )
        self._base = base
        self._js_distance_fn = _load_jensen_shannon_distance()

    def score(
        self,
        output: str,
        reference: str,
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` against ``reference``; 1.0 means identical distributions.

        Raises:
            MetricComputationError: If either text is blank.
        """
        if not output.strip():
            raise MetricComputationError(
                "Candidate is empty (Jensen-Shannon divergence)."
            )
        if not reference.strip():
            raise MetricComputationError(
                "Reference is empty (Jensen-Shannon divergence)."
            )

        divergence = self._js_divergence(
            self._build_distribution(output),
            self._build_distribution(reference),
        )
        # Clamp into [0, 1] to guard against tiny floating-point overshoot.
        similarity = min(1.0, max(0.0, 1.0 - divergence))

        return score_result.ScoreResult(
            value=similarity,
            name=self.name,
            reason=(
                f"Jensen-Shannon similarity (base={self._base:g}): {similarity:.4f} "
                f"(divergence={divergence:.4f})"
            ),
            metadata={
                "divergence": divergence,
                "distance": math.sqrt(divergence),
                "base": self._base,
            },
        )

    def _js_divergence(
        self,
        p_dist: Dict[str, float],
        q_dist: Dict[str, float],
    ) -> float:
        """Return the Jensen-Shannon divergence over the joint vocabulary."""
        vocabulary = sorted(p_dist.keys() | q_dist.keys())
        if not vocabulary:
            return 0.0

        p_probs = self._ensure_probability_vector(
            [p_dist.get(token, 0.0) for token in vocabulary]
        )
        q_probs = self._ensure_probability_vector(
            [q_dist.get(token, 0.0) for token in vocabulary]
        )

        # scipy returns the JS *distance* (a square root); square it back into
        # the divergence.
        distance = float(self._js_distance_fn(p_probs, q_probs, base=self._base))
        return distance**2

    def _ensure_probability_vector(self, values: Sequence[float]) -> List[float]:
        """Normalize *values* so they sum to 1, rejecting all-zero vectors."""
        total = sum(values)
        if total <= 0.0:
            raise MetricComputationError(
                "Distribution is empty after tokenisation (Jensen-Shannon metric)."
            )
        return [entry / total for entry in values]
193
+
194
+
195
class JSDistance(JSDivergence):
    """
    Return the raw Jensen–Shannon divergence instead of similarity.

    Args:
        tokenizer: Optional tokenizer function.
        base: Logarithm base used for the divergence calculation.
        normalize: Whether to normalise counts into probabilities.
        name: Display name for the metric result.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project name.

    Example:
        >>> from opik.evaluation.metrics import JSDistance
        >>> metric = JSDistance()
        >>> result = metric.score("a a b", reference="a b b")
        >>> round(result.value, 3)  # doctest: +SKIP
        0.188
    """

    def __init__(
        self,
        tokenizer: Optional[TokenizeFn] = None,
        base: float = 2.0,
        normalize: bool = True,
        name: str = "js_distance_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        super().__init__(
            tokenizer=tokenizer,
            base=base,
            normalize=normalize,
            name=name,
            track=track,
            project_name=project_name,
        )

    def score(
        self,
        output: str,
        reference: str,
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Delegate to the similarity scorer, then surface the divergence itself."""
        similarity_result = super().score(output=output, reference=reference)
        meta = similarity_result.metadata or {}
        divergence = float(meta.get("divergence", 0.0))
        distance = float(meta.get("distance", math.sqrt(divergence)))
        return score_result.ScoreResult(
            value=divergence,
            name=self.name,
            reason=f"Jensen-Shannon divergence (base={self._base:g}): {divergence:.4f}",
            metadata={
                "distance": distance,
                "base": meta.get("base", self._base),
            },
        )
252
+
253
+
254
class KLDivergence(_DistributionMetricBase):
    """
    Compute the (optionally symmetric) KL divergence between token distributions.

    Args:
        tokenizer: Optional tokenizer function. Defaults to whitespace split.
        direction: Direction to compute (``"pq"``, ``"qp"``, or ``"avg"`` for
            symmetric).
        normalize: Whether to normalise token counts to probabilities first.
        smoothing: Additive smoothing constant to avoid divide-by-zero.
        name: Display name for the metric result.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project name.

    Example:
        >>> from opik.evaluation.metrics import KLDivergence
        >>> metric = KLDivergence(direction="avg")
        >>> result = metric.score("hello hello world", reference="hello world")
        >>> round(result.value, 4)  # doctest: +SKIP
        0.0583
    """

    def __init__(
        self,
        tokenizer: Optional[TokenizeFn] = None,
        direction: str = "pq",
        normalize: bool = True,
        smoothing: float = 1e-12,
        name: str = "kl_divergence_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        if direction not in {"pq", "qp", "avg"}:
            raise ValueError("direction must be one of {'pq', 'qp', 'avg'}")
        super().__init__(
            tokenizer=tokenizer,
            name=name,
            track=track,
            project_name=project_name,
            normalize=normalize,
            smoothing=smoothing,
        )
        self._direction = direction

    def score(
        self,
        output: str,
        reference: str,
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` vs ``reference``; 0.0 means identical distributions.

        Raises:
            MetricComputationError: If either text is blank.
        """
        if not output.strip():
            raise MetricComputationError("Candidate is empty (KL divergence metric).")
        if not reference.strip():
            raise MetricComputationError("Reference is empty (KL divergence metric).")

        p_dist = self._build_distribution(output)
        q_dist = self._build_distribution(reference)

        if self._direction == "avg":
            # Symmetrised KL: mean of both one-directional divergences.
            divergence = 0.5 * (self._kl(p_dist, q_dist) + self._kl(q_dist, p_dist))
        elif self._direction == "qp":
            divergence = self._kl(q_dist, p_dist)
        else:  # "pq"
            divergence = self._kl(p_dist, q_dist)

        return score_result.ScoreResult(
            value=divergence,
            name=self.name,
            reason=f"KL divergence ({self._direction}): {divergence:.4f}",
        )

    def _kl(self, p_dist: Dict[str, float], q_dist: Dict[str, float]) -> float:
        """One-directional KL(P||Q) with additive smoothing applied to both sides.

        NOTE(review): if constructed with ``smoothing=0.0`` and the
        distributions have disjoint tokens, the division below raises
        ``ZeroDivisionError``; the default smoothing of ``1e-12`` prevents this.
        """
        total = 0.0
        for token, raw_p in p_dist.items():
            smoothed_p = self._smooth(raw_p)
            smoothed_q = self._smooth(q_dist.get(token, 0.0))
            total += smoothed_p * math.log(smoothed_p / smoothed_q)
        return total
@@ -0,0 +1,113 @@
1
+ from typing import Any, Callable, Optional, Sequence, Union
2
+
3
+ from opik.exceptions import MetricComputationError
4
+ from opik.evaluation.metrics import base_metric, score_result
5
+
6
+ try:
7
+ from nltk.translate import gleu_score as nltk_gleu_score
8
+ except ImportError: # pragma: no cover - optional dependency
9
+ nltk_gleu_score = None
10
+
11
+
12
+ GleuFn = Callable[[Sequence[Sequence[str]], Sequence[str]], float]
13
+
14
+
15
class GLEU(base_metric.BaseMetric):
    """
    Sentence-level GLEU metric powered by ``nltk.translate.gleu_score``.

    References:
        - NLTK Reference Documentation on GLEU
          https://www.nltk.org/api/nltk.translate.gleu_score.html
        - OECD Catalogue of Tools & Metrics for Trustworthy AI
          https://oecd.ai/en/catalogue/metrics/google-bleu-gleu
        - Hugging Face Evaluate: Google BLEU (GLEU) metric overview
          https://huggingface.co/spaces/evaluate-metric/google_bleu

    Args:
        gleu_fn: Optional custom scoring callable compatible with
            ``nltk.translate.gleu_score.sentence_gleu``. Useful for testing.
        min_len: Minimum n-gram size considered.
        max_len: Maximum n-gram size considered.
        name: Display name for the metric result.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project name.

    Example:
        >>> from opik.evaluation.metrics import GLEU
        >>> metric = GLEU(min_len=1, max_len=4)
        >>> result = metric.score(
        ...     output="The cat sat on the mat",
        ...     reference="The cat is on the mat",
        ... )
        >>> round(result.value, 3)  # doctest: +SKIP
        0.816
    """

    def __init__(
        self,
        gleu_fn: Optional[GleuFn] = None,
        min_len: int = 1,
        max_len: int = 4,
        name: str = "gleu_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        if min_len <= 0 or max_len <= 0:
            raise ValueError("min_len and max_len must be positive integers.")
        if min_len > max_len:
            raise ValueError("min_len cannot exceed max_len.")

        super().__init__(name=name, track=track, project_name=project_name)

        if gleu_fn is not None:
            self._gleu_fn = gleu_fn
            return

        if nltk_gleu_score is None:  # pragma: no cover - optional dependency
            raise ImportError(
                "GLEU metric requires the optional 'nltk' package. Install via"
                " `pip install nltk` or provide `gleu_fn`."
            )

        # Bind the configured n-gram window into a closure with the same
        # interface as a user-supplied gleu_fn.
        def _scorer(
            references: Sequence[Sequence[str]], hypothesis: Sequence[str]
        ) -> float:
            return float(
                nltk_gleu_score.sentence_gleu(
                    references,
                    hypothesis,
                    min_len=min_len,
                    max_len=max_len,
                )
            )

        self._gleu_fn = _scorer

    def score(
        self,
        output: str,
        reference: Union[str, Sequence[str]],
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` against one reference string or several.

        Raises:
            MetricComputationError: If the candidate or any reference is empty.
        """
        if not output.strip():
            raise MetricComputationError("Candidate is empty (GLEU metric).")
        hypothesis_tokens = output.split()

        if isinstance(reference, str):
            reference_token_lists = [reference.split()]
        else:
            materialized = list(reference)
            if not materialized:
                raise MetricComputationError("Reference is empty (GLEU metric).")
            reference_token_lists = [segment.split() for segment in materialized]

        # An empty token list is falsy, so all(...) flags empty segments.
        if not all(reference_token_lists):
            raise MetricComputationError(
                "Reference contains empty segment (GLEU metric)."
            )

        gleu_value = float(self._gleu_fn(reference_token_lists, hypothesis_tokens))
        return score_result.ScoreResult(
            value=gleu_value,
            name=self.name,
            reason=f"GLEU score: {gleu_value:.4f}",
        )
@@ -0,0 +1,123 @@
1
+ """Language adherence metric leveraging fastText-style language identification."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Callable, Optional, Tuple
6
+
7
+ from opik.exceptions import MetricComputationError
8
+ from opik.evaluation.metrics.base_metric import BaseMetric
9
+ from opik.evaluation.metrics.score_result import ScoreResult
10
+
11
+ try: # optional dependency
12
+ import fasttext
13
+ except ImportError: # pragma: no cover
14
+ fasttext = None # type: ignore
15
+
16
+
17
+ DetectorFn = Callable[[str], Tuple[str, float]]
18
+
19
+
20
class LanguageAdherenceMetric(BaseMetric):
    """
    Check whether text is written in the expected language.

    The metric relies on a fastText language identification model (or a
    user-supplied detector callable) to predict the language of the evaluated text
    and compares it with ``expected_language``. It outputs ``1.0`` when the detected
    language matches and ``0.0`` otherwise, along with the detected label and
    confidence score in ``metadata``.

    References:
        - fastText language identification models
          https://fasttext.cc/docs/en/language-identification.html
        - Joulin et al., "Bag of Tricks for Efficient Text Classification" (EACL 2017)
          https://aclanthology.org/E17-2068/

    Args:
        expected_language: Language code the text should conform to, e.g. ``"en"``.
        model_path: Path to a fastText language identification model. Required unless
            ``detector`` is provided.
        name: Display name for the metric result. Defaults to
            ``"language_adherence_metric"``.
        track: Whether to automatically track metric results. Defaults to ``True``.
        project_name: Optional tracking project name. Defaults to ``None``.
        detector: Optional callable accepting text and returning a
            ``(language, confidence)`` tuple. When provided, ``model_path`` is not
            needed.

    Example:
        >>> from opik.evaluation.metrics import LanguageAdherenceMetric
        >>> # Assuming `lid.176.ftz` is available locally for fastText
        >>> metric = LanguageAdherenceMetric(expected_language="en", model_path="lid.176.ftz")
        >>> result = metric.score("This response is written in English.")  # doctest: +SKIP
        >>> result.value  # doctest: +SKIP
        1.0
    """

    def __init__(
        self,
        expected_language: str,
        model_path: Optional[str] = None,
        name: str = "language_adherence_metric",
        track: bool = True,
        project_name: Optional[str] = None,
        detector: Optional[DetectorFn] = None,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)
        self._expected_language = expected_language
        self._detector_fn: DetectorFn
        self._model_path = model_path

        self._fasttext_model: Optional[Any]

        # A custom detector takes precedence; no fastText model is loaded then.
        if detector is not None:
            self._detector_fn = detector
            self._fasttext_model = None
            return

        if fasttext is None:
            raise ImportError(
                "Install fasttext via `pip install fasttext` and provide a fastText language"
                " model (e.g., lid.176.ftz) or supply a custom detector callable."
            )
        if model_path is None:
            raise ValueError(
                "model_path is required when using the fastText-based detector."
            )
        self._fasttext_model = fasttext.load_model(model_path)
        self._detector_fn = self._predict_with_fasttext

    def score(self, output: str, **ignored_kwargs: Any) -> ScoreResult:
        """Return 1.0 when the detected language equals ``expected_language``, else 0.0.

        Raises:
            MetricComputationError: If ``output`` is blank.
        """
        processed = output
        if not processed.strip():
            raise MetricComputationError("Text is empty for language adherence check.")

        language, confidence = self._detector_fn(processed)
        adherence = 1.0 if language == self._expected_language else 0.0

        metadata = {
            "detected_language": language,
            "confidence": confidence,
            "expected_language": self._expected_language,
        }

        reason = (
            "Language adheres to expectation"
            if adherence == 1.0
            else f"Detected language '{language}' differs from expected '{self._expected_language}'"
        )

        return ScoreResult(
            value=adherence, name=self.name, reason=reason, metadata=metadata
        )

    def _predict_with_fasttext(self, text: str) -> tuple[str, float]:
        """Predict ``(language, confidence)`` for *text* with the loaded fastText model."""
        if self._fasttext_model is None:
            raise MetricComputationError(
                "fastText model is not loaded. Ensure that LanguageAdherenceMetric was initialized with a valid model_path and fastText is installed."
            )
        # fastText's predict() processes a single line and raises ValueError when
        # the input contains '\n'; flatten newlines to spaces before predicting.
        prediction = self._fasttext_model.predict(text.replace("\n", " "))
        label = prediction[0][0] if prediction[0] else ""
        # Labels come back as '__label__<code>'; strip the prefix to get the code.
        language = label.replace("__label__", "")
        confidence = float(prediction[1][0]) if prediction[1] else 0.0
        return language, confidence
@@ -0,0 +1,119 @@
1
+ from typing import Any, Callable, Optional, Sequence, Union
2
+
3
+ try:
4
+ import nltk # type: ignore
5
+ from nltk.corpus import wordnet # type: ignore
6
+ except ImportError: # pragma: no cover - optional dependency
7
+ nltk = None
8
+ wordnet = None
9
+
10
+ from opik.exceptions import MetricComputationError
11
+ from opik.evaluation.metrics import base_metric, score_result
12
+
13
+ try:
14
+ from nltk.translate import meteor_score as nltk_meteor_score
15
+ except ImportError: # pragma: no cover - optional dependency
16
+ nltk_meteor_score = None
17
+
18
+
19
+ MeteorFn = Callable[[Sequence[str], str], float]
20
+
21
+
22
class METEOR(base_metric.BaseMetric):
    """Computes the METEOR score between output and reference text.

    This implementation wraps ``nltk.translate.meteor_score.meteor_score`` while
    allowing a custom scoring function to be injected (useful for testing).

    References:
        - Banerjee & Lavie, "METEOR: An Automatic Metric for MT Evaluation with Improved
          Correlation with Human Judgments" (ACL Workshop 2005)
          https://aclanthology.org/W05-0909/
        - Hugging Face Evaluate: METEOR metric overview
          https://huggingface.co/spaces/evaluate-metric/meteor

    Args:
        meteor_fn: Optional callable taking ``(references, hypothesis)`` as plain
            strings and returning a float. When omitted, NLTK's implementation
            is used.
        alpha: Precision weight.
        beta: Penalty exponent.
        gamma: Fragmentation penalty weight.
        name: Optional metric name.
        track: Whether Opik should track the metric automatically.
        project_name: Optional project name used when tracking.
    """

    def __init__(
        self,
        meteor_fn: Optional[MeteorFn] = None,
        alpha: float = 0.9,
        beta: float = 3.0,
        gamma: float = 0.5,
        name: str = "meteor_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)

        if meteor_fn is not None:
            self._meteor_fn = meteor_fn
            return

        if nltk_meteor_score is None:  # pragma: no cover - optional dependency
            raise ImportError(
                "METEOR metric requires the optional 'nltk' package. Install via"
                " `pip install nltk` or provide `meteor_fn`."
            )

        # Make sure the WordNet corpora METEOR depends on are present, trying a
        # quiet one-off download before giving up.
        if nltk is not None and wordnet is not None:
            try:
                wordnet.ensure_loaded()  # type: ignore[attr-defined]
            except (
                LookupError
            ):  # pragma: no cover - download path relies on network access
                try:
                    nltk.download("wordnet", quiet=True)
                    nltk.download("omw-1.4", quiet=True)
                    wordnet.ensure_loaded()  # type: ignore[attr-defined]
                except Exception as download_error:
                    raise ImportError(
                        "METEOR metric requires the NLTK corpora 'wordnet' and 'omw-1.4'. "
                        "Install manually via `python -m nltk.downloader wordnet omw-1.4`."
                    ) from download_error

        def _scorer(references: Sequence[str], hypothesis: str) -> float:
            # NLTK >= 3.6.4 requires pre-tokenized input (iterables of tokens);
            # passing raw strings raises TypeError. Tokenize on whitespace here
            # so the public MeteorFn interface keeps accepting plain strings.
            try:
                return float(
                    nltk_meteor_score.meteor_score(
                        [ref.split() for ref in references],
                        hypothesis.split(),
                        alpha=alpha,
                        beta=beta,
                        gamma=gamma,
                    )
                )
            except LookupError as error:
                raise MetricComputationError(
                    "NLTK resource requirement for METEOR not satisfied. "
                    "Download WordNet via `nltk.download('wordnet')`."
                ) from error

        self._meteor_fn = _scorer

    def score(
        self,
        output: str,
        reference: Union[str, Sequence[str]],
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` against one or more reference strings.

        Raises:
            MetricComputationError: If the candidate or any reference is empty.
        """
        if not output.strip():
            raise MetricComputationError("Candidate is empty (METEOR metric).")
        if isinstance(reference, str):
            references: Sequence[str] = [reference]
        else:
            references = list(reference)
        if not references or any(not ref.strip() for ref in references):
            raise MetricComputationError("Reference is empty (METEOR metric).")

        score = self._meteor_fn(references, output)
        return score_result.ScoreResult(
            value=float(score),
            name=self.name,
            reason=f"METEOR score: {float(score):.4f}",
        )