opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592):
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,92 @@
1
+ """Shared text preprocessing utilities for metrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import string
7
+ import unicodedata
8
+ from typing import Callable, Literal
9
+
10
+ try: # optional dependency for emoji detection
11
+ import emoji
12
+ except ImportError: # pragma: no cover
13
+ emoji = None # type: ignore
14
+
15
+ _Normalizer = Callable[[str], str]
16
+
17
+
18
def normalize_text(
    text: str,
    *,
    lowercase: bool = True,
    strip_accents: bool = False,
    remove_punctuation: bool = False,
    keep_emoji: bool = True,
    normalize_form: Literal["NFC", "NFD", "NFKC", "NFKD"] = "NFKC",
) -> str:
    """Normalize text before metric processing.

    Steps run in a fixed order: Unicode normalization, lowercasing, emoji
    removal, accent stripping, punctuation removal, then whitespace
    collapsing and trimming.

    Args:
        text: Input string.
        lowercase: Whether to lowercase the text.
        strip_accents: Remove diacritical marks.
        remove_punctuation: Strip ASCII punctuation and the Unicode dashes,
            quotes and general punctuation handled by ``_remove_punctuation``.
        keep_emoji: Preserve emoji characters; if False they are removed.
        normalize_form: Unicode normalization form to apply (default NFKC).

    Returns:
        The normalized text with whitespace runs collapsed to single spaces
        and leading/trailing whitespace removed.
    """

    normalized = unicodedata.normalize(normalize_form, text)
    if lowercase:
        normalized = normalized.lower()

    if not keep_emoji:
        normalized = _remove_emoji(normalized)

    if strip_accents:
        # Accent stripping works on the NFD decomposition, which also splits
        # characters that carry no accent at all (e.g. Hangul syllables).
        # Re-apply the requested form so the output honours `normalize_form`.
        normalized = unicodedata.normalize(
            normalize_form, _strip_accents(normalized)
        )

    if remove_punctuation:
        normalized = _remove_punctuation(normalized)

    normalized = _collapse_whitespace(normalized)
    return normalized.strip()
53
+
54
+
55
def _remove_emoji(text: str) -> str:
    """Return ``text`` without emoji.

    Uses the optional ``emoji`` package when it was importable; otherwise
    falls back to dropping every character whose Unicode category is
    ``So`` or ``Sk``.
    """
    if emoji is None:  # pragma: no cover
        symbol_categories = {"So", "Sk"}
        kept = [
            char
            for char in text
            if unicodedata.category(char) not in symbol_categories
        ]
        return "".join(kept)

    return emoji.replace_emoji(text, replace="")
61
+
62
+
63
+ def _strip_accents(text: str) -> str:
64
+ decomposed = unicodedata.normalize("NFD", text)
65
+ return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
66
+
67
+
68
+ def _remove_punctuation(text: str) -> str:
69
+ translator = str.maketrans("", "", string.punctuation)
70
+ stripped = text.translate(translator)
71
+ return re.sub(
72
+ r"[\u2010-\u2015\u2018-\u201f\u2020-\u2027\u2030-\u2043]", "", stripped
73
+ )
74
+
75
+
76
+ def _collapse_whitespace(text: str) -> str:
77
+ return re.sub(r"\s+", " ", text)
78
+
79
+
80
+ DEFAULT_NORMALIZER: _Normalizer = normalize_text
81
+
82
+
83
def ascii_normalizer(text: str) -> str:
    """Normalize ``text`` with accents, punctuation and emoji removed.

    Delegates to ``normalize_text`` and leaves its remaining options at their
    defaults. Characters outside those three groups are not touched, so the
    output is not guaranteed to be pure ASCII.
    """
    options = {
        "strip_accents": True,
        "remove_punctuation": True,
        "keep_emoji": False,
    }
    return normalize_text(text, **options)
90
+
91
+
92
+ ASCII_NORMALIZER: _Normalizer = ascii_normalizer
opik/evaluation/report.py CHANGED
@@ -1,11 +1,11 @@
1
1
  from collections import defaultdict
2
- from typing import Dict, List, Tuple
2
+ from typing import Dict, List, Optional, Tuple
3
3
 
4
4
  from rich import align, console, panel, table, text
5
5
 
6
6
 
7
- from .. import url_helpers
8
- from . import test_result
7
+ from . import test_result, evaluation_result
8
+ from .metrics import score_result
9
9
 
10
10
 
11
11
  def _format_time(seconds: float) -> str:
@@ -42,7 +42,10 @@ def _compute_average_scores(
42
42
 
43
43
 
44
44
  def display_experiment_results(
45
- dataset_name: str, total_time: float, test_results: List[test_result.TestResult]
45
+ dataset_name: str,
46
+ total_time: float,
47
+ test_results: List[test_result.TestResult],
48
+ experiment_scores: Optional[List[score_result.ScoreResult]] = None,
46
49
  ) -> None:
47
50
  average_scores, failed_scores = _compute_average_scores(test_results)
48
51
  nb_items = len(test_results)
@@ -63,6 +66,14 @@ def display_experiment_results(
63
66
  score_strings += text.Text(f" - {failed_scores[name]} failed", style="red")
64
67
  score_strings += text.Text("\n")
65
68
 
69
+ # Add experiment scores if available
70
+ if experiment_scores:
71
+ for score in experiment_scores:
72
+ score_strings += text.Text(
73
+ f"{score.name}: {score.value:.4f}", style="green bold"
74
+ )
75
+ score_strings += text.Text("\n")
76
+
66
77
  aligned_test_results = align.Align.left(score_strings)
67
78
 
68
79
  # Combine table, time text, and test results
@@ -87,16 +98,63 @@ def display_experiment_results(
87
98
  console_container.print("Uploading results to Opik ... ")
88
99
 
89
100
 
90
- def display_experiment_link(
91
- experiment_id: str, dataset_id: str, url_override: str
92
- ) -> None:
101
+ def display_experiment_link(experiment_url: str) -> None:
93
102
  console_container = console.Console()
94
103
 
95
- experiment_url = url_helpers.get_experiment_url_by_id(
96
- experiment_id=experiment_id,
97
- dataset_id=dataset_id,
98
- url_override=url_override,
99
- )
100
104
  console_container.print(
101
105
  f"View the results [link={experiment_url}]in your Opik dashboard[/link]."
102
106
  )
107
+
108
+
109
def display_evaluation_scores_statistics(
    dataset_name: str,
    evaluation_results: evaluation_result.EvaluationResult,
) -> None:
    """
    Prints a table of per-metric score statistics for an evaluation.

    The scores are aggregated through
    ``evaluation_results.aggregate_evaluation_scores()``. Each metric gets one
    row with its mean, min, max and standard deviation, formatted to four
    decimal places ("N/A" when no standard deviation is available). Nothing is
    printed when there are no aggregated scores.

    Args:
        dataset_name: Name of the dataset, shown in the panel title.
        evaluation_results: Evaluation results to aggregate and display.
    """
    aggregated_scores = (
        evaluation_results.aggregate_evaluation_scores().aggregated_scores
    )
    if not aggregated_scores:
        return

    stats_table = table.Table()
    stats_table.add_column("Name", style="cyan", no_wrap=True)
    # The numeric columns share right justification and differ only in colour.
    numeric_columns = (
        ("Mean", "green"),
        ("Min", "yellow"),
        ("Max", "yellow"),
        ("Std", "magenta"),
    )
    for header, colour in numeric_columns:
        stats_table.add_column(header, justify="right", style=colour)

    for metric_name, metric_stats in aggregated_scores.items():
        if metric_stats.std is None:
            std_cell = "N/A"
        else:
            std_cell = f"{metric_stats.std:.4f}"

        stats_table.add_row(
            metric_name,
            f"{metric_stats.mean:.4f}",
            f"{metric_stats.min:.4f}",
            f"{metric_stats.max:.4f}",
            std_cell,
        )

    statistics_panel = panel.Panel(
        stats_table,
        title=f"Evaluation statistics for {dataset_name}",
        title_align="left",
        expand=False,
    )
    console.Console().print(statistics_panel)
@@ -1,11 +1,14 @@
1
+ import logging
1
2
  from typing import List, Optional
2
3
 
3
- from opik.api_objects import experiment, opik_client
4
- from opik.types import FeedbackScoreDict
5
- from . import test_case, test_result
6
- from .metrics import arguments_helpers
4
+ from opik.api_objects import dataset, experiment, opik_client
5
+ from opik.types import BatchFeedbackScoreDict
6
+ from . import test_case
7
+ from .metrics import score_result
7
8
  from .types import ScoringKeyMappingType
8
9
 
10
+ LOGGER = logging.getLogger(__name__)
11
+
9
12
 
10
13
  def get_experiment_with_unique_name(
11
14
  client: opik_client.Opik, experiment_name: str
@@ -34,63 +37,64 @@ def get_trace_project_name(client: opik_client.Opik, trace_id: str) -> str:
34
37
 
35
38
 
36
39
  def get_experiment_test_cases(
37
- client: opik_client.Opik,
38
- experiment_id: str,
39
- dataset_id: str,
40
+ experiment_: experiment.Experiment,
41
+ dataset_: dataset.Dataset,
40
42
  scoring_key_mapping: Optional[ScoringKeyMappingType],
41
43
  ) -> List[test_case.TestCase]:
44
+ experiment_items = experiment_.get_items()
45
+
46
+ # Fetch dataset items to get input data for bulk-uploaded experiment items
47
+ dataset_items_by_id = {item["id"]: item for item in dataset_.get_items()}
48
+
42
49
  test_cases = []
43
- page = 1
50
+ for item in experiment_items:
51
+ dataset_item_data = dataset_items_by_id.get(item.dataset_item_id)
44
52
 
45
- while True:
46
- experiment_items_page = (
47
- client._rest_client.datasets.find_dataset_items_with_experiment_items(
48
- id=dataset_id, experiment_ids=f'["{experiment_id}"]', page=page
53
+ if dataset_item_data is None:
54
+ LOGGER.error(
55
+ f"Unexpected error: Dataset item with id {item.dataset_item_id} not found, skipping experiment item {item.id}"
56
+ )
57
+ continue
58
+
59
+ if item.evaluation_task_output is None:
60
+ LOGGER.error(
61
+ f"Unexpected error: Evaluation task output is None for experiment item {item.id}, skipping experiment item"
62
+ )
63
+ continue
64
+
65
+ test_cases.append(
66
+ test_case.TestCase(
67
+ trace_id=item.trace_id,
68
+ dataset_item_id=item.dataset_item_id,
69
+ task_output=item.evaluation_task_output,
70
+ dataset_item_content=dataset_item_data,
49
71
  )
50
72
  )
51
- if len(experiment_items_page.content) == 0:
52
- break
53
-
54
- for item in experiment_items_page.content:
55
- experiment_item = item.experiment_items[0]
56
-
57
- test_cases += [
58
- test_case.TestCase(
59
- trace_id=experiment_item.trace_id,
60
- dataset_item_id=experiment_item.dataset_item_id,
61
- task_output=experiment_item.output,
62
- scoring_inputs=arguments_helpers.create_scoring_inputs(
63
- dataset_item=experiment_item.input,
64
- task_output=experiment_item.output,
65
- scoring_key_mapping=scoring_key_mapping,
66
- ),
67
- )
68
- ]
69
-
70
- page += 1
71
73
 
72
74
  return test_cases
73
75
 
74
76
 
75
- def log_test_result_scores(
77
+ def log_test_result_feedback_scores(
76
78
  client: opik_client.Opik,
77
- test_result: test_result.TestResult,
79
+ score_results: List[score_result.ScoreResult],
80
+ trace_id: str,
78
81
  project_name: Optional[str],
79
82
  ) -> None:
80
- all_trace_scores: List[FeedbackScoreDict] = []
83
+ all_trace_scores: List[BatchFeedbackScoreDict] = []
81
84
 
82
- for score_result in test_result.score_results:
83
- if score_result.scoring_failed:
85
+ for score_result_ in score_results:
86
+ if score_result_.scoring_failed:
84
87
  continue
85
88
 
86
- trace_score = FeedbackScoreDict(
87
- id=test_result.test_case.trace_id,
88
- name=score_result.name,
89
- value=score_result.value,
90
- reason=score_result.reason,
89
+ trace_score = BatchFeedbackScoreDict(
90
+ id=trace_id,
91
+ name=score_result_.name,
92
+ value=score_result_.value,
93
+ reason=score_result_.reason,
91
94
  )
92
95
  all_trace_scores.append(trace_score)
93
96
 
94
- client.log_traces_feedback_scores(
95
- scores=all_trace_scores, project_name=project_name
96
- )
97
+ if len(all_trace_scores) > 0:
98
+ client.log_traces_feedback_scores(
99
+ scores=all_trace_scores, project_name=project_name
100
+ )
@@ -0,0 +1,4 @@
1
+ from .base_dataset_sampler import BaseDatasetSampler
2
+ from .random_dataset_sampler import RandomDatasetSampler
3
+
4
+ __all__ = ["BaseDatasetSampler", "RandomDatasetSampler"]
@@ -0,0 +1,40 @@
1
+ import abc
2
+ from typing import List
3
+
4
+
5
+ from opik.api_objects.dataset import dataset_item
6
+
7
+
8
class BaseDatasetSampler(abc.ABC):
    """
    Abstract base class for dataset samplers.

    A sampler receives the dataset items and returns the items that should
    actually be used. Concrete samplers must implement `sample`; a subclass
    that does not override it cannot be instantiated.
    """

    @abc.abstractmethod
    def sample(
        self, data_item: List[dataset_item.DatasetItem]
    ) -> List[dataset_item.DatasetItem]:
        """
        Samples and filters a list of dataset items according to a specific implementation.

        Args:
            data_item (List[dataset_item.DatasetItem]): The list of DatasetItem
                objects to be sampled and filtered (despite the singular name,
                this is the whole list).

        Returns:
            List[dataset_item.DatasetItem]: A list of DatasetItem objects resulting
                from the sampling process.

        Raises:
            NotImplementedError: If this base implementation is invoked, e.g.
                through ``super().sample(...)`` in a subclass.
        """
        # The documented contract is to raise; a bare `pass` silently returned
        # None to any subclass that delegated to the base implementation.
        raise NotImplementedError
@@ -0,0 +1,48 @@
1
+ import random
2
+ from typing import List, Optional
3
+
4
+ from opik.api_objects.dataset import dataset_item
5
+
6
+ from . import base_dataset_sampler
7
+
8
+
9
class RandomDatasetSampler(base_dataset_sampler.BaseDatasetSampler):
    """Dataset sampler that returns a random subset of the given items."""

    def __init__(
        self, max_samples: int, shuffle: bool = True, seed: Optional[int] = None
    ) -> None:
        """Samples a random subset of dataset items.

        Args:
            max_samples: The maximum number of samples to generate. Must not be
                negative.
            shuffle: Whether to shuffle the selected samples. Default is True,
                False provides a speedup for large datasets.
            seed: Seed for the random number generator. If None, then fresh,
                unpredictable entropy will be pulled from the OS.

        Raises:
            ValueError: If ``max_samples`` is negative.
        """
        # Fail at construction: a negative size would otherwise only surface
        # later, as a ValueError raised from inside random.sample().
        if max_samples < 0:
            raise ValueError(
                f"max_samples must be non-negative, got {max_samples}"
            )

        self.max_samples = max_samples
        self.shuffle = shuffle
        self.seed = seed

    def sample(
        self, data_items: List[dataset_item.DatasetItem]
    ) -> List[dataset_item.DatasetItem]:
        """Return up to ``max_samples`` randomly chosen items.

        A new generator is created from ``seed`` on every call, so with a fixed
        seed the same input list yields the same result each time.

        Args:
            data_items: Items to sample from.

        Returns:
            A new list with ``min(len(data_items), max_samples)`` items; empty
            when ``data_items`` is empty.
        """
        if not data_items:
            return []

        rng = random.Random(self.seed)
        sample_size = min(len(data_items), self.max_samples)

        # Sample before shuffling so only the selected subset is shuffled,
        # never the entire dataset.
        items = rng.sample(data_items, sample_size)

        if self.shuffle:
            rng.shuffle(items)

        return items
@@ -0,0 +1,66 @@
1
+ import dataclasses
2
+ import math
3
+ import statistics
4
+ from collections import defaultdict
5
+ from typing import List, Optional, Dict
6
+
7
+ from opik.evaluation import test_result
8
+
9
+
10
@dataclasses.dataclass
class ScoreStatistics:
    """Aggregated statistics of one score name over all the values collected for it."""

    # Mean of `values`.
    mean: float
    # Largest entry of `values`.
    max: float
    # Smallest entry of `values`.
    min: float
    # The individual score values the statistics were computed from.
    values: List[float]
    # Standard deviation of `values`; None when fewer than two values exist.
    std: Optional[float] = None
19
+
20
+
21
def calculate_aggregated_statistics(
    evaluation_results: List[test_result.TestResult],
) -> Dict[str, ScoreStatistics]:
    """
    Calculate mean, max, min and standard deviation for each score name.

    Scores whose scoring failed, or whose value is not a finite number, are
    left out. A score name with no usable value does not appear in the result.

    Args:
        evaluation_results: List of TestResult objects to be aggregated

    Returns:
        Dict mapping score names to their aggregated statistics; empty when
        no test results are given.
    """
    if not evaluation_results:
        return {}

    # Collect the usable values of every score name across all test results.
    values_by_name = defaultdict(list)
    for single_result in evaluation_results:
        for score in single_result.score_results:
            if score.scoring_failed:
                continue
            if not _is_valid_score_value(score.value):
                continue
            values_by_name[score.name].append(score.value)

    # Every list holds at least one value: entries are only created by the
    # append above.
    summary = {}
    for name, collected in values_by_name.items():
        # A standard deviation needs at least two data points.
        spread = None if len(collected) < 2 else statistics.stdev(collected)
        summary[name] = ScoreStatistics(
            mean=statistics.mean(collected),
            max=max(collected),
            min=min(collected),
            values=collected.copy(),
            std=spread,
        )

    return summary
62
+
63
+
64
+ def _is_valid_score_value(value: float) -> bool:
65
+ """Check if a score value is valid for statistical calculations."""
66
+ return isinstance(value, (int, float)) and math.isfinite(value)
@@ -0,0 +1,4 @@
1
+ from .scorer_function import ScorerFunction
2
+ from .scorer_wrapper_metric import ScorerWrapperMetric
3
+
4
+ __all__ = ["ScorerFunction", "ScorerWrapperMetric"]
@@ -0,0 +1,55 @@
1
+ import inspect
2
+ from typing import Any, Dict, Optional, Protocol, Union, List
3
+
4
+ from opik.evaluation.metrics import score_result
5
+ from opik.message_processing.emulation import models
6
+
7
+
8
class ScorerFunctionProtocol(Protocol):
    """
    Structural type of a scorer function.

    A scorer is any callable that accepts the dataset item data and the task
    outputs, optionally the task span model, and returns one score result or
    a list of score results.
    """

    def __call__(
        self,
        dataset_item: Dict[str, Any],
        task_outputs: Dict[str, Any],
        task_span: Optional[models.SpanModel] = None,
    ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
        """Score the task outputs for one dataset item."""
        ...
24
+
25
+
26
+ ScorerFunction = ScorerFunctionProtocol
27
+
28
+
29
+ EXPECTED_SCORER_FUNCTION_PARAMETERS = ["dataset_item", "task_outputs"]
30
+
31
+
32
def validate_scorer_function(scorer_function: ScorerFunction) -> None:
    """Check that ``scorer_function`` has a usable scorer signature.

    The callable is accepted when it declares a ``task_span`` parameter, or
    when it declares every name in ``EXPECTED_SCORER_FUNCTION_PARAMETERS``.

    Args:
        scorer_function: The object to validate.

    Raises:
        ValueError: If the object is not callable or declares neither set of
            parameters.
    """
    if not callable(scorer_function):
        raise ValueError("scorer_function must be a callable function")

    names = set(inspect.signature(scorer_function).parameters.keys())

    # Either parameter set is enough on its own.
    if "task_span" in names:
        return
    if names.issuperset(EXPECTED_SCORER_FUNCTION_PARAMETERS):
        return

    raise ValueError(
        f"scorer_function must have either both 'dataset_item' and 'task_outputs' parameters "
        f"or at least one 'task_span' parameter. Found parameters: {list(names)}"
    )
52
+
53
+
54
def has_task_span_in_parameters(scorer_function: ScorerFunction) -> bool:
    """Tell whether the scorer's signature declares a ``task_span`` parameter."""
    signature = inspect.signature(scorer_function)
    return "task_span" in signature.parameters
@@ -0,0 +1,130 @@
1
+ from typing import Any, Callable, Dict, Optional, List, Union
2
+
3
+ from opik.evaluation.metrics import base_metric, score_result
4
+
5
+ from . import scorer_function
6
+ from ...message_processing.emulation import models
7
+
8
+
9
class ScorerWrapperMetric(base_metric.BaseMetric):
    """
    A wrapper metric that adapts a ScorerFunction to the BaseMetric interface.

    This class allows using ScorerFunction instances as BaseMetric instances,
    providing compatibility between the two interfaces.

    Args:
        scorer: The ScorerFunction to wrap
        name: Name for the metric (required).
        track: Whether to track the metric. Defaults to True.
        project_name: Optional project name for tracking.

    Raises:
        ValueError if the scorer function is invalid.

    Example:
        >>> def my_scorer(dataset_item: Dict[str, Any], task_outputs: Dict[str, Any]) -> score_result.ScoreResult:
        >>>     return score_result.ScoreResult(name="my_metric", value=1.0)
        >>>
        >>> wrapper = ScorerWrapperMetric(scorer=my_scorer, name="wrapped_scorer")
        >>> result = wrapper.score(dataset_item={"text": "hello"}, task_outputs={"text": "hello"})
    """

    def __init__(
        self,
        scorer: scorer_function.ScorerFunction,
        name: str,
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        # Validate first so an invalid scorer is rejected before the base
        # metric is initialised.
        scorer_function.validate_scorer_function(scorer)

        super().__init__(name=name, track=track, project_name=project_name)
        self.scorer = scorer

    def score(
        self,
        dataset_item: Dict[str, Any],
        task_outputs: Dict[str, Any],
        **kwargs: Any,
    ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
        """
        Score using the wrapped ScorerFunction.

        Args:
            dataset_item: The dataset item data to score against
            task_outputs: The output dictionary to be scored - can be the output of LLM task, etc.
            **kwargs: Additional keyword arguments; they are not forwarded to
                the scorer function.

        Returns:
            The ScoreResult, or list of ScoreResult objects, returned by the
            wrapped scorer function
        """
        return self.scorer(dataset_item=dataset_item, task_outputs=task_outputs)
64
+
65
+
66
class ScorerWrapperMetricTaskSpan(ScorerWrapperMetric):
    """
    Variant of ScorerWrapperMetric that can also hand the task span to the scorer.

    Construction is inherited unchanged from ScorerWrapperMetric.
    """

    def score(
        self,
        dataset_item: Dict[str, Any],
        task_outputs: Dict[str, Any],
        task_span: Optional[models.SpanModel] = None,
        **kwargs: Any,
    ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
        """
        Score using the wrapped ScorerFunction.

        Only the arguments the scorer declares are forwarded, so a scorer that
        takes just ``task_span`` (which scorer validation accepts) can be
        called without a TypeError for unexpected keyword arguments.

        Args:
            dataset_item: The dataset item data to score against
            task_outputs: The output dictionary to be scored - can be the output of LLM task, etc.
            task_span: The collected task span data. Forwarded only when it is
                not None and the scorer declares a ``task_span`` parameter.
            **kwargs: Additional keyword arguments; they are not forwarded to
                the scorer function.

        Returns:
            The ScoreResult, or list of ScoreResult objects, returned by the
            wrapped scorer function
        """
        # Function-scope import: keeps this block self-contained without
        # touching the module's import section.
        import inspect

        parameters = inspect.signature(self.scorer).parameters
        accepts_any_keyword = any(
            parameter.kind is inspect.Parameter.VAR_KEYWORD
            for parameter in parameters.values()
        )

        call_arguments: Dict[str, Any] = {}
        regular_arguments = (
            ("dataset_item", dataset_item),
            ("task_outputs", task_outputs),
        )
        for argument_name, argument_value in regular_arguments:
            # A **kwargs scorer keeps receiving both values, as before.
            if argument_name in parameters or accepts_any_keyword:
                call_arguments[argument_name] = argument_value

        # The span is passed only to scorers that name it explicitly.
        if task_span is not None and "task_span" in parameters:
            call_arguments["task_span"] = task_span

        return self.scorer(**call_arguments)
107
+
108
+
109
+ def _scorer_name(scorer: Callable) -> str:
110
+ return scorer.__name__
111
+
112
+
113
def wrap_scorer_functions(
    scorer_functions: List[scorer_function.ScorerFunction], project_name: Optional[str]
) -> List[base_metric.BaseMetric]:
    """Wrap each scorer function in a metric named after the function.

    Scorers that declare a ``task_span`` parameter are wrapped in
    ScorerWrapperMetricTaskSpan, all others in ScorerWrapperMetric.

    Args:
        scorer_functions: The scorer functions to wrap.
        project_name: Project name passed on to every wrapper.

    Returns:
        One metric per scorer function, in the same order.
    """

    def _wrap(scorer: scorer_function.ScorerFunction) -> base_metric.BaseMetric:
        # Resolve the name before inspecting the signature, as the original
        # loop did.
        metric_name = _scorer_name(scorer)
        if scorer_function.has_task_span_in_parameters(scorer):
            wrapper_class = ScorerWrapperMetricTaskSpan
        else:
            wrapper_class = ScorerWrapperMetric
        return wrapper_class(
            scorer=scorer, project_name=project_name, name=metric_name
        )

    return [_wrap(scorer) for scorer in scorer_functions]