opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
@@ -1,34 +1,37 @@
1
1
  import functools
2
2
  import logging
3
- from typing import List, Optional
3
+ from typing import List, Optional, Any, Dict
4
4
 
5
- import opik.exceptions as exceptions
6
5
  import opik.logging_messages as logging_messages
7
6
  import opik.opik_context as opik_context
8
7
  import opik
9
- from opik.api_objects import opik_client, trace
8
+ from opik.api_objects import opik_client, trace, local_recording
10
9
  from opik.api_objects.dataset import dataset, dataset_item
11
10
  from opik.api_objects.experiment import experiment
12
11
  from opik.evaluation import (
13
12
  rest_operations,
14
13
  test_case,
15
14
  test_result,
15
+ samplers,
16
16
  )
17
17
  from opik.evaluation.types import LLMTask, ScoringKeyMappingType
18
18
 
19
- from . import evaluation_tasks_executor, exception_analyzer, helpers
19
+ from . import evaluation_tasks_executor, exception_analyzer, helpers, metrics_evaluator
20
20
  from .types import EvaluationTask
21
- from ..metrics import arguments_validator, arguments_helpers, base_metric, score_result
21
+ from ..metrics import base_metric, score_result
22
+ from ...message_processing.emulation import models
23
+
22
24
 
23
25
  LOGGER = logging.getLogger(__name__)
24
26
 
27
+ EVALUATION_TASK_NAME = "evaluation_task"
28
+
25
29
 
26
30
  class EvaluationEngine:
27
31
  def __init__(
28
32
  self,
29
33
  client: opik_client.Opik,
30
34
  project_name: Optional[str],
31
- experiment_: experiment.Experiment,
32
35
  scoring_metrics: List[base_metric.BaseMetric],
33
36
  workers: int,
34
37
  verbose: int,
@@ -36,73 +39,76 @@ class EvaluationEngine:
36
39
  ) -> None:
37
40
  self._client = client
38
41
  self._project_name = project_name
39
- self._experiment = experiment_
40
42
  self._workers = workers
41
43
  self._verbose = verbose
42
- self._scoring_metrics = scoring_metrics
43
- self._scoring_key_mapping = scoring_key_mapping
44
+
45
+ # Delegate metric analysis to MetricsEvaluator
46
+ self._metrics_evaluator = metrics_evaluator.MetricsEvaluator(
47
+ scoring_metrics=scoring_metrics,
48
+ scoring_key_mapping=scoring_key_mapping,
49
+ )
44
50
 
45
51
  @opik.track(name="metrics_calculation") # type: ignore[attr-defined,has-type]
46
- def _evaluate_test_case(
52
+ def _compute_test_result_for_test_case(
47
53
  self,
48
54
  test_case_: test_case.TestCase,
55
+ trial_id: int = 0,
49
56
  ) -> test_result.TestResult:
50
- score_results: List[score_result.ScoreResult] = []
51
-
52
- for metric in self._scoring_metrics:
53
- try:
54
- score_kwargs = test_case_.scoring_inputs
55
- arguments_validator.validate_score_arguments(
56
- metric=metric,
57
- kwargs=score_kwargs,
58
- scoring_key_mapping=self._scoring_key_mapping,
59
- )
60
- LOGGER.debug("Metric %s score started", metric.name)
61
- result = metric.score(**score_kwargs)
62
- LOGGER.debug("Metric %s score ended", metric.name)
63
-
64
- if isinstance(result, list):
65
- score_results += result
66
- else:
67
- score_results.append(result)
68
- except exceptions.ScoreMethodMissingArguments:
69
- raise
70
- except Exception as exception:
71
- # This can be problematic if the metric returns a list of strings as we will not know the name of the metrics that have failed
72
- LOGGER.error(
73
- "Failed to compute metric %s. Score result will be marked as failed.",
74
- metric.name,
75
- exc_info=True,
76
- )
77
-
78
- if exception_analyzer.is_llm_provider_rate_limit_error(exception):
79
- LOGGER.error(
80
- logging_messages.LLM_PROVIDER_RATE_LIMIT_ERROR_DETECTED_IN_EVALUATE_FUNCTION
81
- )
82
-
83
- score_results.append(
84
- score_result.ScoreResult(
85
- name=metric.name,
86
- value=0.0,
87
- reason=str(exception),
88
- scoring_failed=True,
89
- )
90
- )
57
+ score_results, mapped_scoring_inputs = (
58
+ self._metrics_evaluator.compute_regular_scores(
59
+ dataset_item_content=test_case_.dataset_item_content,
60
+ task_output=test_case_.task_output,
61
+ )
62
+ )
63
+ test_case_.mapped_scoring_inputs = mapped_scoring_inputs
91
64
 
92
65
  test_result_ = test_result.TestResult(
93
- test_case=test_case_, score_results=score_results
66
+ test_case=test_case_,
67
+ score_results=score_results,
68
+ trial_id=trial_id,
94
69
  )
95
- rest_operations.log_test_result_scores(
70
+ rest_operations.log_test_result_feedback_scores(
96
71
  client=self._client,
97
- test_result=test_result_,
72
+ score_results=score_results,
73
+ trace_id=test_case_.trace_id,
98
74
  project_name=self._project_name,
99
75
  )
100
76
  return test_result_
101
77
 
102
- def _evaluate_llm_task(
78
+ @opik.track( # type: ignore[attr-defined,has-type]
79
+ name="task_span_metrics_calculation",
80
+ ignore_arguments=["test_case_"],
81
+ )
82
+ def _compute_scores_for_test_case_with_task_span(
83
+ self,
84
+ trace_id: str,
85
+ task_span: models.SpanModel,
86
+ test_case_: test_case.TestCase,
87
+ ) -> List[score_result.ScoreResult]:
88
+ score_results, mapped_scoring_inputs = (
89
+ self._metrics_evaluator.compute_task_span_scores(
90
+ dataset_item_content=test_case_.dataset_item_content,
91
+ task_output=test_case_.task_output,
92
+ task_span=task_span,
93
+ )
94
+ )
95
+ test_case_.mapped_scoring_inputs = mapped_scoring_inputs
96
+
97
+ # log feedback scores
98
+ rest_operations.log_test_result_feedback_scores(
99
+ client=self._client,
100
+ score_results=score_results,
101
+ trace_id=trace_id,
102
+ project_name=self._project_name,
103
+ )
104
+ return score_results
105
+
106
+ def _compute_test_result_for_llm_task(
103
107
  self,
104
108
  item: dataset_item.DatasetItem,
105
109
  task: LLMTask,
110
+ trial_id: int,
111
+ experiment_: Optional[experiment.Experiment],
106
112
  ) -> test_result.TestResult:
107
113
  if not hasattr(task, "opik_tracked"):
108
114
  name = task.__name__ if hasattr(task, "__name__") else "llm_task"
@@ -111,13 +117,13 @@ class EvaluationEngine:
111
117
  item_content = item.get_content(include_id=True)
112
118
  trace_data = trace.TraceData(
113
119
  input=item_content,
114
- name="evaluation_task",
120
+ name=EVALUATION_TASK_NAME,
115
121
  created_by="evaluation",
116
122
  project_name=self._project_name,
117
123
  )
118
124
 
119
125
  with helpers.evaluate_llm_task_context(
120
- experiment=self._experiment,
126
+ experiment=experiment_,
121
127
  dataset_item_id=item.id,
122
128
  trace_data=trace_data,
123
129
  client=self._client,
@@ -136,49 +142,238 @@ class EvaluationEngine:
136
142
 
137
143
  opik_context.update_current_trace(output=task_output_)
138
144
 
139
- scoring_inputs = arguments_helpers.create_scoring_inputs(
140
- dataset_item=item_content,
141
- task_output=task_output_,
142
- scoring_key_mapping=self._scoring_key_mapping,
143
- )
144
-
145
145
  test_case_ = test_case.TestCase(
146
146
  trace_id=trace_data.id,
147
147
  dataset_item_id=item.id,
148
- scoring_inputs=scoring_inputs,
149
148
  task_output=task_output_,
149
+ dataset_item_content=item_content,
150
150
  )
151
- test_result_ = self._evaluate_test_case(
151
+ test_result_ = self._compute_test_result_for_test_case(
152
152
  test_case_=test_case_,
153
+ trial_id=trial_id,
153
154
  )
154
155
 
155
156
  return test_result_
156
157
 
157
- def evaluate_llm_tasks(
158
+ def _compute_test_results_for_llm_task(
159
+ self,
160
+ dataset_items: List[dataset_item.DatasetItem],
161
+ task: LLMTask,
162
+ experiment_: Optional[experiment.Experiment],
163
+ trial_count: int,
164
+ description: str,
165
+ ) -> List[test_result.TestResult]:
166
+ test_results: List[test_result.TestResult] = []
167
+
168
+ for trial_id in range(trial_count):
169
+ evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
170
+ functools.partial(
171
+ self._compute_test_result_for_llm_task,
172
+ item=item,
173
+ task=task,
174
+ trial_id=trial_id,
175
+ experiment_=experiment_,
176
+ )
177
+ for item in dataset_items
178
+ ]
179
+
180
+ test_results += evaluation_tasks_executor.execute(
181
+ evaluation_tasks=evaluation_tasks,
182
+ workers=self._workers,
183
+ verbose=self._verbose,
184
+ desc=f"{description} trial {trial_id}"
185
+ if trial_count > 1
186
+ else description,
187
+ )
188
+
189
+ return test_results
190
+
191
+ def _update_test_result_with_task_span_metrics(
192
+ self,
193
+ evaluation_task_result: test_result.TestResult,
194
+ trace_trees: List[models.TraceModel],
195
+ ) -> test_result.TestResult:
196
+ # find related trace
197
+ trace_id = evaluation_task_result.test_case.trace_id
198
+ task_trace = None
199
+ for trace_ in trace_trees:
200
+ if trace_.id == trace_id:
201
+ task_trace = trace_
202
+ break
203
+
204
+ if task_trace is None:
205
+ raise ValueError(
206
+ f"No trace found for test result: {evaluation_task_result}"
207
+ )
208
+
209
+ # find evaluation span
210
+ if len(task_trace.spans) == 0:
211
+ raise ValueError(
212
+ f"Task trace contains no spans. Task span metrics require at least one span to be present in the execution trace. Test result: {evaluation_task_result}"
213
+ )
214
+ # the first span is the evaluation span
215
+ evaluation_span = task_trace.spans[0]
216
+
217
+ with helpers.evaluate_llm_task_result_spans_context(
218
+ trace_data=trace.TraceData(
219
+ id=trace_id,
220
+ name=task_trace.name,
221
+ start_time=task_trace.start_time,
222
+ metadata=task_trace.metadata,
223
+ input=task_trace.input,
224
+ output=task_trace.output,
225
+ tags=task_trace.tags,
226
+ project_name=self._project_name,
227
+ created_by="evaluation",
228
+ error_info=task_trace.error_info,
229
+ thread_id=task_trace.thread_id,
230
+ ),
231
+ client=self._client,
232
+ ):
233
+ score_results = self._compute_scores_for_test_case_with_task_span(
234
+ trace_id=trace_id,
235
+ task_span=evaluation_span,
236
+ test_case_=evaluation_task_result.test_case,
237
+ )
238
+ # append scores to the input test result
239
+ evaluation_task_result.score_results += score_results
240
+ return evaluation_task_result
241
+
242
+ def _update_test_results_with_task_span_metrics(
243
+ self,
244
+ test_results: List[test_result.TestResult],
245
+ recording: local_recording._LocalRecordingHandle,
246
+ ) -> None:
247
+ """Evaluate task spans from a local recording."""
248
+ # Get trace trees from the recording (this flushes automatically)
249
+ trace_trees = recording.trace_trees
250
+ if len(trace_trees) == 0:
251
+ LOGGER.warning("No trace trees found in the local recording.")
252
+ return
253
+
254
+ # Create span evaluation tasks from LLM tasks evaluation results and evaluate them in parallel
255
+ span_evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
256
+ functools.partial(
257
+ self._update_test_result_with_task_span_metrics,
258
+ evaluation_task_result=test_result_,
259
+ trace_trees=trace_trees,
260
+ )
261
+ for test_result_ in test_results
262
+ ]
263
+
264
+ evaluation_tasks_executor.execute(
265
+ evaluation_tasks=span_evaluation_tasks,
266
+ workers=self._workers,
267
+ verbose=self._verbose,
268
+ desc="LLM task spans evaluation",
269
+ )
270
+
271
+ LOGGER.debug(
272
+ "Task evaluation span handling is disabled — the evaluation has been completed."
273
+ )
274
+
275
+ def evaluate_llm_task_on_dataset(
158
276
  self,
159
277
  dataset_: dataset.Dataset,
160
278
  task: LLMTask,
161
279
  nb_samples: Optional[int],
162
280
  dataset_item_ids: Optional[List[str]],
281
+ dataset_sampler: Optional[samplers.BaseDatasetSampler],
282
+ trial_count: int,
283
+ experiment_: Optional[experiment.Experiment],
163
284
  ) -> List[test_result.TestResult]:
164
285
  dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(
165
286
  nb_samples=nb_samples,
166
287
  dataset_item_ids=dataset_item_ids,
167
288
  )
168
289
 
169
- evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
170
- functools.partial(
171
- self._evaluate_llm_task,
172
- item=item,
290
+ if dataset_sampler is not None:
291
+ dataset_items = dataset_sampler.sample(dataset_items)
292
+
293
+ if not self._metrics_evaluator.has_task_span_metrics:
294
+ return self._compute_test_results_for_llm_task(
295
+ dataset_items=dataset_items,
173
296
  task=task,
297
+ experiment_=experiment_,
298
+ trial_count=trial_count,
299
+ description="Evaluation",
174
300
  )
175
- for item in dataset_items
301
+
302
+ LOGGER.debug(
303
+ "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
304
+ len(self._metrics_evaluator.task_span_metrics),
305
+ )
306
+
307
+ with local_recording.record_traces_locally(client=self._client) as recording:
308
+ test_results = self._compute_test_results_for_llm_task(
309
+ dataset_items=dataset_items,
310
+ task=task,
311
+ experiment_=experiment_,
312
+ trial_count=trial_count,
313
+ description="Evaluation",
314
+ )
315
+ self._update_test_results_with_task_span_metrics(
316
+ test_results=test_results,
317
+ recording=recording,
318
+ )
319
+
320
+ return test_results
321
+
322
+ def evaluate_llm_task_on_dict_items(
323
+ self,
324
+ items: List[Dict[str, Any]],
325
+ task: LLMTask,
326
+ ) -> List[test_result.TestResult]:
327
+ """
328
+ Evaluate an LLM task on a list of dict items.
329
+
330
+ This method creates traces for each evaluation but doesn't require a Dataset object
331
+ or experiment. It's useful for optimization scenarios where you have items in memory
332
+ and want to evaluate them with a task function.
333
+
334
+ Args:
335
+ items: List of dataset item contents (dictionaries).
336
+ task: A callable that takes a dataset item dict and returns a dict with outputs.
337
+
338
+ Returns:
339
+ List of TestResult objects containing scores for each item.
340
+ """
341
+ # Convert raw items to DatasetItem objects for compatibility
342
+ dataset_items = [
343
+ dataset_item.DatasetItem(
344
+ id=f"temp_item_{idx}",
345
+ **item,
346
+ )
347
+ for idx, item in enumerate(items)
176
348
  ]
177
349
 
178
- test_results = evaluation_tasks_executor.execute(
179
- evaluation_tasks, self._workers, self._verbose
350
+ if not self._metrics_evaluator.has_task_span_metrics:
351
+ return self._compute_test_results_for_llm_task(
352
+ dataset_items=dataset_items,
353
+ task=task,
354
+ experiment_=None,
355
+ trial_count=1,
356
+ description="Items evaluation",
357
+ )
358
+
359
+ LOGGER.debug(
360
+ "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
361
+ len(self._metrics_evaluator.task_span_metrics),
180
362
  )
181
363
 
364
+ with local_recording.record_traces_locally(client=self._client) as recording:
365
+ test_results = self._compute_test_results_for_llm_task(
366
+ dataset_items=dataset_items,
367
+ task=task,
368
+ experiment_=None,
369
+ trial_count=1,
370
+ description="Items evaluation",
371
+ )
372
+ self._update_test_results_with_task_span_metrics(
373
+ test_results=test_results,
374
+ recording=recording,
375
+ )
376
+
182
377
  return test_results
183
378
 
184
379
  def evaluate_test_cases(
@@ -187,14 +382,16 @@ class EvaluationEngine:
187
382
  ) -> List[test_result.TestResult]:
188
383
  evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
189
384
  functools.partial(
190
- self._evaluate_test_case,
385
+ self._compute_test_result_for_test_case,
191
386
  test_case_=test_case_,
192
387
  )
193
388
  for test_case_ in test_cases
194
389
  ]
195
390
 
196
391
  test_results = evaluation_tasks_executor.execute(
197
- evaluation_tasks, self._workers, self._verbose
392
+ evaluation_tasks=evaluation_tasks,
393
+ workers=self._workers,
394
+ verbose=self._verbose,
198
395
  )
199
396
 
200
397
  return test_results
@@ -10,7 +10,10 @@ T = TypeVar("T")
10
10
 
11
11
 
12
12
  def execute(
13
- evaluation_tasks: List[EvaluationTask[T]], workers: int, verbose: int
13
+ evaluation_tasks: List[EvaluationTask[T]],
14
+ workers: int,
15
+ verbose: int,
16
+ desc: str = "Evaluation",
14
17
  ) -> List[T]:
15
18
  if workers == 1:
16
19
  test_results = [
@@ -18,7 +21,7 @@ def execute(
18
21
  for evaluation_task in _tqdm(
19
22
  evaluation_tasks,
20
23
  disable=(verbose < 1),
21
- desc="Evaluation",
24
+ desc=desc,
22
25
  total=len(evaluation_tasks),
23
26
  )
24
27
  ]
@@ -37,7 +40,7 @@ def execute(
37
40
  test_result_futures,
38
41
  ),
39
42
  disable=(verbose < 1),
40
- desc="Evaluation",
43
+ desc=desc,
41
44
  total=len(test_result_futures),
42
45
  )
43
46
  ]
@@ -11,7 +11,7 @@ import opik.context_storage as context_storage
11
11
 
12
12
  @contextlib.contextmanager
13
13
  def evaluate_llm_task_context(
14
- experiment: experiment.Experiment,
14
+ experiment: Optional[experiment.Experiment],
15
15
  dataset_item_id: str,
16
16
  trace_data: trace.TraceData,
17
17
  client: opik_client.Opik,
@@ -36,9 +36,34 @@ def evaluate_llm_task_context(
36
36
  client = client if client is not None else opik_client.get_client_cached()
37
37
  client.trace(**trace_data.as_parameters)
38
38
 
39
- experiment_item_ = experiment_item.ExperimentItemReferences(
40
- dataset_item_id=dataset_item_id,
41
- trace_id=trace_data.id,
42
- )
39
+ # Only insert experiment item if an experiment is provided
40
+ if experiment is not None:
41
+ experiment_item_ = experiment_item.ExperimentItemReferences(
42
+ dataset_item_id=dataset_item_id,
43
+ trace_id=trace_data.id,
44
+ )
45
+ experiment.insert(experiment_items_references=[experiment_item_])
43
46
 
44
- experiment.insert(experiment_items_references=[experiment_item_])
47
+
48
+ @contextlib.contextmanager
49
+ def evaluate_llm_task_result_spans_context(
50
+ trace_data: trace.TraceData,
51
+ client: opik_client.Opik,
52
+ ) -> Iterator[None]:
53
+ error_info: Optional[ErrorInfoDict] = None
54
+ try:
55
+ context_storage.set_trace_data(trace_data)
56
+ yield
57
+ except Exception as exception:
58
+ error_info = error_info_collector.collect(exception)
59
+ raise
60
+ finally:
61
+ trace_data = context_storage.pop_trace_data() # type: ignore
62
+
63
+ assert trace_data is not None
64
+
65
+ if error_info is not None:
66
+ trace_data.error_info = error_info
67
+
68
+ trace_data.init_end_time()
69
+ client.trace(**trace_data.as_parameters)