opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
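Among the files added above is a new chat prompt subpackage (opik/api_objects/prompt/chat/). The short sketch below illustrates how its template class appears to be used, based solely on the constructor and format() calls visible in the evaluator diff that follows; the message content, the mustache-style placeholder, and the printed results are illustrative assumptions rather than documented API.

# Minimal sketch, assuming only the call shapes seen in the evaluator diff below.
from opik.api_objects.prompt.chat import chat_prompt_template

# Message content and the {{article}} placeholder are illustrative.
template = chat_prompt_template.ChatPromptTemplate(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize this text: {{article}}"},
    ],
    validate_placeholders=False,  # the evaluator disables validation before formatting
)

# The evaluator uses required_modalities() to detect vision/video content in the prompt.
print(template.required_modalities())

# format() is called in the evaluator with template variables and a supported-modalities
# mapping; the rendered messages are then sent to the model.
rendered_messages = template.format(
    variables={"article": "Opik 1.9 adds structured chat prompts."},
    supported_modalities={"vision": False, "video": False},
)
print(rendered_messages)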
@@ -1,37 +1,98 @@
1
1
  import logging
2
2
  import time
3
- from typing import Any, Callable, Dict, List, Optional, Union
3
+ from typing import Any, Callable, Dict, List, Optional, Union, cast
4
4
 
5
- from .. import Prompt
5
+ from ..api_objects.prompt import base_prompt
6
6
  from ..api_objects import opik_client
7
7
  from ..api_objects import dataset, experiment
8
8
  from ..api_objects.experiment import helpers as experiment_helpers
9
- from ..api_objects.prompt import prompt_template
10
- from . import asyncio_support, engine, evaluation_result, report, rest_operations
11
- from .metrics import base_metric
12
- from .models import base_model, models_factory
13
- from .types import LLMTask, ScoringKeyMappingType
9
+ from ..api_objects.prompt.chat import chat_prompt_template
10
+ from ..api_objects.prompt import types as prompt_types
11
+ from . import (
12
+ asyncio_support,
13
+ engine,
14
+ evaluation_result,
15
+ report,
16
+ rest_operations,
17
+ samplers,
18
+ )
19
+ from .metrics import base_metric, score_result
20
+ from .models import ModelCapabilities, base_model, models_factory
21
+ from .scorers import scorer_function, scorer_wrapper_metric
22
+ from . import test_result
23
+ from .types import ExperimentScoreFunction, LLMTask, ScoringKeyMappingType
24
+ from .. import url_helpers
14
25
 
15
26
  LOGGER = logging.getLogger(__name__)
27
+ MODALITY_SUPPORT_DOC_URL = (
28
+ "https://www.comet.com/docs/opik/evaluation/evaluate_multimodal"
29
+ )
30
+
31
+
32
+ def _try_notifying_about_experiment_completion(
33
+ experiment: experiment.Experiment,
34
+ ) -> None:
35
+ try:
36
+ experiment.experiments_rest_client.finish_experiments(ids=[experiment.id])
37
+ except Exception:
38
+ LOGGER.debug(
39
+ "Failed to notify backend about the experiment completion. Experiment ID: %s",
40
+ experiment.id,
41
+ exc_info=True,
42
+ )
43
+
44
+
45
+ def _compute_experiment_scores(
46
+ experiment_scoring_functions: List[ExperimentScoreFunction],
47
+ test_results: List[test_result.TestResult],
48
+ ) -> List[score_result.ScoreResult]:
49
+ """Compute experiment-level scores from test results."""
50
+ if not experiment_scoring_functions or not test_results:
51
+ return []
52
+
53
+ all_scores: List[score_result.ScoreResult] = []
54
+ for score_function in experiment_scoring_functions:
55
+ try:
56
+ scores = score_function(test_results)
57
+ # Handle Union[ScoreResult, List[ScoreResult]]
58
+ if isinstance(scores, list):
59
+ all_scores.extend(scores)
60
+ else:
61
+ all_scores.append(scores)
62
+ except Exception as e:
63
+ LOGGER.warning(
64
+ "Failed to compute experiment score: %s",
65
+ e,
66
+ exc_info=True,
67
+ )
68
+
69
+ return all_scores
16
70
 
17
71
 
18
72
  def evaluate(
19
73
  dataset: dataset.Dataset,
20
74
  task: LLMTask,
21
75
  scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
76
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
77
+ experiment_name_prefix: Optional[str] = None,
22
78
  experiment_name: Optional[str] = None,
23
79
  project_name: Optional[str] = None,
24
80
  experiment_config: Optional[Dict[str, Any]] = None,
25
81
  verbose: int = 1,
26
82
  nb_samples: Optional[int] = None,
27
83
  task_threads: int = 16,
28
- prompt: Optional[Prompt] = None,
29
- prompts: Optional[List[Prompt]] = None,
84
+ prompt: Optional[base_prompt.BasePrompt] = None,
85
+ prompts: Optional[List[base_prompt.BasePrompt]] = None,
30
86
  scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
31
87
  dataset_item_ids: Optional[List[str]] = None,
88
+ dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
89
+ trial_count: int = 1,
90
+ experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
32
91
  ) -> evaluation_result.EvaluationResult:
33
92
  """
34
- Performs task evaluation on a given dataset.
93
+ Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
94
+ evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
95
+ to receive inputs and outputs from the task.
35
96
 
36
97
  Args:
37
98
  dataset: An Opik dataset instance
@@ -39,6 +100,10 @@ def evaluate(
39
100
  task: A callable object that takes dict with dataset item content
40
101
  as input and returns dict which will later be used for scoring.
41
102
 
103
+ experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
104
+ but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
105
+ the first experiment created will be named `my-experiment-<unique-random-part>`.
106
+
42
107
  experiment_name: The name of the experiment associated with evaluation run.
43
108
  If None, a generated name will be used.
44
109
 
@@ -53,8 +118,16 @@ def evaluate(
53
118
  are mandatory in `task`-returned dictionary.
54
119
  If no value provided, the experiment won't have any scoring metrics.
55
120
 
121
+ scoring_functions: List of scorer functions to be executed during evaluation.
122
+ Each scorer function includes a scoring method that accepts predefined
123
+ arguments supplied by the evaluation engine:
124
+ • dataset_item — a dictionary containing the dataset item content,
125
+ • task_outputs — a dictionary containing the LLM task output.
126
+ • task_span - the data collected during the LLM task execution [optional].
127
+
56
128
  verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
57
- 0 - no outputs, 1 - outputs are enabled (default).
129
+ 0 - no outputs, 1 - outputs are enabled (default), 2 - outputs are enabled and detailed statistics
130
+ are displayed.
58
131
 
59
132
  nb_samples: number of samples to evaluate. If no value is provided, all samples in the dataset will be evaluated.
60
133
 
@@ -73,9 +146,20 @@ def evaluate(
73
146
  `{"input": "user_question"}` to map the "user_question" key to "input".
74
147
 
75
148
  dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
149
+
150
+ dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
151
+ If not provided, all samples in the dataset will be evaluated.
152
+
153
+ trial_count: number of times to run the task and evaluate the task output for every dataset item.
154
+
155
+ experiment_scoring_functions: List of callable functions that compute experiment-level scores.
156
+ Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
157
+ These scores are computed after all test results are collected and represent aggregate
158
+ metrics across the entire experiment.
76
159
  """
77
- if scoring_metrics is None:
78
- scoring_metrics = []
160
+ experiment_scoring_functions = (
161
+ [] if experiment_scoring_functions is None else experiment_scoring_functions
162
+ )
79
163
 
80
164
  checked_prompts = experiment_helpers.handle_prompt_args(
81
165
  prompt=prompt,
@@ -84,6 +168,11 @@ def evaluate(
84
168
 
85
169
  client = opik_client.get_client_cached()
86
170
 
171
+ experiment_name = _use_or_create_experiment_name(
172
+ experiment_name=experiment_name,
173
+ experiment_name_prefix=experiment_name_prefix,
174
+ )
175
+
87
176
  experiment = client.create_experiment(
88
177
  name=experiment_name,
89
178
  dataset_name=dataset.name,
@@ -91,6 +180,13 @@ def evaluate(
91
180
  prompts=checked_prompts,
92
181
  )
93
182
 
183
+ # wrap scoring functions if any
184
+ scoring_metrics = _wrap_scoring_functions(
185
+ scoring_functions=scoring_functions,
186
+ scoring_metrics=scoring_metrics,
187
+ project_name=project_name,
188
+ )
189
+
94
190
  return _evaluate_task(
95
191
  client=client,
96
192
  experiment=experiment,
@@ -103,6 +199,9 @@ def evaluate(
103
199
  task_threads=task_threads,
104
200
  scoring_key_mapping=scoring_key_mapping,
105
201
  dataset_item_ids=dataset_item_ids,
202
+ dataset_sampler=dataset_sampler,
203
+ trial_count=trial_count,
204
+ experiment_scoring_functions=experiment_scoring_functions,
106
205
  )
107
206
 
108
207
 
@@ -119,6 +218,9 @@ def _evaluate_task(
119
218
  task_threads: int,
120
219
  scoring_key_mapping: Optional[ScoringKeyMappingType],
121
220
  dataset_item_ids: Optional[List[str]],
221
+ dataset_sampler: Optional[samplers.BaseDatasetSampler],
222
+ trial_count: int,
223
+ experiment_scoring_functions: List[ExperimentScoreFunction],
122
224
  ) -> evaluation_result.EvaluationResult:
123
225
  start_time = time.time()
124
226
 
@@ -126,51 +228,82 @@ def _evaluate_task(
126
228
  evaluation_engine = engine.EvaluationEngine(
127
229
  client=client,
128
230
  project_name=project_name,
129
- experiment_=experiment,
130
231
  scoring_metrics=scoring_metrics,
131
232
  workers=task_threads,
132
233
  verbose=verbose,
133
234
  scoring_key_mapping=scoring_key_mapping,
134
235
  )
135
- test_results = evaluation_engine.evaluate_llm_tasks(
236
+ test_results = evaluation_engine.evaluate_llm_task_on_dataset(
136
237
  dataset_=dataset,
137
238
  task=task,
138
239
  nb_samples=nb_samples,
139
240
  dataset_item_ids=dataset_item_ids,
241
+ dataset_sampler=dataset_sampler,
242
+ trial_count=trial_count,
243
+ experiment_=experiment,
140
244
  )
141
245
 
142
246
  total_time = time.time() - start_time
143
247
 
144
- if verbose == 1:
145
- report.display_experiment_results(dataset.name, total_time, test_results)
248
+ # Compute experiment scores
249
+ computed_experiment_scores = _compute_experiment_scores(
250
+ experiment_scoring_functions=experiment_scoring_functions,
251
+ test_results=test_results,
252
+ )
146
253
 
147
- report.display_experiment_link(
254
+ if verbose >= 1:
255
+ report.display_experiment_results(
256
+ dataset.name, total_time, test_results, computed_experiment_scores
257
+ )
258
+
259
+ experiment_url = url_helpers.get_experiment_url_by_id(
148
260
  experiment_id=experiment.id,
149
261
  dataset_id=dataset.id,
150
262
  url_override=client.config.url_override,
151
263
  )
152
264
 
265
+ report.display_experiment_link(experiment_url=experiment_url)
266
+
153
267
  client.flush()
154
268
 
269
+ _try_notifying_about_experiment_completion(experiment)
270
+
271
+ # Log experiment scores to backend
272
+ if computed_experiment_scores:
273
+ experiment.log_experiment_scores(score_results=computed_experiment_scores)
274
+
155
275
  evaluation_result_ = evaluation_result.EvaluationResult(
156
276
  dataset_id=dataset.id,
157
277
  experiment_id=experiment.id,
158
278
  experiment_name=experiment.name,
159
279
  test_results=test_results,
280
+ experiment_url=experiment_url,
281
+ trial_count=trial_count,
282
+ experiment_scores=computed_experiment_scores,
160
283
  )
161
284
 
285
+ if verbose >= 2:
286
+ report.display_evaluation_scores_statistics(
287
+ dataset_name=dataset.name,
288
+ evaluation_results=evaluation_result_,
289
+ )
290
+
162
291
  return evaluation_result_
163
292
 
164
293
 
165
294
  def evaluate_experiment(
166
295
  experiment_name: str,
167
296
  scoring_metrics: List[base_metric.BaseMetric],
297
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
168
298
  scoring_threads: int = 16,
169
299
  verbose: int = 1,
170
300
  scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
171
301
  experiment_id: Optional[str] = None,
302
+ experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
172
303
  ) -> evaluation_result.EvaluationResult:
173
- """Update existing experiment with new evaluation metrics.
304
+ """Update the existing experiment with new evaluation metrics. You can use either `scoring_metrics` or `scorer_functions` to calculate
305
+ evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
306
+ to receive inputs and outputs from the task.
174
307
 
175
308
  Args:
176
309
  experiment_name: The name of the experiment to update.
@@ -181,15 +314,32 @@ def evaluate_experiment(
181
314
  of the `score` method in metrics that you need to find out which keys
182
315
  are mandatory in `task`-returned dictionary.
183
316
 
317
+ scoring_functions: List of scorer functions to be executed during evaluation.
318
+ Each scorer function includes a scoring method that accepts predefined
319
+ arguments supplied by the evaluation engine:
320
+ • dataset_item — a dictionary containing the dataset item content,
321
+ • task_outputs — a dictionary containing the LLM task output.
322
+ • task_span - the data collected during the LLM task execution [optional].
323
+
184
324
  scoring_threads: amount of thread workers to run scoring metrics.
185
325
 
186
326
  verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
187
327
 
188
328
  scoring_key_mapping: A dictionary that allows you to rename keys present in either the dataset item or the task output
189
- so that they match the keys expected by the scoring metrics. For example if you have a dataset item with the following content:
329
+ so that they match the keys expected by the scoring metrics. For example, if you have a dataset item with the following content:
190
330
  {"user_question": "What is Opik ?"} and a scoring metric that expects a key "input", you can use scoring_key_mapping
191
331
  `{"input": "user_question"}` to map the "user_question" key to "input".
332
+
333
+ experiment_id: The ID of the experiment to evaluate. If not provided, the experiment will be evaluated based on the experiment name.
334
+
335
+ experiment_scoring_functions: List of callable functions that compute experiment-level scores.
336
+ Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
337
+ These scores are computed after all test results are collected and represent aggregate
338
+ metrics across the entire experiment.
192
339
  """
340
+ experiment_scoring_functions = (
341
+ [] if experiment_scoring_functions is None else experiment_scoring_functions
342
+ )
193
343
  start_time = time.time()
194
344
 
195
345
  client = opik_client.get_client_cached()
@@ -202,10 +352,11 @@ def evaluate_experiment(
202
352
  client=client, experiment_name=experiment_name
203
353
  )
204
354
 
355
+ dataset_ = client.get_dataset(name=experiment.dataset_name)
356
+
205
357
  test_cases = rest_operations.get_experiment_test_cases(
206
- client=client,
207
- experiment_id=experiment.id,
208
- dataset_id=experiment.dataset_id,
358
+ experiment_=experiment,
359
+ dataset_=dataset_,
209
360
  scoring_key_mapping=scoring_key_mapping,
210
361
  )
211
362
  first_trace_id = test_cases[0].trace_id
@@ -213,11 +364,17 @@ def evaluate_experiment(
213
364
  client=client, trace_id=first_trace_id
214
365
  )
215
366
 
367
+ # wrap scoring functions if any
368
+ scoring_metrics = _wrap_scoring_functions(
369
+ scoring_functions=scoring_functions,
370
+ scoring_metrics=scoring_metrics,
371
+ project_name=project_name,
372
+ )
373
+
216
374
  with asyncio_support.async_http_connections_expire_immediately():
217
375
  evaluation_engine = engine.EvaluationEngine(
218
376
  client=client,
219
377
  project_name=project_name,
220
- experiment_=experiment,
221
378
  scoring_metrics=scoring_metrics,
222
379
  workers=scoring_threads,
223
380
  verbose=verbose,
@@ -229,50 +386,104 @@ def evaluate_experiment(
229
386
 
230
387
  total_time = time.time() - start_time
231
388
 
232
- if verbose == 1:
389
+ # Compute experiment scores
390
+ computed_experiment_scores = _compute_experiment_scores(
391
+ experiment_scoring_functions=experiment_scoring_functions,
392
+ test_results=test_results,
393
+ )
394
+
395
+ if verbose >= 1:
233
396
  report.display_experiment_results(
234
- experiment.dataset_name, total_time, test_results
397
+ dataset_.name,
398
+ total_time,
399
+ test_results,
400
+ computed_experiment_scores,
235
401
  )
236
402
 
237
- report.display_experiment_link(
238
- dataset_id=experiment.dataset_id,
403
+ experiment_url = url_helpers.get_experiment_url_by_id(
239
404
  experiment_id=experiment.id,
405
+ dataset_id=dataset_.id,
240
406
  url_override=client.config.url_override,
241
407
  )
242
408
 
409
+ report.display_experiment_link(experiment_url=experiment_url)
410
+
411
+ _try_notifying_about_experiment_completion(experiment)
412
+
413
+ # Log experiment scores to backend
414
+ if computed_experiment_scores:
415
+ experiment.log_experiment_scores(score_results=computed_experiment_scores)
416
+
243
417
  evaluation_result_ = evaluation_result.EvaluationResult(
244
- dataset_id=experiment.dataset_id,
418
+ dataset_id=dataset_.id,
245
419
  experiment_id=experiment.id,
246
420
  experiment_name=experiment.name,
247
421
  test_results=test_results,
422
+ experiment_url=experiment_url,
423
+ trial_count=1,
424
+ experiment_scores=computed_experiment_scores,
248
425
  )
249
426
 
427
+ if verbose >= 2:
428
+ report.display_evaluation_scores_statistics(
429
+ dataset_name=dataset_.name,
430
+ evaluation_results=evaluation_result_,
431
+ )
432
+
250
433
  return evaluation_result_
251
434
 
252
435
 
253
436
  def _build_prompt_evaluation_task(
254
437
  model: base_model.OpikBaseModel, messages: List[Dict[str, Any]]
255
438
  ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
256
- def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
257
- processed_messages = []
258
- for message in messages:
259
- processed_messages.append(
260
- {
261
- "role": message["role"],
262
- "content": prompt_template.PromptTemplate(
263
- message["content"],
264
- validate_placeholders=False,
265
- type=prompt_variables.get("type", "mustache"),
266
- ).format(**prompt_variables),
267
- }
268
- )
439
+ supported_modalities = cast(
440
+ prompt_types.SupportedModalities,
441
+ {
442
+ "vision": ModelCapabilities.supports_vision(
443
+ getattr(model, "model_name", None)
444
+ ),
445
+ "video": ModelCapabilities.supports_video(
446
+ getattr(model, "model_name", None)
447
+ ),
448
+ },
449
+ )
450
+ # Disable placeholder validation since we pass all dataset item fields to format()
451
+ chat_prompt_template_ = chat_prompt_template.ChatPromptTemplate(
452
+ messages=messages, validate_placeholders=False
453
+ )
269
454
 
270
- llm_output = model.generate_provider_response(messages=processed_messages)
455
+ required_modalities = chat_prompt_template_.required_modalities()
456
+ unsupported_modalities = {
457
+ modality
458
+ for modality in required_modalities
459
+ if not supported_modalities.get(modality, False)
460
+ }
461
+
462
+ if unsupported_modalities:
463
+ modalities_list = ", ".join(sorted(unsupported_modalities))
464
+ LOGGER.warning(
465
+ "Model '%s' does not support %s content. Multimedia parts will be flattened "
466
+ "to text placeholders. See %s for supported models and customization options.",
467
+ getattr(model, "model_name", "unknown"),
468
+ modalities_list,
469
+ MODALITY_SUPPORT_DOC_URL,
470
+ )
271
471
 
272
- return {
273
- "input": processed_messages,
274
- "output": llm_output.choices[0].message.content,
275
- }
472
+ def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
473
+ template_type_override = prompt_variables.get("type")
474
+ processed_messages = chat_prompt_template_.format(
475
+ variables=prompt_variables,
476
+ supported_modalities=supported_modalities,
477
+ template_type=template_type_override,
478
+ )
479
+
480
+ with base_model.get_provider_response(
481
+ model_provider=model, messages=processed_messages
482
+ ) as llm_output:
483
+ return {
484
+ "input": processed_messages,
485
+ "output": llm_output.choices[0].message.content,
486
+ }
276
487
 
277
488
  return _prompt_evaluation_task
278
489
 
@@ -282,14 +493,19 @@ def evaluate_prompt(
  messages: List[Dict[str, Any]],
  model: Optional[Union[str, base_model.OpikBaseModel]] = None,
  scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+ experiment_name_prefix: Optional[str] = None,
  experiment_name: Optional[str] = None,
  project_name: Optional[str] = None,
  experiment_config: Optional[Dict[str, Any]] = None,
  verbose: int = 1,
  nb_samples: Optional[int] = None,
  task_threads: int = 16,
- prompt: Optional[Prompt] = None,
+ prompt: Optional[base_prompt.BasePrompt] = None,
  dataset_item_ids: Optional[List[str]] = None,
+ dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
+ trial_count: int = 1,
+ experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
  ) -> evaluation_result.EvaluationResult:
  """
  Performs prompt evaluation on a given dataset.
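The signature changes above add `scoring_functions`, `experiment_name_prefix`, `dataset_sampler`, `trial_count`, and `experiment_scoring_functions`, and widen `prompt` to `base_prompt.BasePrompt`. A hedged usage sketch of the new parameters follows, assuming `evaluate_prompt` is still exported from `opik.evaluation` as in earlier releases and that a dataset named `qa-dataset` with a `question` column already exists (both names are placeholders).

```python
import opik
from opik.evaluation import evaluate_prompt

client = opik.Opik()
dataset = client.get_dataset(name="qa-dataset")  # placeholder dataset name

result = evaluate_prompt(
    dataset=dataset,
    messages=[{"role": "user", "content": "Answer concisely: {{question}}"}],
    model="gpt-4o-mini",                   # placeholder model name
    experiment_name_prefix="prompt-eval",  # names become prompt-eval-<unique-random-part>
    trial_count=3,                         # run the prompt three times per dataset item
)
print(result.experiment_url)  # field added to EvaluationResult in this release
```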
@@ -304,6 +520,17 @@ def evaluate_prompt(
  scoring_metrics: List of metrics to calculate during evaluation.
  The LLM input and output will be passed as arguments to each metric `score(...)` method.

+ scoring_functions: List of scorer functions to be executed during evaluation.
+ Each scorer function includes a scoring method that accepts predefined
+ arguments supplied by the evaluation engine:
+ • dataset_item - a dictionary containing the dataset item content,
+ • task_outputs - a dictionary containing the LLM task output,
+ • task_span - the data collected during the LLM task execution (optional).
+
+ experiment_name_prefix: The prefix added to automatically generated experiment names to make them unique
+ but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+ the first experiment created will be named `my-experiment-<unique-random-part>`.
+
  experiment_name: name of the experiment.

  project_name: The name of the project to log data
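For the `scoring_functions` parameter documented in the hunk above, here is a minimal sketch of a scorer keyed on `dataset_item` and `task_outputs`. It assumes `ScoreResult` from `opik.evaluation.metrics.score_result` as the return type and that the callable can be passed via `scoring_functions=[contains_expected_answer]`; whether a bare callable must first be wrapped in a `ScorerFunction` object is not shown in this diff.

```python
from typing import Any, Dict

from opik.evaluation.metrics import score_result


# Hypothetical scorer: 1.0 when the task output contains the expected answer.
# Parameter names mirror the arguments supplied by the evaluation engine.
def contains_expected_answer(
    dataset_item: Dict[str, Any], task_outputs: Dict[str, Any]
) -> score_result.ScoreResult:
    expected = str(dataset_item.get("expected_output", "")).strip().lower()
    output = str(task_outputs.get("output", "")).lower()
    return score_result.ScoreResult(
        name="contains_expected_answer",
        value=1.0 if expected and expected in output else 0.0,
    )
```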
@@ -319,28 +546,48 @@ def evaluate_prompt(
  prompt: Prompt object to link with experiment.

  dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
+
+ dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
+ If not provided, all samples in the dataset will be evaluated.
+
+ trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+ experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+ Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+ These scores are computed after all test results are collected and represent aggregate
+ metrics across the entire experiment.
  """
+ experiment_scoring_functions = (
+ [] if experiment_scoring_functions is None else experiment_scoring_functions
+ )
  if isinstance(model, str):
- model = models_factory.get(model_name=model)
+ opik_model = models_factory.get(model_name=model)
  elif not isinstance(model, base_model.OpikBaseModel):
  raise ValueError("`model` must be either a string or an OpikBaseModel instance")
+ else:
+ opik_model = model

  if experiment_config is None:
- experiment_config = {"prompt_template": messages, "model": model.model_name}
+ experiment_config = {
+ "prompt_template": messages,
+ "model": opik_model.model_name,
+ }
  else:
  if "prompt_template" not in experiment_config:
  experiment_config["prompt_template"] = messages

  if "model" not in experiment_config:
- experiment_config["model"] = model.model_name
-
- if scoring_metrics is None:
- scoring_metrics = []
+ experiment_config["model"] = opik_model.model_name

  client = opik_client.get_client_cached()

  prompts = [prompt] if prompt else None

+ experiment_name = _use_or_create_experiment_name(
+ experiment_name=experiment_name,
+ experiment_name_prefix=experiment_name_prefix,
+ )
+
  experiment = client.create_experiment(
  name=experiment_name,
  dataset_name=dataset.name,
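For `experiment_scoring_functions` (a list of callables from `TestResult` objects to `ScoreResult` objects, per the docstring in the hunk above), here is a sketch of an experiment-level pass-rate score. It assumes `TestResult` still lives in `opik.evaluation.test_result` and exposes a `score_results` list, as in earlier releases.

```python
from typing import List

from opik.evaluation import test_result
from opik.evaluation.metrics import score_result


# Hypothetical experiment-level score: fraction of test results whose individual
# scores are all >= 0.5. Passed as experiment_scoring_functions=[pass_rate].
def pass_rate(results: List[test_result.TestResult]) -> List[score_result.ScoreResult]:
    if not results:
        return [score_result.ScoreResult(name="pass_rate", value=0.0)]
    passed = sum(
        1
        for result in results
        if result.score_results and all(score.value >= 0.5 for score in result.score_results)
    )
    return [score_result.ScoreResult(name="pass_rate", value=passed / len(results))]
```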
@@ -348,45 +595,79 @@ def evaluate_prompt(
  prompts=prompts,
  )

+ # wrap scoring functions if any
+ scoring_metrics = _wrap_scoring_functions(
+ scoring_functions=scoring_functions,
+ scoring_metrics=scoring_metrics,
+ project_name=project_name,
+ )
+
  start_time = time.time()

  with asyncio_support.async_http_connections_expire_immediately():
  evaluation_engine = engine.EvaluationEngine(
  client=client,
  project_name=project_name,
- experiment_=experiment,
  scoring_metrics=scoring_metrics,
  workers=task_threads,
  verbose=verbose,
  scoring_key_mapping=None,
  )
- test_results = evaluation_engine.evaluate_llm_tasks(
+ test_results = evaluation_engine.evaluate_llm_task_on_dataset(
  dataset_=dataset,
- task=_build_prompt_evaluation_task(model=model, messages=messages),
+ task=_build_prompt_evaluation_task(model=opik_model, messages=messages),
  nb_samples=nb_samples,
  dataset_item_ids=dataset_item_ids,
+ dataset_sampler=dataset_sampler,
+ trial_count=trial_count,
+ experiment_=experiment,
  )

  total_time = time.time() - start_time

- if verbose == 1:
- report.display_experiment_results(dataset.name, total_time, test_results)
+ # Compute experiment scores
+ computed_experiment_scores = _compute_experiment_scores(
+ experiment_scoring_functions=experiment_scoring_functions,
+ test_results=test_results,
+ )
+
+ if verbose >= 1:
+ report.display_experiment_results(
+ dataset.name, total_time, test_results, computed_experiment_scores
+ )

- report.display_experiment_link(
+ experiment_url = url_helpers.get_experiment_url_by_id(
  experiment_id=experiment.id,
  dataset_id=dataset.id,
  url_override=client.config.url_override,
  )

+ report.display_experiment_link(experiment_url=experiment_url)
+
  client.flush()

+ _try_notifying_about_experiment_completion(experiment)
+
+ # Log experiment scores to backend
+ if computed_experiment_scores:
+ experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
  evaluation_result_ = evaluation_result.EvaluationResult(
  experiment_id=experiment.id,
  dataset_id=dataset.id,
  experiment_name=experiment.name,
  test_results=test_results,
+ experiment_url=experiment_url,
+ trial_count=trial_count,
+ experiment_scores=computed_experiment_scores,
  )

+ if verbose >= 2:
+ report.display_evaluation_scores_statistics(
+ dataset_name=dataset.name,
+ evaluation_results=evaluation_result_,
+ )
+
  return evaluation_result_


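The hunk above threads three new fields into the returned `EvaluationResult` (`experiment_url`, `trial_count`, `experiment_scores`). A small helper sketch for inspecting them; the names are taken from the constructor call above, and attribute-style access is assumed.

```python
from opik.evaluation import evaluation_result


# Prints the fields added to EvaluationResult in this release; attribute names
# mirror the constructor keyword arguments in the hunk above (assumed to be
# accessible as attributes).
def summarize(result: evaluation_result.EvaluationResult) -> None:
    print(f"experiment: {result.experiment_name} ({result.experiment_url})")
    print(f"trials per dataset item: {result.trial_count}")
    for score in result.experiment_scores or []:
        print(f"  {score.name}: {score.value}")
```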
@@ -395,16 +676,21 @@ def evaluate_optimization_trial(
  dataset: dataset.Dataset,
  task: LLMTask,
  scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+ experiment_name_prefix: Optional[str] = None,
  experiment_name: Optional[str] = None,
  project_name: Optional[str] = None,
  experiment_config: Optional[Dict[str, Any]] = None,
  verbose: int = 1,
  nb_samples: Optional[int] = None,
  task_threads: int = 16,
- prompt: Optional[Prompt] = None,
- prompts: Optional[List[Prompt]] = None,
+ prompt: Optional[base_prompt.BasePrompt] = None,
+ prompts: Optional[List[base_prompt.BasePrompt]] = None,
  scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
  dataset_item_ids: Optional[List[str]] = None,
+ dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
+ trial_count: int = 1,
+ experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
  ) -> evaluation_result.EvaluationResult:
  """
  Performs task evaluation on a given dataset.
@@ -417,6 +703,17 @@ def evaluate_optimization_trial(
  task: A callable object that takes dict with dataset item content
  as input and returns dict which will later be used for scoring.

+ scoring_functions: List of scorer functions to be executed during evaluation.
+ Each scorer function includes a scoring method that accepts predefined
+ arguments supplied by the evaluation engine:
+ • dataset_item - a dictionary containing the dataset item content,
+ • task_outputs - a dictionary containing the LLM task output,
+ • task_span - the data collected during the LLM task execution (optional).
+
+ experiment_name_prefix: The prefix added to automatically generated experiment names to make them unique
+ but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+ the first experiment created will be named `my-experiment-<unique-random-part>`.
+
  experiment_name: The name of the experiment associated with evaluation run.
  If None, a generated name will be used.

@@ -451,7 +748,21 @@ def evaluate_optimization_trial(
  `{"input": "user_question"}` to map the "user_question" key to "input".

  dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
+
+ dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
+ If not provided, all samples in the dataset will be evaluated.
+
+ trial_count: number of times to execute the task and evaluate the LLM output for every dataset item.
+
+ experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+ Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+ These scores are computed after all test results are collected and represent aggregate
+ metrics across the entire experiment.
  """
+ experiment_scoring_functions = (
+ [] if experiment_scoring_functions is None else experiment_scoring_functions
+ )
+
  if scoring_metrics is None:
  scoring_metrics = []

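The `dataset_sampler` hook is typed as `samplers.BaseDatasetSampler`, whose abstract interface is not shown in this diff. The sketch below illustrates the idea with a plain class; the `sample` method name is an assumption, and a real implementation should subclass `BaseDatasetSampler` from the package's samplers module instead.

```python
import random
from typing import Any, Dict, List


# Illustrative only: keep a random 10% of dataset items before evaluation.
# The `sample` method name is assumed; check the BaseDatasetSampler contract
# shipped with the package before relying on it.
class TenPercentSampler:
    def sample(self, dataset_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        keep = max(1, len(dataset_items) // 10)
        return random.sample(dataset_items, keep)
```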
@@ -460,8 +771,20 @@ def evaluate_optimization_trial(
  prompts=prompts,
  )

+ # wrap scoring functions if any
+ scoring_metrics = _wrap_scoring_functions(
+ scoring_functions=scoring_functions,
+ scoring_metrics=scoring_metrics,
+ project_name=project_name,
+ )
+
  client = opik_client.get_client_cached()

+ experiment_name = _use_or_create_experiment_name(
+ experiment_name=experiment_name,
+ experiment_name_prefix=experiment_name_prefix,
+ )
+
  experiment = client.create_experiment(
  name=experiment_name,
  dataset_name=dataset.name,
@@ -483,4 +806,152 @@
  task_threads=task_threads,
  scoring_key_mapping=scoring_key_mapping,
  dataset_item_ids=dataset_item_ids,
+ dataset_sampler=dataset_sampler,
+ trial_count=trial_count,
+ experiment_scoring_functions=experiment_scoring_functions,
  )
+
+
+ def evaluate_on_dict_items(
+ items: List[Dict[str, Any]],
+ task: LLMTask,
+ scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+ project_name: Optional[str] = None,
+ verbose: int = 0,
+ scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
+ scoring_threads: int = 16,
+ ) -> evaluation_result.EvaluationResultOnDictItems:
+ """
+ Lightweight evaluation function that evaluates a task on dataset items (as dictionaries)
+ without requiring a Dataset object or creating an experiment.
+
+ This function is useful for optimization scenarios where you need to evaluate many
+ candidate solutions quickly using Opik's metric infrastructure. It creates traces for
+ tracking but doesn't require experiment setup or dataset management.
+
+ Args:
+ items: List of dataset item contents (dictionaries with the data to evaluate).
+
+ task: A callable object that takes dict with dataset item content
+ as input and returns dict which will later be used for scoring.
+
+ scoring_metrics: List of metrics to calculate during evaluation.
+ Each metric's `score(...)` method will be called with arguments taken from
+ the dataset item and task output.
+
+ scoring_functions: List of scorer functions to be executed during evaluation.
+ Each scorer function accepts predefined arguments:
+ • dataset_item - a dictionary containing the dataset item content,
+ • task_outputs - a dictionary containing the LLM task output.
+
+ project_name: The name of the project for logging traces.
+
+ verbose: Controls evaluation output logs and progress bars.
+ 0 - no outputs (default), 1 - enable outputs.
+
+ scoring_key_mapping: A dictionary that allows you to rename keys present in either
+ the dataset item or the task output to match the keys expected by scoring metrics.
+
+ scoring_threads: Number of thread workers to run scoring metrics.
+
+ Returns:
+ EvaluationResultOnDictItems object containing test results and providing methods
+ to aggregate scores, similar to the regular evaluation result.
+
+ Example:
+ ```python
+ import opik
+ from opik.evaluation.metrics import Equals
+
+ items = [
+ {"input": "What is 2+2?", "expected_output": "4"},
+ {"input": "What is 3+3?", "expected_output": "6"},
+ ]
+
+ def my_task(item):
+ # Your LLM call here
+ question = item["input"]
+ # ... call model ...
+ return {"output": model_output}
+
+ result = opik.evaluate_on_dict_items(
+ items=items,
+ task=my_task,
+ scoring_metrics=[Equals()],
+ scoring_key_mapping={"reference": "expected_output"},
+ )
+
+ # Access individual test results
+ for test_result in result.test_results:
+ print(f"Score: {test_result.score_results[0].value}")
+
+ # Get aggregated statistics
+ aggregated = result.aggregate_evaluation_scores()
+ print(f"Mean equals score: {aggregated['equals_metric'].mean}")
+ ```
+ """
+ # Wrap scoring functions if any
+ scoring_metrics = _wrap_scoring_functions(
+ scoring_functions=scoring_functions,
+ scoring_metrics=scoring_metrics,
+ project_name=project_name,
+ )
+
+ if not scoring_metrics:
+ LOGGER.warning("No scoring metrics provided for items evaluation")
+ return evaluation_result.EvaluationResultOnDictItems(test_results=[])
+
+ client = opik_client.get_client_cached()
+
+ # Create evaluation engine
+ with asyncio_support.async_http_connections_expire_immediately():
+ evaluation_engine = engine.EvaluationEngine(
+ client=client,
+ project_name=project_name,
+ scoring_metrics=scoring_metrics,
+ workers=scoring_threads,
+ verbose=verbose,
+ scoring_key_mapping=scoring_key_mapping,
+ )
+
+ # Use the new evaluate_llm_task_on_dict_items method
+ test_results = evaluation_engine.evaluate_llm_task_on_dict_items(
+ items=items,
+ task=task,
+ )
+
+ return evaluation_result.EvaluationResultOnDictItems(
+ test_results=test_results,
+ )
+
+
+ def _wrap_scoring_functions(
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]],
+ scoring_metrics: Optional[List[base_metric.BaseMetric]],
+ project_name: Optional[str],
+ ) -> List[base_metric.BaseMetric]:
+ if scoring_functions:
+ function_metrics = scorer_wrapper_metric.wrap_scorer_functions(
+ scoring_functions, project_name=project_name
+ )
+ if scoring_metrics:
+ scoring_metrics.extend(function_metrics)
+ else:
+ scoring_metrics = function_metrics
+
+ return scoring_metrics if scoring_metrics else []
+
+
+ def _use_or_create_experiment_name(
+ experiment_name: Optional[str], experiment_name_prefix: Optional[str]
+ ) -> Optional[str]:
+ if experiment_name:
+ return experiment_name
+
+ if experiment_name_prefix:
+ return experiment_helpers.generate_unique_experiment_name(
+ experiment_name_prefix
+ )
+ else:
+ return None
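The naming precedence implemented by `_use_or_create_experiment_name` above: an explicit `experiment_name` wins, otherwise `experiment_name_prefix` triggers unique-name generation, otherwise `None` defers naming to experiment creation. A tiny check of that behavior, assuming the helper lives in `opik.evaluation.evaluator` (the module path is a guess based on this diff, and the helper is private, so this is for illustration only):

```python
# Hypothetical import path for a private helper; illustrative only.
from opik.evaluation.evaluator import _use_or_create_experiment_name

assert _use_or_create_experiment_name("exp-7", "my-prefix") == "exp-7"             # explicit name wins
assert _use_or_create_experiment_name(None, None) is None                          # defer to auto-naming
assert _use_or_create_experiment_name(None, "my-prefix").startswith("my-prefix")   # prefix + unique suffix
```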