opik 1.6.4__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1021)
  1. opik/__init__.py +33 -2
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/__init__.py +5 -0
  9. opik/api_objects/attachment/attachment.py +20 -0
  10. opik/api_objects/attachment/attachment_context.py +36 -0
  11. opik/api_objects/attachment/attachments_extractor.py +153 -0
  12. opik/api_objects/attachment/client.py +220 -0
  13. opik/api_objects/attachment/converters.py +51 -0
  14. opik/api_objects/attachment/decoder.py +18 -0
  15. opik/api_objects/attachment/decoder_base64.py +83 -0
  16. opik/api_objects/attachment/decoder_helpers.py +137 -0
  17. opik/api_objects/conversation/__init__.py +0 -0
  18. opik/api_objects/conversation/conversation_factory.py +43 -0
  19. opik/api_objects/conversation/conversation_thread.py +49 -0
  20. opik/api_objects/data_helpers.py +79 -0
  21. opik/api_objects/dataset/dataset.py +107 -45
  22. opik/api_objects/dataset/rest_operations.py +12 -3
  23. opik/api_objects/experiment/experiment.py +81 -45
  24. opik/api_objects/experiment/experiment_item.py +2 -1
  25. opik/api_objects/experiment/experiments_client.py +64 -0
  26. opik/api_objects/experiment/helpers.py +35 -11
  27. opik/api_objects/experiment/rest_operations.py +88 -19
  28. opik/api_objects/helpers.py +104 -7
  29. opik/api_objects/local_recording.py +81 -0
  30. opik/api_objects/opik_client.py +872 -174
  31. opik/api_objects/opik_query_language.py +136 -18
  32. opik/api_objects/optimization/__init__.py +3 -0
  33. opik/api_objects/optimization/optimization.py +39 -0
  34. opik/api_objects/prompt/__init__.py +13 -1
  35. opik/api_objects/prompt/base_prompt.py +69 -0
  36. opik/api_objects/prompt/base_prompt_template.py +29 -0
  37. opik/api_objects/prompt/chat/__init__.py +1 -0
  38. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  39. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  40. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  41. opik/api_objects/prompt/client.py +193 -41
  42. opik/api_objects/prompt/text/__init__.py +1 -0
  43. opik/api_objects/prompt/text/prompt.py +174 -0
  44. opik/api_objects/prompt/text/prompt_template.py +55 -0
  45. opik/api_objects/prompt/types.py +29 -0
  46. opik/api_objects/rest_stream_parser.py +98 -0
  47. opik/api_objects/search_helpers.py +89 -0
  48. opik/api_objects/span/span_client.py +165 -45
  49. opik/api_objects/span/span_data.py +136 -25
  50. opik/api_objects/threads/__init__.py +0 -0
  51. opik/api_objects/threads/threads_client.py +185 -0
  52. opik/api_objects/trace/trace_client.py +72 -36
  53. opik/api_objects/trace/trace_data.py +112 -26
  54. opik/api_objects/validation_helpers.py +3 -3
  55. opik/cli/__init__.py +5 -0
  56. opik/cli/__main__.py +6 -0
  57. opik/cli/configure.py +66 -0
  58. opik/cli/exports/__init__.py +131 -0
  59. opik/cli/exports/dataset.py +278 -0
  60. opik/cli/exports/experiment.py +784 -0
  61. opik/cli/exports/project.py +685 -0
  62. opik/cli/exports/prompt.py +578 -0
  63. opik/cli/exports/utils.py +406 -0
  64. opik/cli/harbor.py +39 -0
  65. opik/cli/healthcheck.py +21 -0
  66. opik/cli/imports/__init__.py +439 -0
  67. opik/cli/imports/dataset.py +143 -0
  68. opik/cli/imports/experiment.py +1192 -0
  69. opik/cli/imports/project.py +262 -0
  70. opik/cli/imports/prompt.py +177 -0
  71. opik/cli/imports/utils.py +280 -0
  72. opik/cli/main.py +49 -0
  73. opik/cli/proxy.py +93 -0
  74. opik/cli/usage_report/__init__.py +16 -0
  75. opik/cli/usage_report/charts.py +783 -0
  76. opik/cli/usage_report/cli.py +274 -0
  77. opik/cli/usage_report/constants.py +9 -0
  78. opik/cli/usage_report/extraction.py +749 -0
  79. opik/cli/usage_report/pdf.py +244 -0
  80. opik/cli/usage_report/statistics.py +78 -0
  81. opik/cli/usage_report/utils.py +235 -0
  82. opik/config.py +62 -4
  83. opik/configurator/configure.py +45 -6
  84. opik/configurator/opik_rest_helpers.py +4 -1
  85. opik/context_storage.py +164 -65
  86. opik/datetime_helpers.py +12 -0
  87. opik/decorator/arguments_helpers.py +9 -1
  88. opik/decorator/base_track_decorator.py +298 -146
  89. opik/decorator/context_manager/__init__.py +0 -0
  90. opik/decorator/context_manager/span_context_manager.py +123 -0
  91. opik/decorator/context_manager/trace_context_manager.py +84 -0
  92. opik/decorator/generator_wrappers.py +3 -2
  93. opik/decorator/inspect_helpers.py +11 -0
  94. opik/decorator/opik_args/__init__.py +13 -0
  95. opik/decorator/opik_args/api_classes.py +71 -0
  96. opik/decorator/opik_args/helpers.py +120 -0
  97. opik/decorator/span_creation_handler.py +49 -21
  98. opik/decorator/tracker.py +9 -1
  99. opik/dict_utils.py +3 -3
  100. opik/environment.py +13 -1
  101. opik/error_tracking/api.py +1 -1
  102. opik/error_tracking/before_send.py +6 -5
  103. opik/error_tracking/environment_details.py +29 -7
  104. opik/error_tracking/error_filtering/filter_by_response_status_code.py +42 -0
  105. opik/error_tracking/error_filtering/filter_chain_builder.py +14 -3
  106. opik/evaluation/__init__.py +14 -2
  107. opik/evaluation/engine/engine.py +280 -82
  108. opik/evaluation/engine/evaluation_tasks_executor.py +15 -10
  109. opik/evaluation/engine/helpers.py +34 -9
  110. opik/evaluation/engine/metrics_evaluator.py +237 -0
  111. opik/evaluation/engine/types.py +5 -4
  112. opik/evaluation/evaluation_result.py +169 -2
  113. opik/evaluation/evaluator.py +659 -58
  114. opik/evaluation/metrics/__init__.py +121 -6
  115. opik/evaluation/metrics/aggregated_metric.py +92 -0
  116. opik/evaluation/metrics/arguments_helpers.py +15 -21
  117. opik/evaluation/metrics/arguments_validator.py +38 -0
  118. opik/evaluation/metrics/base_metric.py +20 -10
  119. opik/evaluation/metrics/conversation/__init__.py +48 -0
  120. opik/evaluation/metrics/conversation/conversation_thread_metric.py +79 -0
  121. opik/evaluation/metrics/conversation/conversation_turns_factory.py +39 -0
  122. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  123. opik/evaluation/metrics/conversation/helpers.py +84 -0
  124. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  125. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  126. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  127. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  128. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  129. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  130. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  131. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/__init__.py +0 -0
  132. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +274 -0
  133. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/schema.py +16 -0
  134. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/templates.py +95 -0
  135. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  136. opik/evaluation/metrics/conversation/llm_judges/session_completeness/__init__.py +0 -0
  137. opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +295 -0
  138. opik/evaluation/metrics/conversation/llm_judges/session_completeness/schema.py +22 -0
  139. opik/evaluation/metrics/conversation/llm_judges/session_completeness/templates.py +139 -0
  140. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  141. opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +277 -0
  142. opik/evaluation/metrics/conversation/llm_judges/user_frustration/schema.py +16 -0
  143. opik/evaluation/metrics/conversation/llm_judges/user_frustration/templates.py +135 -0
  144. opik/evaluation/metrics/conversation/types.py +34 -0
  145. opik/evaluation/metrics/conversation_types.py +9 -0
  146. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  147. opik/evaluation/metrics/heuristics/bleu.py +43 -16
  148. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  149. opik/evaluation/metrics/heuristics/contains.py +50 -11
  150. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  151. opik/evaluation/metrics/heuristics/equals.py +4 -1
  152. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  153. opik/evaluation/metrics/heuristics/is_json.py +9 -3
  154. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  155. opik/evaluation/metrics/heuristics/levenshtein_ratio.py +6 -5
  156. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  157. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  158. opik/evaluation/metrics/heuristics/readability.py +129 -0
  159. opik/evaluation/metrics/heuristics/regex_match.py +4 -1
  160. opik/evaluation/metrics/heuristics/rouge.py +148 -0
  161. opik/evaluation/metrics/heuristics/sentiment.py +98 -0
  162. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  163. opik/evaluation/metrics/heuristics/tone.py +155 -0
  164. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  165. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +27 -30
  166. opik/evaluation/metrics/llm_judges/answer_relevance/parser.py +27 -0
  167. opik/evaluation/metrics/llm_judges/answer_relevance/templates.py +10 -10
  168. opik/evaluation/metrics/llm_judges/context_precision/metric.py +28 -31
  169. opik/evaluation/metrics/llm_judges/context_precision/parser.py +27 -0
  170. opik/evaluation/metrics/llm_judges/context_precision/template.py +7 -7
  171. opik/evaluation/metrics/llm_judges/context_recall/metric.py +27 -31
  172. opik/evaluation/metrics/llm_judges/context_recall/parser.py +27 -0
  173. opik/evaluation/metrics/llm_judges/context_recall/template.py +7 -7
  174. opik/evaluation/metrics/llm_judges/factuality/metric.py +7 -26
  175. opik/evaluation/metrics/llm_judges/factuality/parser.py +35 -0
  176. opik/evaluation/metrics/llm_judges/factuality/template.py +1 -1
  177. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  178. opik/evaluation/metrics/llm_judges/g_eval/metric.py +244 -113
  179. opik/evaluation/metrics/llm_judges/g_eval/parser.py +161 -0
  180. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  181. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  182. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  183. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  184. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  185. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  186. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  187. opik/evaluation/metrics/llm_judges/hallucination/metric.py +23 -27
  188. opik/evaluation/metrics/llm_judges/hallucination/parser.py +29 -0
  189. opik/evaluation/metrics/llm_judges/hallucination/template.py +2 -4
  190. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  191. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  192. opik/evaluation/metrics/llm_judges/moderation/metric.py +23 -28
  193. opik/evaluation/metrics/llm_judges/moderation/parser.py +27 -0
  194. opik/evaluation/metrics/llm_judges/moderation/template.py +2 -2
  195. opik/evaluation/metrics/llm_judges/parsing_helpers.py +26 -0
  196. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  197. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  198. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  199. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  200. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  201. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  202. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  203. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  204. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  205. opik/evaluation/metrics/llm_judges/trajectory_accuracy/__init__.py +3 -0
  206. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +171 -0
  207. opik/evaluation/metrics/llm_judges/trajectory_accuracy/parser.py +38 -0
  208. opik/evaluation/metrics/llm_judges/trajectory_accuracy/templates.py +65 -0
  209. opik/evaluation/metrics/llm_judges/usefulness/metric.py +23 -32
  210. opik/evaluation/metrics/llm_judges/usefulness/parser.py +28 -0
  211. opik/evaluation/metrics/ragas_metric.py +112 -0
  212. opik/evaluation/models/__init__.py +10 -0
  213. opik/evaluation/models/base_model.py +140 -18
  214. opik/evaluation/models/langchain/__init__.py +3 -0
  215. opik/evaluation/models/langchain/langchain_chat_model.py +166 -0
  216. opik/evaluation/models/langchain/message_converters.py +106 -0
  217. opik/evaluation/models/langchain/opik_monitoring.py +23 -0
  218. opik/evaluation/models/litellm/litellm_chat_model.py +186 -40
  219. opik/evaluation/models/litellm/opik_monitor.py +24 -21
  220. opik/evaluation/models/litellm/util.py +125 -0
  221. opik/evaluation/models/litellm/warning_filters.py +16 -4
  222. opik/evaluation/models/model_capabilities.py +187 -0
  223. opik/evaluation/models/models_factory.py +25 -3
  224. opik/evaluation/preprocessing.py +92 -0
  225. opik/evaluation/report.py +70 -12
  226. opik/evaluation/rest_operations.py +49 -45
  227. opik/evaluation/samplers/__init__.py +4 -0
  228. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  229. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  230. opik/evaluation/score_statistics.py +66 -0
  231. opik/evaluation/scorers/__init__.py +4 -0
  232. opik/evaluation/scorers/scorer_function.py +55 -0
  233. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  234. opik/evaluation/test_case.py +3 -2
  235. opik/evaluation/test_result.py +1 -0
  236. opik/evaluation/threads/__init__.py +0 -0
  237. opik/evaluation/threads/context_helper.py +32 -0
  238. opik/evaluation/threads/evaluation_engine.py +181 -0
  239. opik/evaluation/threads/evaluation_result.py +18 -0
  240. opik/evaluation/threads/evaluator.py +120 -0
  241. opik/evaluation/threads/helpers.py +51 -0
  242. opik/evaluation/types.py +9 -1
  243. opik/exceptions.py +116 -3
  244. opik/file_upload/__init__.py +0 -0
  245. opik/file_upload/base_upload_manager.py +39 -0
  246. opik/file_upload/file_upload_monitor.py +14 -0
  247. opik/file_upload/file_uploader.py +141 -0
  248. opik/file_upload/mime_type.py +9 -0
  249. opik/file_upload/s3_multipart_upload/__init__.py +0 -0
  250. opik/file_upload/s3_multipart_upload/file_parts_strategy.py +89 -0
  251. opik/file_upload/s3_multipart_upload/s3_file_uploader.py +86 -0
  252. opik/file_upload/s3_multipart_upload/s3_upload_error.py +29 -0
  253. opik/file_upload/thread_pool.py +17 -0
  254. opik/file_upload/upload_client.py +114 -0
  255. opik/file_upload/upload_manager.py +255 -0
  256. opik/file_upload/upload_options.py +37 -0
  257. opik/format_helpers.py +17 -0
  258. opik/guardrails/__init__.py +4 -0
  259. opik/guardrails/guardrail.py +157 -0
  260. opik/guardrails/guards/__init__.py +5 -0
  261. opik/guardrails/guards/guard.py +17 -0
  262. opik/guardrails/guards/pii.py +47 -0
  263. opik/guardrails/guards/topic.py +76 -0
  264. opik/guardrails/rest_api_client.py +34 -0
  265. opik/guardrails/schemas.py +24 -0
  266. opik/guardrails/tracing.py +61 -0
  267. opik/healthcheck/__init__.py +2 -1
  268. opik/healthcheck/checks.py +2 -2
  269. opik/healthcheck/rich_representation.py +1 -1
  270. opik/hooks/__init__.py +23 -0
  271. opik/hooks/anonymizer_hook.py +36 -0
  272. opik/hooks/httpx_client_hook.py +112 -0
  273. opik/httpx_client.py +75 -4
  274. opik/id_helpers.py +18 -0
  275. opik/integrations/adk/__init__.py +14 -0
  276. opik/integrations/adk/callback_context_info_extractors.py +32 -0
  277. opik/integrations/adk/graph/__init__.py +0 -0
  278. opik/integrations/adk/graph/mermaid_graph_builder.py +128 -0
  279. opik/integrations/adk/graph/nodes.py +101 -0
  280. opik/integrations/adk/graph/subgraph_edges_builders.py +41 -0
  281. opik/integrations/adk/helpers.py +48 -0
  282. opik/integrations/adk/legacy_opik_tracer.py +381 -0
  283. opik/integrations/adk/opik_tracer.py +370 -0
  284. opik/integrations/adk/patchers/__init__.py +4 -0
  285. opik/integrations/adk/patchers/adk_otel_tracer/__init__.py +0 -0
  286. opik/integrations/adk/patchers/adk_otel_tracer/llm_span_helpers.py +30 -0
  287. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +201 -0
  288. opik/integrations/adk/patchers/litellm_wrappers.py +91 -0
  289. opik/integrations/adk/patchers/llm_response_wrapper.py +105 -0
  290. opik/integrations/adk/patchers/patchers.py +64 -0
  291. opik/integrations/adk/recursive_callback_injector.py +126 -0
  292. opik/integrations/aisuite/aisuite_decorator.py +8 -3
  293. opik/integrations/aisuite/opik_tracker.py +1 -0
  294. opik/integrations/anthropic/messages_create_decorator.py +8 -3
  295. opik/integrations/anthropic/opik_tracker.py +0 -1
  296. opik/integrations/bedrock/converse/__init__.py +0 -0
  297. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  298. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +18 -8
  299. opik/integrations/bedrock/invoke_agent_decorator.py +12 -7
  300. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  301. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  302. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  303. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  304. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  305. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  306. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  307. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  308. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  309. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  310. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  311. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  312. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  313. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  314. opik/integrations/bedrock/opik_tracker.py +43 -4
  315. opik/integrations/bedrock/types.py +19 -0
  316. opik/integrations/crewai/crewai_decorator.py +34 -56
  317. opik/integrations/crewai/opik_tracker.py +31 -10
  318. opik/integrations/crewai/patchers/__init__.py +5 -0
  319. opik/integrations/crewai/patchers/flow.py +118 -0
  320. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  321. opik/integrations/crewai/patchers/llm_client.py +207 -0
  322. opik/integrations/dspy/callback.py +246 -84
  323. opik/integrations/dspy/graph.py +88 -0
  324. opik/integrations/dspy/parsers.py +168 -0
  325. opik/integrations/genai/encoder_extension.py +2 -6
  326. opik/integrations/genai/generate_content_decorator.py +20 -13
  327. opik/integrations/guardrails/guardrails_decorator.py +4 -0
  328. opik/integrations/harbor/__init__.py +17 -0
  329. opik/integrations/harbor/experiment_service.py +269 -0
  330. opik/integrations/harbor/opik_tracker.py +528 -0
  331. opik/integrations/haystack/constants.py +35 -0
  332. opik/integrations/haystack/converters.py +1 -2
  333. opik/integrations/haystack/opik_connector.py +28 -6
  334. opik/integrations/haystack/opik_span_bridge.py +284 -0
  335. opik/integrations/haystack/opik_tracer.py +124 -222
  336. opik/integrations/langchain/__init__.py +3 -1
  337. opik/integrations/langchain/helpers.py +96 -0
  338. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  339. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  340. opik/integrations/langchain/opik_encoder_extension.py +2 -2
  341. opik/integrations/langchain/opik_tracer.py +641 -206
  342. opik/integrations/langchain/provider_usage_extractors/__init__.py +5 -0
  343. opik/integrations/langchain/provider_usage_extractors/anthropic_usage_extractor.py +101 -0
  344. opik/integrations/langchain/provider_usage_extractors/anthropic_vertexai_usage_extractor.py +67 -0
  345. opik/integrations/langchain/provider_usage_extractors/bedrock_usage_extractor.py +94 -0
  346. opik/integrations/langchain/provider_usage_extractors/google_generative_ai_usage_extractor.py +109 -0
  347. opik/integrations/langchain/provider_usage_extractors/groq_usage_extractor.py +92 -0
  348. opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/__init__.py +15 -0
  349. opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +134 -0
  350. opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/langchain_usage.py +163 -0
  351. opik/integrations/langchain/provider_usage_extractors/openai_usage_extractor.py +124 -0
  352. opik/integrations/langchain/provider_usage_extractors/provider_usage_extractor_protocol.py +29 -0
  353. opik/integrations/langchain/provider_usage_extractors/usage_extractor.py +48 -0
  354. opik/integrations/langchain/provider_usage_extractors/vertexai_usage_extractor.py +109 -0
  355. opik/integrations/litellm/__init__.py +5 -0
  356. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  357. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  358. opik/integrations/litellm/opik_tracker.py +43 -0
  359. opik/integrations/litellm/stream_patchers.py +151 -0
  360. opik/integrations/llama_index/callback.py +179 -78
  361. opik/integrations/llama_index/event_parsing_utils.py +29 -9
  362. opik/integrations/openai/agents/opik_tracing_processor.py +204 -32
  363. opik/integrations/openai/agents/span_data_parsers.py +15 -6
  364. opik/integrations/openai/chat_completion_chunks_aggregator.py +1 -1
  365. opik/integrations/openai/{openai_decorator.py → openai_chat_completions_decorator.py} +45 -35
  366. opik/integrations/openai/openai_responses_decorator.py +158 -0
  367. opik/integrations/openai/opik_tracker.py +94 -13
  368. opik/integrations/openai/response_events_aggregator.py +36 -0
  369. opik/integrations/openai/stream_patchers.py +125 -15
  370. opik/integrations/sagemaker/auth.py +5 -1
  371. opik/jsonable_encoder.py +29 -1
  372. opik/llm_usage/base_original_provider_usage.py +15 -8
  373. opik/llm_usage/bedrock_usage.py +8 -2
  374. opik/llm_usage/google_usage.py +6 -1
  375. opik/llm_usage/llm_usage_info.py +6 -0
  376. opik/llm_usage/{openai_usage.py → openai_chat_completions_usage.py} +2 -12
  377. opik/llm_usage/{openai_agent_usage.py → openai_responses_usage.py} +7 -15
  378. opik/llm_usage/opik_usage.py +36 -10
  379. opik/llm_usage/opik_usage_factory.py +35 -19
  380. opik/logging_messages.py +19 -7
  381. opik/message_processing/arguments_utils.py +22 -0
  382. opik/message_processing/batching/base_batcher.py +45 -17
  383. opik/message_processing/batching/batch_manager.py +22 -10
  384. opik/message_processing/batching/batch_manager_constuctors.py +36 -11
  385. opik/message_processing/batching/batchers.py +167 -44
  386. opik/message_processing/batching/flushing_thread.py +0 -3
  387. opik/message_processing/batching/sequence_splitter.py +50 -5
  388. opik/message_processing/emulation/__init__.py +0 -0
  389. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  390. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  391. opik/message_processing/emulation/models.py +162 -0
  392. opik/message_processing/encoder_helpers.py +79 -0
  393. opik/message_processing/message_queue.py +79 -0
  394. opik/message_processing/messages.py +154 -12
  395. opik/message_processing/preprocessing/__init__.py +0 -0
  396. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  397. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  398. opik/message_processing/preprocessing/constants.py +1 -0
  399. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  400. opik/message_processing/preprocessing/preprocessor.py +36 -0
  401. opik/message_processing/processors/__init__.py +0 -0
  402. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  403. opik/message_processing/processors/message_processors.py +92 -0
  404. opik/message_processing/processors/message_processors_chain.py +96 -0
  405. opik/message_processing/processors/online_message_processor.py +324 -0
  406. opik/message_processing/queue_consumer.py +61 -13
  407. opik/message_processing/streamer.py +102 -31
  408. opik/message_processing/streamer_constructors.py +67 -12
  409. opik/opik_context.py +103 -11
  410. opik/plugins/pytest/decorator.py +2 -2
  411. opik/plugins/pytest/experiment_runner.py +3 -2
  412. opik/plugins/pytest/hooks.py +6 -4
  413. opik/rate_limit/__init__.py +0 -0
  414. opik/rate_limit/rate_limit.py +25 -0
  415. opik/rest_api/__init__.py +643 -11
  416. opik/rest_api/alerts/__init__.py +7 -0
  417. opik/rest_api/alerts/client.py +667 -0
  418. opik/rest_api/alerts/raw_client.py +1015 -0
  419. opik/rest_api/alerts/types/__init__.py +7 -0
  420. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  421. opik/rest_api/annotation_queues/__init__.py +4 -0
  422. opik/rest_api/annotation_queues/client.py +668 -0
  423. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  424. opik/rest_api/attachments/__init__.py +17 -0
  425. opik/rest_api/attachments/client.py +752 -0
  426. opik/rest_api/attachments/raw_client.py +1125 -0
  427. opik/rest_api/attachments/types/__init__.py +15 -0
  428. opik/rest_api/attachments/types/attachment_list_request_entity_type.py +5 -0
  429. opik/rest_api/attachments/types/download_attachment_request_entity_type.py +5 -0
  430. opik/rest_api/attachments/types/start_multipart_upload_request_entity_type.py +5 -0
  431. opik/rest_api/attachments/types/upload_attachment_request_entity_type.py +5 -0
  432. opik/rest_api/automation_rule_evaluators/__init__.py +2 -0
  433. opik/rest_api/automation_rule_evaluators/client.py +182 -1162
  434. opik/rest_api/automation_rule_evaluators/raw_client.py +598 -0
  435. opik/rest_api/chat_completions/__init__.py +2 -0
  436. opik/rest_api/chat_completions/client.py +115 -149
  437. opik/rest_api/chat_completions/raw_client.py +339 -0
  438. opik/rest_api/check/__init__.py +2 -0
  439. opik/rest_api/check/client.py +88 -106
  440. opik/rest_api/check/raw_client.py +258 -0
  441. opik/rest_api/client.py +112 -212
  442. opik/rest_api/core/__init__.py +5 -0
  443. opik/rest_api/core/api_error.py +12 -6
  444. opik/rest_api/core/client_wrapper.py +4 -14
  445. opik/rest_api/core/datetime_utils.py +1 -3
  446. opik/rest_api/core/file.py +2 -5
  447. opik/rest_api/core/http_client.py +42 -120
  448. opik/rest_api/core/http_response.py +55 -0
  449. opik/rest_api/core/jsonable_encoder.py +1 -4
  450. opik/rest_api/core/pydantic_utilities.py +79 -147
  451. opik/rest_api/core/query_encoder.py +1 -3
  452. opik/rest_api/core/serialization.py +10 -10
  453. opik/rest_api/dashboards/__init__.py +4 -0
  454. opik/rest_api/dashboards/client.py +462 -0
  455. opik/rest_api/dashboards/raw_client.py +648 -0
  456. opik/rest_api/datasets/__init__.py +5 -0
  457. opik/rest_api/datasets/client.py +1638 -1091
  458. opik/rest_api/datasets/raw_client.py +3389 -0
  459. opik/rest_api/datasets/types/__init__.py +8 -0
  460. opik/rest_api/datasets/types/dataset_update_visibility.py +5 -0
  461. opik/rest_api/datasets/types/dataset_write_visibility.py +5 -0
  462. opik/rest_api/errors/__init__.py +2 -0
  463. opik/rest_api/errors/bad_request_error.py +4 -3
  464. opik/rest_api/errors/conflict_error.py +4 -3
  465. opik/rest_api/errors/forbidden_error.py +4 -2
  466. opik/rest_api/errors/not_found_error.py +4 -3
  467. opik/rest_api/errors/not_implemented_error.py +4 -3
  468. opik/rest_api/errors/unauthorized_error.py +4 -3
  469. opik/rest_api/errors/unprocessable_entity_error.py +4 -3
  470. opik/rest_api/experiments/__init__.py +5 -0
  471. opik/rest_api/experiments/client.py +676 -752
  472. opik/rest_api/experiments/raw_client.py +1872 -0
  473. opik/rest_api/experiments/types/__init__.py +10 -0
  474. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  475. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  476. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  477. opik/rest_api/experiments/types/experiment_write_type.py +5 -0
  478. opik/rest_api/feedback_definitions/__init__.py +2 -0
  479. opik/rest_api/feedback_definitions/client.py +96 -370
  480. opik/rest_api/feedback_definitions/raw_client.py +541 -0
  481. opik/rest_api/feedback_definitions/types/__init__.py +2 -0
  482. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -3
  483. opik/rest_api/guardrails/__init__.py +4 -0
  484. opik/rest_api/guardrails/client.py +104 -0
  485. opik/rest_api/guardrails/raw_client.py +102 -0
  486. opik/rest_api/llm_provider_key/__init__.py +2 -0
  487. opik/rest_api/llm_provider_key/client.py +166 -440
  488. opik/rest_api/llm_provider_key/raw_client.py +643 -0
  489. opik/rest_api/llm_provider_key/types/__init__.py +2 -0
  490. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  491. opik/rest_api/manual_evaluation/__init__.py +4 -0
  492. opik/rest_api/manual_evaluation/client.py +347 -0
  493. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  494. opik/rest_api/open_telemetry_ingestion/__init__.py +2 -0
  495. opik/rest_api/open_telemetry_ingestion/client.py +38 -63
  496. opik/rest_api/open_telemetry_ingestion/raw_client.py +88 -0
  497. opik/rest_api/optimizations/__init__.py +7 -0
  498. opik/rest_api/optimizations/client.py +704 -0
  499. opik/rest_api/optimizations/raw_client.py +920 -0
  500. opik/rest_api/optimizations/types/__init__.py +7 -0
  501. opik/rest_api/optimizations/types/optimization_update_status.py +7 -0
  502. opik/rest_api/projects/__init__.py +10 -1
  503. opik/rest_api/projects/client.py +180 -855
  504. opik/rest_api/projects/raw_client.py +1216 -0
  505. opik/rest_api/projects/types/__init__.py +11 -4
  506. opik/rest_api/projects/types/project_metric_request_public_interval.py +1 -3
  507. opik/rest_api/projects/types/project_metric_request_public_metric_type.py +11 -1
  508. opik/rest_api/projects/types/project_update_visibility.py +5 -0
  509. opik/rest_api/projects/types/project_write_visibility.py +5 -0
  510. opik/rest_api/prompts/__init__.py +4 -2
  511. opik/rest_api/prompts/client.py +381 -970
  512. opik/rest_api/prompts/raw_client.py +1634 -0
  513. opik/rest_api/prompts/types/__init__.py +5 -1
  514. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  515. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  516. opik/rest_api/raw_client.py +156 -0
  517. opik/rest_api/redirect/__init__.py +4 -0
  518. opik/rest_api/redirect/client.py +375 -0
  519. opik/rest_api/redirect/raw_client.py +566 -0
  520. opik/rest_api/service_toggles/__init__.py +4 -0
  521. opik/rest_api/service_toggles/client.py +91 -0
  522. opik/rest_api/service_toggles/raw_client.py +93 -0
  523. opik/rest_api/spans/__init__.py +2 -0
  524. opik/rest_api/spans/client.py +659 -1354
  525. opik/rest_api/spans/raw_client.py +2383 -0
  526. opik/rest_api/spans/types/__init__.py +2 -0
  527. opik/rest_api/spans/types/find_feedback_score_names_1_request_type.py +1 -3
  528. opik/rest_api/spans/types/get_span_stats_request_type.py +1 -3
  529. opik/rest_api/spans/types/get_spans_by_project_request_type.py +1 -3
  530. opik/rest_api/spans/types/span_search_stream_request_public_type.py +1 -3
  531. opik/rest_api/system_usage/__init__.py +2 -0
  532. opik/rest_api/system_usage/client.py +157 -216
  533. opik/rest_api/system_usage/raw_client.py +455 -0
  534. opik/rest_api/traces/__init__.py +2 -0
  535. opik/rest_api/traces/client.py +2102 -1625
  536. opik/rest_api/traces/raw_client.py +4144 -0
  537. opik/rest_api/types/__init__.py +629 -24
  538. opik/rest_api/types/aggregation_data.py +27 -0
  539. opik/rest_api/types/alert.py +33 -0
  540. opik/rest_api/types/alert_alert_type.py +5 -0
  541. opik/rest_api/types/alert_page_public.py +24 -0
  542. opik/rest_api/types/alert_public.py +33 -0
  543. opik/rest_api/types/alert_public_alert_type.py +5 -0
  544. opik/rest_api/types/alert_trigger.py +27 -0
  545. opik/rest_api/types/alert_trigger_config.py +28 -0
  546. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  547. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  548. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  549. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  550. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  551. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  552. opik/rest_api/types/alert_trigger_public.py +27 -0
  553. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  554. opik/rest_api/types/alert_trigger_write.py +23 -0
  555. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  556. opik/rest_api/types/alert_write.py +28 -0
  557. opik/rest_api/types/alert_write_alert_type.py +5 -0
  558. opik/rest_api/types/annotation_queue.py +42 -0
  559. opik/rest_api/types/annotation_queue_batch.py +27 -0
  560. opik/rest_api/types/{json_schema_element.py → annotation_queue_item_ids.py} +5 -7
  561. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  562. opik/rest_api/types/annotation_queue_public.py +38 -0
  563. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  564. opik/rest_api/types/{workspace_metadata.py → annotation_queue_reviewer.py} +6 -7
  565. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  566. opik/rest_api/types/annotation_queue_scope.py +5 -0
  567. opik/rest_api/types/annotation_queue_write.py +31 -0
  568. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  569. opik/rest_api/types/assistant_message.py +7 -8
  570. opik/rest_api/types/assistant_message_role.py +1 -3
  571. opik/rest_api/types/attachment.py +22 -0
  572. opik/rest_api/types/attachment_page.py +28 -0
  573. opik/rest_api/types/audio_url.py +19 -0
  574. opik/rest_api/types/audio_url_public.py +19 -0
  575. opik/rest_api/types/audio_url_write.py +19 -0
  576. opik/rest_api/types/automation_rule_evaluator.py +160 -0
  577. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +6 -6
  578. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +6 -6
  579. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +6 -6
  580. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  581. opik/rest_api/types/automation_rule_evaluator_page_public.py +6 -6
  582. opik/rest_api/types/automation_rule_evaluator_public.py +155 -0
  583. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  584. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  585. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  586. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  587. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  588. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  589. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +22 -0
  590. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +22 -0
  591. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +22 -0
  592. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +22 -0
  593. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +22 -0
  594. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +22 -0
  595. opik/rest_api/types/automation_rule_evaluator_update.py +143 -0
  596. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +6 -6
  597. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  598. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  599. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +22 -0
  600. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +22 -0
  601. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +6 -6
  602. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +6 -6
  603. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +6 -6
  604. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +6 -6
  605. opik/rest_api/types/automation_rule_evaluator_write.py +143 -0
  606. opik/rest_api/types/avg_value_stat_public.py +3 -5
  607. opik/rest_api/types/batch_delete.py +3 -5
  608. opik/rest_api/types/batch_delete_by_project.py +20 -0
  609. opik/rest_api/types/bi_information.py +3 -5
  610. opik/rest_api/types/bi_information_response.py +4 -6
  611. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  612. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  613. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  614. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  615. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  616. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  617. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  618. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  619. opik/rest_api/types/categorical_feedback_definition.py +5 -7
  620. opik/rest_api/types/categorical_feedback_definition_create.py +4 -6
  621. opik/rest_api/types/categorical_feedback_definition_public.py +5 -7
  622. opik/rest_api/types/categorical_feedback_definition_update.py +4 -6
  623. opik/rest_api/types/categorical_feedback_detail.py +3 -5
  624. opik/rest_api/types/categorical_feedback_detail_create.py +3 -5
  625. opik/rest_api/types/categorical_feedback_detail_public.py +3 -5
  626. opik/rest_api/types/categorical_feedback_detail_update.py +3 -5
  627. opik/rest_api/types/chat_completion_choice.py +4 -6
  628. opik/rest_api/types/chat_completion_response.py +5 -6
  629. opik/rest_api/types/check.py +22 -0
  630. opik/rest_api/types/{json_node_compare.py → check_name.py} +1 -1
  631. opik/rest_api/types/check_public.py +22 -0
  632. opik/rest_api/types/check_public_name.py +5 -0
  633. opik/rest_api/types/check_public_result.py +5 -0
  634. opik/rest_api/types/check_result.py +5 -0
  635. opik/rest_api/types/chunked_output_json_node.py +4 -6
  636. opik/rest_api/types/chunked_output_json_node_public.py +4 -6
  637. opik/rest_api/types/chunked_output_json_node_public_type.py +6 -10
  638. opik/rest_api/types/chunked_output_json_node_type.py +6 -10
  639. opik/rest_api/types/column.py +8 -10
  640. opik/rest_api/types/column_compare.py +8 -10
  641. opik/rest_api/types/column_public.py +8 -10
  642. opik/rest_api/types/column_types_item.py +1 -3
  643. opik/rest_api/types/comment.py +4 -6
  644. opik/rest_api/types/comment_compare.py +4 -6
  645. opik/rest_api/types/comment_public.py +4 -6
  646. opik/rest_api/types/complete_multipart_upload_request.py +33 -0
  647. opik/rest_api/types/complete_multipart_upload_request_entity_type.py +5 -0
  648. opik/rest_api/types/completion_tokens_details.py +3 -5
  649. opik/rest_api/types/count_value_stat_public.py +3 -5
  650. opik/rest_api/types/dashboard_page_public.py +24 -0
  651. opik/rest_api/types/dashboard_public.py +30 -0
  652. opik/rest_api/types/data_point_double.py +21 -0
  653. opik/rest_api/types/data_point_number_public.py +3 -5
  654. opik/rest_api/types/dataset.py +14 -6
  655. opik/rest_api/types/dataset_expansion.py +42 -0
  656. opik/rest_api/types/dataset_expansion_response.py +39 -0
  657. opik/rest_api/types/dataset_item.py +9 -8
  658. opik/rest_api/types/dataset_item_batch.py +3 -5
  659. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  660. opik/rest_api/types/dataset_item_compare.py +9 -8
  661. opik/rest_api/types/dataset_item_compare_source.py +1 -3
  662. opik/rest_api/types/dataset_item_filter.py +27 -0
  663. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  664. opik/rest_api/types/dataset_item_page_compare.py +10 -7
  665. opik/rest_api/types/dataset_item_page_public.py +10 -7
  666. opik/rest_api/types/dataset_item_public.py +9 -8
  667. opik/rest_api/types/dataset_item_public_source.py +1 -3
  668. opik/rest_api/types/dataset_item_source.py +1 -3
  669. opik/rest_api/types/dataset_item_update.py +39 -0
  670. opik/rest_api/types/dataset_item_write.py +5 -6
  671. opik/rest_api/types/dataset_item_write_source.py +1 -3
  672. opik/rest_api/types/dataset_page_public.py +9 -6
  673. opik/rest_api/types/dataset_public.py +14 -6
  674. opik/rest_api/types/dataset_public_status.py +5 -0
  675. opik/rest_api/types/dataset_public_visibility.py +5 -0
  676. opik/rest_api/types/dataset_status.py +5 -0
  677. opik/rest_api/types/dataset_version_diff.py +22 -0
  678. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  679. opik/rest_api/types/dataset_version_page_public.py +23 -0
  680. opik/rest_api/types/dataset_version_public.py +59 -0
  681. opik/rest_api/types/dataset_version_summary.py +46 -0
  682. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  683. opik/rest_api/types/dataset_visibility.py +5 -0
  684. opik/rest_api/types/delete_attachments_request.py +23 -0
  685. opik/rest_api/types/delete_attachments_request_entity_type.py +5 -0
  686. opik/rest_api/types/delete_feedback_score.py +4 -5
  687. opik/rest_api/types/delete_ids_holder.py +19 -0
  688. opik/rest_api/types/delta.py +7 -9
  689. opik/rest_api/types/error_count_with_deviation.py +21 -0
  690. opik/rest_api/types/error_count_with_deviation_detailed.py +21 -0
  691. opik/rest_api/types/error_info.py +3 -5
  692. opik/rest_api/types/error_info_experiment_item_bulk_write_view.py +21 -0
  693. opik/rest_api/types/error_info_public.py +3 -5
  694. opik/rest_api/types/error_info_write.py +3 -5
  695. opik/rest_api/types/error_message.py +3 -5
  696. opik/rest_api/types/error_message_detail.py +3 -5
  697. opik/rest_api/types/error_message_detailed.py +3 -5
  698. opik/rest_api/types/error_message_public.py +3 -5
  699. opik/rest_api/types/experiment.py +21 -10
  700. opik/rest_api/types/experiment_group_aggregations_response.py +20 -0
  701. opik/rest_api/types/experiment_group_response.py +22 -0
  702. opik/rest_api/types/experiment_item.py +14 -11
  703. opik/rest_api/types/experiment_item_bulk_record.py +27 -0
  704. opik/rest_api/types/experiment_item_bulk_record_experiment_item_bulk_write_view.py +27 -0
  705. opik/rest_api/types/experiment_item_bulk_upload.py +27 -0
  706. opik/rest_api/types/experiment_item_compare.py +14 -11
  707. opik/rest_api/types/experiment_item_compare_trace_visibility_mode.py +5 -0
  708. opik/rest_api/types/experiment_item_public.py +6 -6
  709. opik/rest_api/types/experiment_item_public_trace_visibility_mode.py +5 -0
  710. opik/rest_api/types/experiment_item_trace_visibility_mode.py +5 -0
  711. opik/rest_api/types/experiment_page_public.py +9 -6
  712. opik/rest_api/types/experiment_public.py +21 -10
  713. opik/rest_api/types/experiment_public_status.py +5 -0
  714. opik/rest_api/types/experiment_public_type.py +5 -0
  715. opik/rest_api/types/experiment_score.py +20 -0
  716. opik/rest_api/types/experiment_score_public.py +20 -0
  717. opik/rest_api/types/experiment_score_write.py +20 -0
  718. opik/rest_api/types/experiment_status.py +5 -0
  719. opik/rest_api/types/experiment_type.py +5 -0
  720. opik/rest_api/types/export_trace_service_request.py +5 -0
  721. opik/rest_api/types/feedback.py +40 -27
  722. opik/rest_api/types/feedback_create.py +27 -13
  723. opik/rest_api/types/feedback_definition_page_public.py +4 -6
  724. opik/rest_api/types/feedback_object_public.py +40 -27
  725. opik/rest_api/types/feedback_public.py +40 -27
  726. opik/rest_api/types/feedback_score.py +7 -7
  727. opik/rest_api/types/feedback_score_average.py +3 -5
  728. opik/rest_api/types/feedback_score_average_detailed.py +3 -5
  729. opik/rest_api/types/feedback_score_average_public.py +3 -5
  730. opik/rest_api/types/feedback_score_batch.py +4 -6
  731. opik/rest_api/types/feedback_score_batch_item.py +6 -6
  732. opik/rest_api/types/feedback_score_batch_item_source.py +1 -3
  733. opik/rest_api/types/feedback_score_batch_item_thread.py +32 -0
  734. opik/rest_api/types/feedback_score_batch_item_thread_source.py +5 -0
  735. opik/rest_api/types/feedback_score_compare.py +7 -7
  736. opik/rest_api/types/feedback_score_compare_source.py +1 -3
  737. opik/rest_api/types/feedback_score_experiment_item_bulk_write_view.py +31 -0
  738. opik/rest_api/types/feedback_score_experiment_item_bulk_write_view_source.py +5 -0
  739. opik/rest_api/types/feedback_score_names.py +4 -6
  740. opik/rest_api/types/feedback_score_public.py +11 -7
  741. opik/rest_api/types/feedback_score_public_source.py +1 -3
  742. opik/rest_api/types/feedback_score_source.py +1 -3
  743. opik/rest_api/types/feedback_update.py +27 -13
  744. opik/rest_api/types/function.py +4 -7
  745. opik/rest_api/types/function_call.py +3 -5
  746. opik/rest_api/types/group_content.py +19 -0
  747. opik/rest_api/types/group_content_with_aggregations.py +21 -0
  748. opik/rest_api/types/group_detail.py +19 -0
  749. opik/rest_api/types/group_details.py +20 -0
  750. opik/rest_api/types/guardrail.py +34 -0
  751. opik/rest_api/types/guardrail_batch.py +20 -0
  752. opik/rest_api/types/guardrail_name.py +5 -0
  753. opik/rest_api/types/guardrail_result.py +5 -0
  754. opik/rest_api/types/guardrail_write.py +33 -0
  755. opik/rest_api/types/guardrail_write_name.py +5 -0
  756. opik/rest_api/types/guardrail_write_result.py +5 -0
  757. opik/rest_api/types/guardrails_validation.py +21 -0
  758. opik/rest_api/types/guardrails_validation_public.py +21 -0
  759. opik/rest_api/types/ids_holder.py +19 -0
  760. opik/rest_api/types/image_url.py +20 -0
  761. opik/rest_api/types/image_url_public.py +20 -0
  762. opik/rest_api/types/image_url_write.py +20 -0
  763. opik/rest_api/types/json_list_string.py +7 -0
  764. opik/rest_api/types/json_list_string_compare.py +7 -0
  765. opik/rest_api/types/json_list_string_experiment_item_bulk_write_view.py +7 -0
  766. opik/rest_api/types/json_list_string_public.py +7 -0
  767. opik/rest_api/types/json_list_string_write.py +7 -0
  768. opik/rest_api/types/json_schema.py +5 -8
  769. opik/rest_api/types/llm_as_judge_code.py +8 -12
  770. opik/rest_api/types/llm_as_judge_code_public.py +8 -12
  771. opik/rest_api/types/llm_as_judge_code_write.py +8 -12
  772. opik/rest_api/types/llm_as_judge_message.py +9 -7
  773. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  774. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  775. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  776. opik/rest_api/types/llm_as_judge_message_public.py +9 -7
  777. opik/rest_api/types/llm_as_judge_message_public_role.py +1 -1
  778. opik/rest_api/types/llm_as_judge_message_role.py +1 -1
  779. opik/rest_api/types/llm_as_judge_message_write.py +9 -7
  780. opik/rest_api/types/llm_as_judge_message_write_role.py +1 -1
  781. opik/rest_api/types/llm_as_judge_model_parameters.py +6 -5
  782. opik/rest_api/types/llm_as_judge_model_parameters_public.py +6 -5
  783. opik/rest_api/types/llm_as_judge_model_parameters_write.py +6 -5
  784. opik/rest_api/types/llm_as_judge_output_schema.py +4 -6
  785. opik/rest_api/types/llm_as_judge_output_schema_public.py +4 -6
  786. opik/rest_api/types/llm_as_judge_output_schema_public_type.py +1 -3
  787. opik/rest_api/types/llm_as_judge_output_schema_type.py +1 -3
  788. opik/rest_api/types/llm_as_judge_output_schema_write.py +4 -6
  789. opik/rest_api/types/llm_as_judge_output_schema_write_type.py +1 -3
  790. opik/rest_api/types/log_item.py +5 -7
  791. opik/rest_api/types/log_item_level.py +1 -3
  792. opik/rest_api/types/log_page.py +4 -6
  793. opik/rest_api/types/manual_evaluation_request.py +38 -0
  794. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  795. opik/rest_api/types/manual_evaluation_response.py +27 -0
  796. opik/rest_api/types/multipart_upload_part.py +20 -0
  797. opik/rest_api/types/numerical_feedback_definition.py +5 -7
  798. opik/rest_api/types/numerical_feedback_definition_create.py +4 -6
  799. opik/rest_api/types/numerical_feedback_definition_public.py +5 -7
  800. opik/rest_api/types/numerical_feedback_definition_update.py +4 -6
  801. opik/rest_api/types/numerical_feedback_detail.py +3 -5
  802. opik/rest_api/types/numerical_feedback_detail_create.py +3 -5
  803. opik/rest_api/types/numerical_feedback_detail_public.py +3 -5
  804. opik/rest_api/types/numerical_feedback_detail_update.py +3 -5
  805. opik/rest_api/types/optimization.py +37 -0
  806. opik/rest_api/types/optimization_page_public.py +28 -0
  807. opik/rest_api/types/optimization_public.py +37 -0
  808. opik/rest_api/types/optimization_public_status.py +7 -0
  809. opik/rest_api/types/optimization_status.py +7 -0
  810. opik/rest_api/types/optimization_studio_config.py +27 -0
  811. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  812. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  813. opik/rest_api/types/optimization_studio_log.py +22 -0
  814. opik/rest_api/types/optimization_write.py +30 -0
  815. opik/rest_api/types/optimization_write_status.py +7 -0
  816. opik/rest_api/types/page_columns.py +4 -6
  817. opik/rest_api/types/percentage_value_stat_public.py +4 -6
  818. opik/rest_api/types/percentage_values.py +8 -16
  819. opik/rest_api/types/percentage_values_detailed.py +8 -16
  820. opik/rest_api/types/percentage_values_public.py +8 -16
  821. opik/rest_api/types/project.py +12 -7
  822. opik/rest_api/types/project_detailed.py +12 -7
  823. opik/rest_api/types/project_detailed_visibility.py +5 -0
  824. opik/rest_api/types/project_metric_response_public.py +5 -9
  825. opik/rest_api/types/project_metric_response_public_interval.py +1 -3
  826. opik/rest_api/types/project_metric_response_public_metric_type.py +11 -1
  827. opik/rest_api/types/project_page_public.py +8 -10
  828. opik/rest_api/types/project_public.py +6 -6
  829. opik/rest_api/types/project_public_visibility.py +5 -0
  830. opik/rest_api/types/project_reference.py +31 -0
  831. opik/rest_api/types/project_reference_public.py +31 -0
  832. opik/rest_api/types/project_stat_item_object_public.py +8 -17
  833. opik/rest_api/types/project_stats_public.py +4 -6
  834. opik/rest_api/types/project_stats_summary.py +4 -6
  835. opik/rest_api/types/project_stats_summary_item.py +9 -6
  836. opik/rest_api/types/project_visibility.py +5 -0
  837. opik/rest_api/types/prompt.py +12 -7
  838. opik/rest_api/types/prompt_detail.py +12 -7
  839. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  840. opik/rest_api/types/prompt_page_public.py +9 -6
  841. opik/rest_api/types/prompt_public.py +11 -6
  842. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  843. opik/rest_api/types/prompt_template_structure.py +5 -0
  844. opik/rest_api/types/prompt_tokens_details.py +19 -0
  845. opik/rest_api/types/prompt_version.py +7 -6
  846. opik/rest_api/types/prompt_version_detail.py +7 -6
  847. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  848. opik/rest_api/types/prompt_version_link.py +4 -5
  849. opik/rest_api/types/prompt_version_link_public.py +4 -5
  850. opik/rest_api/types/prompt_version_link_write.py +3 -5
  851. opik/rest_api/types/prompt_version_page_public.py +9 -6
  852. opik/rest_api/types/prompt_version_public.py +7 -6
  853. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  854. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  855. opik/rest_api/types/prompt_version_update.py +33 -0
  856. opik/rest_api/types/provider_api_key.py +18 -8
  857. opik/rest_api/types/provider_api_key_page_public.py +27 -0
  858. opik/rest_api/types/provider_api_key_provider.py +1 -1
  859. opik/rest_api/types/provider_api_key_public.py +18 -8
  860. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  861. opik/rest_api/types/response_format.py +5 -7
  862. opik/rest_api/types/response_format_type.py +1 -3
  863. opik/rest_api/types/result.py +21 -0
  864. opik/rest_api/types/results_number_public.py +4 -6
  865. opik/rest_api/types/score_name.py +4 -5
  866. opik/rest_api/types/service_toggles_config.py +44 -0
  867. opik/rest_api/types/span.py +13 -15
  868. opik/rest_api/types/span_batch.py +4 -6
  869. opik/rest_api/types/span_enrichment_options.py +31 -0
  870. opik/rest_api/types/span_experiment_item_bulk_write_view.py +39 -0
  871. opik/rest_api/types/span_experiment_item_bulk_write_view_type.py +5 -0
  872. opik/rest_api/types/span_filter.py +23 -0
  873. opik/rest_api/types/span_filter_operator.py +21 -0
  874. opik/rest_api/types/span_filter_public.py +4 -6
  875. opik/rest_api/types/span_filter_public_operator.py +2 -0
  876. opik/rest_api/types/span_filter_write.py +23 -0
  877. opik/rest_api/types/span_filter_write_operator.py +21 -0
  878. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  879. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  880. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  881. opik/rest_api/types/span_page_public.py +9 -6
  882. opik/rest_api/types/span_public.py +19 -16
  883. opik/rest_api/types/span_public_type.py +1 -1
  884. opik/rest_api/types/span_type.py +1 -1
  885. opik/rest_api/types/span_update.py +46 -0
  886. opik/rest_api/types/span_update_type.py +5 -0
  887. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  888. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  889. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  890. opik/rest_api/types/span_write.py +13 -14
  891. opik/rest_api/types/span_write_type.py +1 -1
  892. opik/rest_api/types/spans_count_response.py +20 -0
  893. opik/rest_api/types/start_multipart_upload_response.py +20 -0
  894. opik/rest_api/types/stream_options.py +3 -5
  895. opik/rest_api/types/studio_evaluation.py +20 -0
  896. opik/rest_api/types/studio_evaluation_public.py +20 -0
  897. opik/rest_api/types/studio_evaluation_write.py +20 -0
  898. opik/rest_api/types/studio_llm_model.py +21 -0
  899. opik/rest_api/types/studio_llm_model_public.py +21 -0
  900. opik/rest_api/types/studio_llm_model_write.py +21 -0
  901. opik/rest_api/types/studio_message.py +20 -0
  902. opik/rest_api/types/studio_message_public.py +20 -0
  903. opik/rest_api/types/studio_message_write.py +20 -0
  904. opik/rest_api/types/studio_metric.py +21 -0
  905. opik/rest_api/types/studio_metric_public.py +21 -0
  906. opik/rest_api/types/studio_metric_write.py +21 -0
  907. opik/rest_api/types/studio_optimizer.py +21 -0
  908. opik/rest_api/types/studio_optimizer_public.py +21 -0
  909. opik/rest_api/types/studio_optimizer_write.py +21 -0
  910. opik/rest_api/types/studio_prompt.py +20 -0
  911. opik/rest_api/types/studio_prompt_public.py +20 -0
  912. opik/rest_api/types/studio_prompt_write.py +20 -0
  913. opik/rest_api/types/tool.py +4 -6
  914. opik/rest_api/types/tool_call.py +4 -6
  915. opik/rest_api/types/trace.py +26 -12
  916. opik/rest_api/types/trace_batch.py +4 -6
  917. opik/rest_api/types/trace_count_response.py +4 -6
  918. opik/rest_api/types/trace_enrichment_options.py +32 -0
  919. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +41 -0
  920. opik/rest_api/types/trace_filter.py +23 -0
  921. opik/rest_api/types/trace_filter_operator.py +21 -0
  922. opik/rest_api/types/trace_filter_public.py +23 -0
  923. opik/rest_api/types/trace_filter_public_operator.py +21 -0
  924. opik/rest_api/types/trace_filter_write.py +23 -0
  925. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  926. opik/rest_api/types/trace_page_public.py +8 -10
  927. opik/rest_api/types/trace_public.py +27 -13
  928. opik/rest_api/types/trace_public_visibility_mode.py +5 -0
  929. opik/rest_api/types/trace_thread.py +18 -9
  930. opik/rest_api/types/trace_thread_filter.py +23 -0
  931. opik/rest_api/types/trace_thread_filter_operator.py +21 -0
  932. opik/rest_api/types/trace_thread_filter_public.py +23 -0
  933. opik/rest_api/types/trace_thread_filter_public_operator.py +21 -0
  934. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  935. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  936. opik/rest_api/types/trace_thread_identifier.py +22 -0
  937. opik/rest_api/types/trace_thread_llm_as_judge_code.py +26 -0
  938. opik/rest_api/types/trace_thread_llm_as_judge_code_public.py +26 -0
  939. opik/rest_api/types/trace_thread_llm_as_judge_code_write.py +26 -0
  940. opik/rest_api/types/trace_thread_page.py +9 -6
  941. opik/rest_api/types/trace_thread_status.py +5 -0
  942. opik/rest_api/types/trace_thread_update.py +19 -0
  943. opik/rest_api/types/trace_thread_user_defined_metric_python_code.py +19 -0
  944. opik/rest_api/types/trace_thread_user_defined_metric_python_code_public.py +19 -0
  945. opik/rest_api/types/trace_thread_user_defined_metric_python_code_write.py +19 -0
  946. opik/rest_api/types/trace_update.py +39 -0
  947. opik/rest_api/types/trace_visibility_mode.py +5 -0
  948. opik/rest_api/types/trace_write.py +10 -11
  949. opik/rest_api/types/usage.py +6 -6
  950. opik/rest_api/types/user_defined_metric_python_code.py +3 -5
  951. opik/rest_api/types/user_defined_metric_python_code_public.py +3 -5
  952. opik/rest_api/types/user_defined_metric_python_code_write.py +3 -5
  953. opik/rest_api/types/value_entry.py +27 -0
  954. opik/rest_api/types/value_entry_compare.py +27 -0
  955. opik/rest_api/types/value_entry_compare_source.py +5 -0
  956. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +27 -0
  957. opik/rest_api/types/value_entry_experiment_item_bulk_write_view_source.py +5 -0
  958. opik/rest_api/types/value_entry_public.py +27 -0
  959. opik/rest_api/types/value_entry_public_source.py +5 -0
  960. opik/rest_api/types/value_entry_source.py +5 -0
  961. opik/rest_api/types/video_url.py +19 -0
  962. opik/rest_api/types/video_url_public.py +19 -0
  963. opik/rest_api/types/video_url_write.py +19 -0
  964. opik/rest_api/types/webhook.py +28 -0
  965. opik/rest_api/types/webhook_examples.py +19 -0
  966. opik/rest_api/types/webhook_public.py +28 -0
  967. opik/rest_api/types/webhook_test_result.py +23 -0
  968. opik/rest_api/types/webhook_test_result_status.py +5 -0
  969. opik/rest_api/types/webhook_write.py +23 -0
  970. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  971. opik/rest_api/types/workspace_configuration.py +27 -0
  972. opik/rest_api/types/workspace_metric_request.py +24 -0
  973. opik/rest_api/types/workspace_metric_response.py +20 -0
  974. opik/rest_api/types/workspace_metrics_summary_request.py +23 -0
  975. opik/rest_api/types/workspace_metrics_summary_response.py +20 -0
  976. opik/rest_api/types/workspace_name_holder.py +19 -0
  977. opik/rest_api/types/workspace_spans_count.py +20 -0
  978. opik/rest_api/types/workspace_trace_count.py +3 -5
  979. opik/rest_api/welcome_wizard/__init__.py +4 -0
  980. opik/rest_api/welcome_wizard/client.py +195 -0
  981. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  982. opik/rest_api/workspaces/__init__.py +2 -0
  983. opik/rest_api/workspaces/client.py +550 -77
  984. opik/rest_api/workspaces/raw_client.py +923 -0
  985. opik/rest_client_configurator/api.py +1 -0
  986. opik/rest_client_configurator/retry_decorator.py +1 -0
  987. opik/s3_httpx_client.py +67 -0
  988. opik/simulation/__init__.py +6 -0
  989. opik/simulation/simulated_user.py +99 -0
  990. opik/simulation/simulator.py +108 -0
  991. opik/synchronization.py +11 -24
  992. opik/tracing_runtime_config.py +48 -0
  993. opik/types.py +48 -2
  994. opik/url_helpers.py +13 -3
  995. opik/validation/chat_prompt_messages.py +241 -0
  996. opik/validation/feedback_score.py +4 -5
  997. opik/validation/parameter.py +122 -0
  998. opik/validation/parameters_validator.py +175 -0
  999. opik/validation/validator.py +30 -2
  1000. opik/validation/validator_helpers.py +147 -0
  1001. opik-1.9.71.dist-info/METADATA +370 -0
  1002. opik-1.9.71.dist-info/RECORD +1110 -0
  1003. {opik-1.6.4.dist-info → opik-1.9.71.dist-info}/WHEEL +1 -1
  1004. opik-1.9.71.dist-info/licenses/LICENSE +203 -0
  1005. opik/api_objects/prompt/prompt.py +0 -107
  1006. opik/api_objects/prompt/prompt_template.py +0 -35
  1007. opik/cli.py +0 -193
  1008. opik/evaluation/metrics/models.py +0 -8
  1009. opik/hooks.py +0 -13
  1010. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  1011. opik/integrations/bedrock/helpers.py +0 -8
  1012. opik/integrations/langchain/google_run_helpers.py +0 -75
  1013. opik/integrations/langchain/openai_run_helpers.py +0 -122
  1014. opik/message_processing/message_processors.py +0 -203
  1015. opik/rest_api/types/delta_role.py +0 -7
  1016. opik/rest_api/types/json_object_schema.py +0 -34
  1017. opik-1.6.4.dist-info/METADATA +0 -270
  1018. opik-1.6.4.dist-info/RECORD +0 -507
  1019. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  1020. {opik-1.6.4.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  1021. {opik-1.6.4.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
@@ -1,37 +1,98 @@
1
1
  import logging
2
2
  import time
3
- from typing import Any, Callable, Dict, List, Optional, Union
3
+ from typing import Any, Callable, Dict, List, Optional, Union, cast
4
4
 
5
- from .. import Prompt
5
+ from ..api_objects.prompt import base_prompt
6
6
  from ..api_objects import opik_client
7
- from ..api_objects.dataset import dataset
7
+ from ..api_objects import dataset, experiment
8
8
  from ..api_objects.experiment import helpers as experiment_helpers
9
- from ..api_objects.prompt import prompt_template
10
- from . import asyncio_support, engine, evaluation_result, report, rest_operations
11
- from .metrics import base_metric
12
- from .models import base_model, models_factory
13
- from .types import LLMTask, ScoringKeyMappingType
9
+ from ..api_objects.prompt.chat import chat_prompt_template
10
+ from ..api_objects.prompt import types as prompt_types
11
+ from . import (
12
+ asyncio_support,
13
+ engine,
14
+ evaluation_result,
15
+ report,
16
+ rest_operations,
17
+ samplers,
18
+ )
19
+ from .metrics import base_metric, score_result
20
+ from .models import ModelCapabilities, base_model, models_factory
21
+ from .scorers import scorer_function, scorer_wrapper_metric
22
+ from . import test_result
23
+ from .types import ExperimentScoreFunction, LLMTask, ScoringKeyMappingType
24
+ from .. import url_helpers
14
25
 
15
26
  LOGGER = logging.getLogger(__name__)
27
+ MODALITY_SUPPORT_DOC_URL = (
28
+ "https://www.comet.com/docs/opik/evaluation/evaluate_multimodal"
29
+ )
30
+
31
+
32
+ def _try_notifying_about_experiment_completion(
33
+ experiment: experiment.Experiment,
34
+ ) -> None:
35
+ try:
36
+ experiment.experiments_rest_client.finish_experiments(ids=[experiment.id])
37
+ except Exception:
38
+ LOGGER.debug(
39
+ "Failed to notify backend about the experiment completion. Experiment ID: %s",
40
+ experiment.id,
41
+ exc_info=True,
42
+ )
43
+
44
+
45
+ def _compute_experiment_scores(
46
+ experiment_scoring_functions: List[ExperimentScoreFunction],
47
+ test_results: List[test_result.TestResult],
48
+ ) -> List[score_result.ScoreResult]:
49
+ """Compute experiment-level scores from test results."""
50
+ if not experiment_scoring_functions or not test_results:
51
+ return []
52
+
53
+ all_scores: List[score_result.ScoreResult] = []
54
+ for score_function in experiment_scoring_functions:
55
+ try:
56
+ scores = score_function(test_results)
57
+ # Handle Union[ScoreResult, List[ScoreResult]]
58
+ if isinstance(scores, list):
59
+ all_scores.extend(scores)
60
+ else:
61
+ all_scores.append(scores)
62
+ except Exception as e:
63
+ LOGGER.warning(
64
+ "Failed to compute experiment score: %s",
65
+ e,
66
+ exc_info=True,
67
+ )
68
+
69
+ return all_scores
16
70
 
17
71
 
18
72
  def evaluate(
19
73
  dataset: dataset.Dataset,
20
74
  task: LLMTask,
21
75
  scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
76
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
77
+ experiment_name_prefix: Optional[str] = None,
22
78
  experiment_name: Optional[str] = None,
23
79
  project_name: Optional[str] = None,
24
80
  experiment_config: Optional[Dict[str, Any]] = None,
25
81
  verbose: int = 1,
26
82
  nb_samples: Optional[int] = None,
27
83
  task_threads: int = 16,
28
- prompt: Optional[Prompt] = None,
29
- prompts: Optional[List[Prompt]] = None,
84
+ prompt: Optional[base_prompt.BasePrompt] = None,
85
+ prompts: Optional[List[base_prompt.BasePrompt]] = None,
30
86
  scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
31
87
  dataset_item_ids: Optional[List[str]] = None,
88
+ dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
89
+ trial_count: int = 1,
90
+ experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
32
91
  ) -> evaluation_result.EvaluationResult:
33
92
  """
34
- Performs task evaluation on a given dataset.
93
+ Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
94
+ evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
95
+ to receive inputs and outputs from the task.
35
96
 
36
97
  Args:
37
98
  dataset: An Opik dataset instance
@@ -39,6 +100,10 @@ def evaluate(
39
100
  task: A callable object that takes dict with dataset item content
40
101
  as input and returns dict which will later be used for scoring.
41
102
 
103
+ experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
104
+ but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
105
+ the first experiment created will be named `my-experiment-<unique-random-part>`.
106
+
42
107
  experiment_name: The name of the experiment associated with evaluation run.
43
108
  If None, a generated name will be used.
44
109
 
@@ -53,8 +118,16 @@ def evaluate(
53
118
  are mandatory in `task`-returned dictionary.
54
119
  If no value provided, the experiment won't have any scoring metrics.
55
120
 
121
+ scoring_functions: List of scorer functions to be executed during evaluation.
122
+ Each scorer function includes a scoring method that accepts predefined
123
+ arguments supplied by the evaluation engine:
124
+ • dataset_item — a dictionary containing the dataset item content,
125
+ • task_outputs — a dictionary containing the LLM task output.
126
+ • task_span - the data collected during the LLM task execution [optional].
127
+
56
128
  verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
57
- 0 - no outputs, 1 - outputs are enabled (default).
129
+ 0 - no outputs, 1 - outputs are enabled (default), 2 - outputs are enabled and detailed statistics
130
+ are displayed.
58
131
 
59
132
  nb_samples: number of samples to evaluate. If no value is provided, all samples in the dataset will be evaluated.
60
133
 
@@ -73,9 +146,20 @@ def evaluate(
73
146
  `{"input": "user_question"}` to map the "user_question" key to "input".
74
147
 
75
148
  dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
149
+
150
+ dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
151
+ If not provided, all samples in the dataset will be evaluated.
152
+
153
+ trial_count: number of times to run the task and evaluate the task output for every dataset item.
154
+
155
+ experiment_scoring_functions: List of callable functions that compute experiment-level scores.
156
+ Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
157
+ These scores are computed after all test results are collected and represent aggregate
158
+ metrics across the entire experiment.
76
159
  """
77
- if scoring_metrics is None:
78
- scoring_metrics = []
160
+ experiment_scoring_functions = (
161
+ [] if experiment_scoring_functions is None else experiment_scoring_functions
162
+ )
79
163
 
80
164
  checked_prompts = experiment_helpers.handle_prompt_args(
81
165
  prompt=prompt,
@@ -84,6 +168,11 @@ def evaluate(
84
168
 
85
169
  client = opik_client.get_client_cached()
86
170
 
171
+ experiment_name = _use_or_create_experiment_name(
172
+ experiment_name=experiment_name,
173
+ experiment_name_prefix=experiment_name_prefix,
174
+ )
175
+
87
176
  experiment = client.create_experiment(
88
177
  name=experiment_name,
89
178
  dataset_name=dataset.name,
@@ -91,56 +180,130 @@ def evaluate(
91
180
  prompts=checked_prompts,
92
181
  )
93
182
 
183
+ # wrap scoring functions if any
184
+ scoring_metrics = _wrap_scoring_functions(
185
+ scoring_functions=scoring_functions,
186
+ scoring_metrics=scoring_metrics,
187
+ project_name=project_name,
188
+ )
189
+
190
+ return _evaluate_task(
191
+ client=client,
192
+ experiment=experiment,
193
+ dataset=dataset,
194
+ task=task,
195
+ scoring_metrics=scoring_metrics,
196
+ project_name=project_name,
197
+ verbose=verbose,
198
+ nb_samples=nb_samples,
199
+ task_threads=task_threads,
200
+ scoring_key_mapping=scoring_key_mapping,
201
+ dataset_item_ids=dataset_item_ids,
202
+ dataset_sampler=dataset_sampler,
203
+ trial_count=trial_count,
204
+ experiment_scoring_functions=experiment_scoring_functions,
205
+ )
206
+
207
+
208
+ def _evaluate_task(
209
+ *,
210
+ client: opik_client.Opik,
211
+ experiment: experiment.Experiment,
212
+ dataset: dataset.Dataset,
213
+ task: LLMTask,
214
+ scoring_metrics: List[base_metric.BaseMetric],
215
+ project_name: Optional[str],
216
+ verbose: int,
217
+ nb_samples: Optional[int],
218
+ task_threads: int,
219
+ scoring_key_mapping: Optional[ScoringKeyMappingType],
220
+ dataset_item_ids: Optional[List[str]],
221
+ dataset_sampler: Optional[samplers.BaseDatasetSampler],
222
+ trial_count: int,
223
+ experiment_scoring_functions: List[ExperimentScoreFunction],
224
+ ) -> evaluation_result.EvaluationResult:
94
225
  start_time = time.time()
95
226
 
96
227
  with asyncio_support.async_http_connections_expire_immediately():
97
228
  evaluation_engine = engine.EvaluationEngine(
98
229
  client=client,
99
230
  project_name=project_name,
100
- experiment_=experiment,
101
231
  scoring_metrics=scoring_metrics,
102
232
  workers=task_threads,
103
233
  verbose=verbose,
104
234
  scoring_key_mapping=scoring_key_mapping,
105
235
  )
106
- test_results = evaluation_engine.evaluate_llm_tasks(
236
+ test_results = evaluation_engine.evaluate_llm_task_on_dataset(
107
237
  dataset_=dataset,
108
238
  task=task,
109
239
  nb_samples=nb_samples,
110
240
  dataset_item_ids=dataset_item_ids,
241
+ dataset_sampler=dataset_sampler,
242
+ trial_count=trial_count,
243
+ experiment_=experiment,
111
244
  )
112
245
 
113
246
  total_time = time.time() - start_time
114
247
 
115
- if verbose == 1:
116
- report.display_experiment_results(dataset.name, total_time, test_results)
248
+ # Compute experiment scores
249
+ computed_experiment_scores = _compute_experiment_scores(
250
+ experiment_scoring_functions=experiment_scoring_functions,
251
+ test_results=test_results,
252
+ )
253
+
254
+ if verbose >= 1:
255
+ report.display_experiment_results(
256
+ dataset.name, total_time, test_results, computed_experiment_scores
257
+ )
117
258
 
118
- report.display_experiment_link(
259
+ experiment_url = url_helpers.get_experiment_url_by_id(
119
260
  experiment_id=experiment.id,
120
261
  dataset_id=dataset.id,
121
262
  url_override=client.config.url_override,
122
263
  )
123
264
 
265
+ report.display_experiment_link(experiment_url=experiment_url)
266
+
124
267
  client.flush()
125
268
 
269
+ _try_notifying_about_experiment_completion(experiment)
270
+
271
+ # Log experiment scores to backend
272
+ if computed_experiment_scores:
273
+ experiment.log_experiment_scores(score_results=computed_experiment_scores)
274
+
126
275
  evaluation_result_ = evaluation_result.EvaluationResult(
276
+ dataset_id=dataset.id,
127
277
  experiment_id=experiment.id,
128
278
  experiment_name=experiment.name,
129
279
  test_results=test_results,
280
+ experiment_url=experiment_url,
281
+ trial_count=trial_count,
282
+ experiment_scores=computed_experiment_scores,
130
283
  )
131
284
 
285
+ if verbose >= 2:
286
+ report.display_evaluation_scores_statistics(
287
+ dataset_name=dataset.name,
288
+ evaluation_results=evaluation_result_,
289
+ )
290
+
132
291
  return evaluation_result_
133
292
 
134
293
 
135
294
  def evaluate_experiment(
136
295
  experiment_name: str,
137
296
  scoring_metrics: List[base_metric.BaseMetric],
297
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
138
298
  scoring_threads: int = 16,
139
299
  verbose: int = 1,
140
300
  scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
141
301
  experiment_id: Optional[str] = None,
302
+ experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
142
303
  ) -> evaluation_result.EvaluationResult:
143
- """Update existing experiment with new evaluation metrics.
304
+ """Update the existing experiment with new evaluation metrics. You can use either `scoring_metrics` or `scorer_functions` to calculate
305
+ evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
306
+ to receive inputs and outputs from the task.
144
307
 
145
308
  Args:
146
309
  experiment_name: The name of the experiment to update.
@@ -151,15 +314,32 @@ def evaluate_experiment(
151
314
  of the `score` method in metrics that you need to find out which keys
152
315
  are mandatory in `task`-returned dictionary.
153
316
 
317
+ scoring_functions: List of scorer functions to be executed during evaluation.
318
+ Each scorer function includes a scoring method that accepts predefined
319
+ arguments supplied by the evaluation engine:
320
+ • dataset_item — a dictionary containing the dataset item content,
321
+ • task_outputs — a dictionary containing the LLM task output.
322
+ • task_span - the data collected during the LLM task execution [optional].
323
+
154
324
  scoring_threads: amount of thread workers to run scoring metrics.
155
325
 
156
326
  verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
157
327
 
158
328
  scoring_key_mapping: A dictionary that allows you to rename keys present in either the dataset item or the task output
159
- so that they match the keys expected by the scoring metrics. For example if you have a dataset item with the following content:
329
+ so that they match the keys expected by the scoring metrics. For example, if you have a dataset item with the following content:
160
330
  {"user_question": "What is Opik ?"} and a scoring metric that expects a key "input", you can use scoring_key_mapping
161
331
  `{"input": "user_question"}` to map the "user_question" key to "input".
332
+
333
+ experiment_id: The ID of the experiment to evaluate. If not provided, the experiment will be evaluated based on the experiment name.
334
+
335
+ experiment_scoring_functions: List of callable functions that compute experiment-level scores.
336
+ Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
337
+ These scores are computed after all test results are collected and represent aggregate
338
+ metrics across the entire experiment.
162
339
  """
340
+ experiment_scoring_functions = (
341
+ [] if experiment_scoring_functions is None else experiment_scoring_functions
342
+ )
163
343
  start_time = time.time()
164
344
 
165
345
  client = opik_client.get_client_cached()
@@ -172,10 +352,11 @@ def evaluate_experiment(
172
352
  client=client, experiment_name=experiment_name
173
353
  )
174
354
 
355
+ dataset_ = client.get_dataset(name=experiment.dataset_name)
356
+
175
357
  test_cases = rest_operations.get_experiment_test_cases(
176
- client=client,
177
- experiment_id=experiment.id,
178
- dataset_id=experiment.dataset_id,
358
+ experiment_=experiment,
359
+ dataset_=dataset_,
179
360
  scoring_key_mapping=scoring_key_mapping,
180
361
  )
181
362
  first_trace_id = test_cases[0].trace_id
@@ -183,11 +364,17 @@ def evaluate_experiment(
183
364
  client=client, trace_id=first_trace_id
184
365
  )
185
366
 
367
+ # wrap scoring functions if any
368
+ scoring_metrics = _wrap_scoring_functions(
369
+ scoring_functions=scoring_functions,
370
+ scoring_metrics=scoring_metrics,
371
+ project_name=project_name,
372
+ )
373
+
186
374
  with asyncio_support.async_http_connections_expire_immediately():
187
375
  evaluation_engine = engine.EvaluationEngine(
188
376
  client=client,
189
377
  project_name=project_name,
190
- experiment_=experiment,
191
378
  scoring_metrics=scoring_metrics,
192
379
  workers=scoring_threads,
193
380
  verbose=verbose,
@@ -199,47 +386,104 @@ def evaluate_experiment(
199
386
 
200
387
  total_time = time.time() - start_time
201
388
 
202
- if verbose == 1:
389
+ # Compute experiment scores
390
+ computed_experiment_scores = _compute_experiment_scores(
391
+ experiment_scoring_functions=experiment_scoring_functions,
392
+ test_results=test_results,
393
+ )
394
+
395
+ if verbose >= 1:
203
396
  report.display_experiment_results(
204
- experiment.dataset_name, total_time, test_results
397
+ dataset_.name,
398
+ total_time,
399
+ test_results,
400
+ computed_experiment_scores,
205
401
  )
206
402
 
207
- report.display_experiment_link(
208
- dataset_id=experiment.dataset_id,
403
+ experiment_url = url_helpers.get_experiment_url_by_id(
209
404
  experiment_id=experiment.id,
405
+ dataset_id=dataset_.id,
210
406
  url_override=client.config.url_override,
211
407
  )
212
408
 
409
+ report.display_experiment_link(experiment_url=experiment_url)
410
+
411
+ _try_notifying_about_experiment_completion(experiment)
412
+
413
+ # Log experiment scores to backend
414
+ if computed_experiment_scores:
415
+ experiment.log_experiment_scores(score_results=computed_experiment_scores)
416
+
213
417
  evaluation_result_ = evaluation_result.EvaluationResult(
418
+ dataset_id=dataset_.id,
214
419
  experiment_id=experiment.id,
215
420
  experiment_name=experiment.name,
216
421
  test_results=test_results,
422
+ experiment_url=experiment_url,
423
+ trial_count=1,
424
+ experiment_scores=computed_experiment_scores,
217
425
  )
218
426
 
427
+ if verbose >= 2:
428
+ report.display_evaluation_scores_statistics(
429
+ dataset_name=dataset_.name,
430
+ evaluation_results=evaluation_result_,
431
+ )
432
+
219
433
  return evaluation_result_
220
434
 
221
435
 
222
436
  def _build_prompt_evaluation_task(
223
437
  model: base_model.OpikBaseModel, messages: List[Dict[str, Any]]
224
438
  ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
225
- def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
226
- processed_messages = []
227
- for message in messages:
228
- processed_messages.append(
229
- {
230
- "role": message["role"],
231
- "content": prompt_template.PromptTemplate(
232
- message["content"], validate_placeholders=False
233
- ).format(**prompt_variables),
234
- }
235
- )
439
+ supported_modalities = cast(
440
+ prompt_types.SupportedModalities,
441
+ {
442
+ "vision": ModelCapabilities.supports_vision(
443
+ getattr(model, "model_name", None)
444
+ ),
445
+ "video": ModelCapabilities.supports_video(
446
+ getattr(model, "model_name", None)
447
+ ),
448
+ },
449
+ )
450
+ # Disable placeholder validation since we pass all dataset item fields to format()
451
+ chat_prompt_template_ = chat_prompt_template.ChatPromptTemplate(
452
+ messages=messages, validate_placeholders=False
453
+ )
236
454
 
237
- llm_output = model.generate_provider_response(messages=processed_messages)
455
+ required_modalities = chat_prompt_template_.required_modalities()
456
+ unsupported_modalities = {
457
+ modality
458
+ for modality in required_modalities
459
+ if not supported_modalities.get(modality, False)
460
+ }
461
+
462
+ if unsupported_modalities:
463
+ modalities_list = ", ".join(sorted(unsupported_modalities))
464
+ LOGGER.warning(
465
+ "Model '%s' does not support %s content. Multimedia parts will be flattened "
466
+ "to text placeholders. See %s for supported models and customization options.",
467
+ getattr(model, "model_name", "unknown"),
468
+ modalities_list,
469
+ MODALITY_SUPPORT_DOC_URL,
470
+ )
238
471
 
239
- return {
240
- "input": processed_messages,
241
- "output": llm_output.choices[0].message.content,
242
- }
472
+ def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
473
+ template_type_override = prompt_variables.get("type")
474
+ processed_messages = chat_prompt_template_.format(
475
+ variables=prompt_variables,
476
+ supported_modalities=supported_modalities,
477
+ template_type=template_type_override,
478
+ )
479
+
480
+ with base_model.get_provider_response(
481
+ model_provider=model, messages=processed_messages
482
+ ) as llm_output:
483
+ return {
484
+ "input": processed_messages,
485
+ "output": llm_output.choices[0].message.content,
486
+ }
243
487
 
244
488
  return _prompt_evaluation_task
245
489
 
@@ -249,14 +493,19 @@ def evaluate_prompt(
249
493
  messages: List[Dict[str, Any]],
250
494
  model: Optional[Union[str, base_model.OpikBaseModel]] = None,
251
495
  scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
496
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
497
+ experiment_name_prefix: Optional[str] = None,
252
498
  experiment_name: Optional[str] = None,
253
499
  project_name: Optional[str] = None,
254
500
  experiment_config: Optional[Dict[str, Any]] = None,
255
501
  verbose: int = 1,
256
502
  nb_samples: Optional[int] = None,
257
503
  task_threads: int = 16,
258
- prompt: Optional[Prompt] = None,
504
+ prompt: Optional[base_prompt.BasePrompt] = None,
259
505
  dataset_item_ids: Optional[List[str]] = None,
506
+ dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
507
+ trial_count: int = 1,
508
+ experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
260
509
  ) -> evaluation_result.EvaluationResult:
261
510
  """
262
511
  Performs prompt evaluation on a given dataset.
@@ -271,6 +520,17 @@ def evaluate_prompt(
271
520
  scoring_metrics: List of metrics to calculate during evaluation.
272
521
  The LLM input and output will be passed as arguments to each metric `score(...)` method.
273
522
 
523
+ scoring_functions: List of scorer functions to be executed during evaluation.
524
+ Each scorer function includes a scoring method that accepts predefined
525
+ arguments supplied by the evaluation engine:
526
+ • dataset_item — a dictionary containing the dataset item content,
527
+ • task_outputs — a dictionary containing the LLM task output.
528
+ • task_span - the data collected during the LLM task execution [optional].
529
+
530
+ experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
531
+ but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
532
+ the first experiment created will be named `my-experiment-<unique-random-part>`.
533
+
274
534
  experiment_name: name of the experiment.
275
535
 
276
536
  project_name: The name of the project to log data
@@ -286,28 +546,48 @@ def evaluate_prompt(
286
546
  prompt: Prompt object to link with experiment.
287
547
 
288
548
  dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
549
+
550
+ dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
551
+ If not provided, all samples in the dataset will be evaluated.
552
+
553
+ trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
554
+
555
+ experiment_scoring_functions: List of callable functions that compute experiment-level scores.
556
+ Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
557
+ These scores are computed after all test results are collected and represent aggregate
558
+ metrics across the entire experiment.
289
559
  """
560
+ experiment_scoring_functions = (
561
+ [] if experiment_scoring_functions is None else experiment_scoring_functions
562
+ )
290
563
  if isinstance(model, str):
291
- model = models_factory.get(model_name=model)
564
+ opik_model = models_factory.get(model_name=model)
292
565
  elif not isinstance(model, base_model.OpikBaseModel):
293
566
  raise ValueError("`model` must be either a string or an OpikBaseModel instance")
567
+ else:
568
+ opik_model = model
294
569
 
295
570
  if experiment_config is None:
296
- experiment_config = {"prompt_template": messages, "model": model.model_name}
571
+ experiment_config = {
572
+ "prompt_template": messages,
573
+ "model": opik_model.model_name,
574
+ }
297
575
  else:
298
576
  if "prompt_template" not in experiment_config:
299
577
  experiment_config["prompt_template"] = messages
300
578
 
301
579
  if "model" not in experiment_config:
302
- experiment_config["model"] = model.model_name
303
-
304
- if scoring_metrics is None:
305
- scoring_metrics = []
580
+ experiment_config["model"] = opik_model.model_name
306
581
 
307
582
  client = opik_client.get_client_cached()
308
583
 
309
584
  prompts = [prompt] if prompt else None
310
585
 
586
+ experiment_name = _use_or_create_experiment_name(
587
+ experiment_name=experiment_name,
588
+ experiment_name_prefix=experiment_name_prefix,
589
+ )
590
+
311
591
  experiment = client.create_experiment(
312
592
  name=experiment_name,
313
593
  dataset_name=dataset.name,
@@ -315,42 +595,363 @@ def evaluate_prompt(
315
595
  prompts=prompts,
316
596
  )
317
597
 
598
+ # wrap scoring functions if any
599
+ scoring_metrics = _wrap_scoring_functions(
600
+ scoring_functions=scoring_functions,
601
+ scoring_metrics=scoring_metrics,
602
+ project_name=project_name,
603
+ )
604
+
318
605
  start_time = time.time()
319
606
 
320
607
  with asyncio_support.async_http_connections_expire_immediately():
321
608
  evaluation_engine = engine.EvaluationEngine(
322
609
  client=client,
323
610
  project_name=project_name,
324
- experiment_=experiment,
325
611
  scoring_metrics=scoring_metrics,
326
612
  workers=task_threads,
327
613
  verbose=verbose,
328
614
  scoring_key_mapping=None,
329
615
  )
330
- test_results = evaluation_engine.evaluate_llm_tasks(
616
+ test_results = evaluation_engine.evaluate_llm_task_on_dataset(
331
617
  dataset_=dataset,
332
- task=_build_prompt_evaluation_task(model=model, messages=messages),
618
+ task=_build_prompt_evaluation_task(model=opik_model, messages=messages),
333
619
  nb_samples=nb_samples,
334
620
  dataset_item_ids=dataset_item_ids,
621
+ dataset_sampler=dataset_sampler,
622
+ trial_count=trial_count,
623
+ experiment_=experiment,
335
624
  )
336
625
 
337
626
  total_time = time.time() - start_time
338
627
 
339
- if verbose == 1:
340
- report.display_experiment_results(dataset.name, total_time, test_results)
628
+ # Compute experiment scores
629
+ computed_experiment_scores = _compute_experiment_scores(
630
+ experiment_scoring_functions=experiment_scoring_functions,
631
+ test_results=test_results,
632
+ )
341
633
 
342
- report.display_experiment_link(
634
+ if verbose >= 1:
635
+ report.display_experiment_results(
636
+ dataset.name, total_time, test_results, computed_experiment_scores
637
+ )
638
+
639
+ experiment_url = url_helpers.get_experiment_url_by_id(
343
640
  experiment_id=experiment.id,
344
641
  dataset_id=dataset.id,
345
642
  url_override=client.config.url_override,
346
643
  )
347
644
 
645
+ report.display_experiment_link(experiment_url=experiment_url)
646
+
348
647
  client.flush()
349
648
 
649
+ _try_notifying_about_experiment_completion(experiment)
650
+
651
+ # Log experiment scores to backend
652
+ if computed_experiment_scores:
653
+ experiment.log_experiment_scores(score_results=computed_experiment_scores)
654
+
350
655
  evaluation_result_ = evaluation_result.EvaluationResult(
351
656
  experiment_id=experiment.id,
657
+ dataset_id=dataset.id,
352
658
  experiment_name=experiment.name,
353
659
  test_results=test_results,
660
+ experiment_url=experiment_url,
661
+ trial_count=trial_count,
662
+ experiment_scores=computed_experiment_scores,
354
663
  )
355
664
 
665
+ if verbose >= 2:
666
+ report.display_evaluation_scores_statistics(
667
+ dataset_name=dataset.name,
668
+ evaluation_results=evaluation_result_,
669
+ )
670
+
356
671
  return evaluation_result_
672
+
673
+
674
+ def evaluate_optimization_trial(
675
+ optimization_id: str,
676
+ dataset: dataset.Dataset,
677
+ task: LLMTask,
678
+ scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
679
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
680
+ experiment_name_prefix: Optional[str] = None,
681
+ experiment_name: Optional[str] = None,
682
+ project_name: Optional[str] = None,
683
+ experiment_config: Optional[Dict[str, Any]] = None,
684
+ verbose: int = 1,
685
+ nb_samples: Optional[int] = None,
686
+ task_threads: int = 16,
687
+ prompt: Optional[base_prompt.BasePrompt] = None,
688
+ prompts: Optional[List[base_prompt.BasePrompt]] = None,
689
+ scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
690
+ dataset_item_ids: Optional[List[str]] = None,
691
+ dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
692
+ trial_count: int = 1,
693
+ experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
694
+ ) -> evaluation_result.EvaluationResult:
695
+ """
696
+ Performs task evaluation on a given dataset.
697
+
698
+ Args:
699
+ optimization_id: The ID of the optimization associated with the experiment.
700
+
701
+ dataset: An Opik dataset instance
702
+
703
+ task: A callable object that takes dict with dataset item content
704
+ as input and returns dict which will later be used for scoring.
705
+
706
+ scoring_functions: List of scorer functions to be executed during evaluation.
707
+ Each scorer function includes a scoring method that accepts predefined
708
+ arguments supplied by the evaluation engine:
709
+ • dataset_item — a dictionary containing the dataset item content,
710
+ • task_outputs — a dictionary containing the LLM task output.
711
+ • task_span - the data collected during the LLM task execution [optional].
712
+
713
+ experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
714
+ but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
715
+ the first experiment created will be named `my-experiment-<unique-random-part>`.
716
+
717
+ experiment_name: The name of the experiment associated with evaluation run.
718
+ If None, a generated name will be used.
719
+
720
+ project_name: The name of the project. If not provided, traces and spans will be logged to the `Default Project`
721
+
722
+ experiment_config: The dictionary with parameters that describe experiment
723
+
724
+ scoring_metrics: List of metrics to calculate during evaluation.
725
+ Each metric has `score(...)` method, arguments for this method
726
+ are taken from the `task` output, check the signature
727
+ of the `score` method in metrics that you need to find out which keys
728
+ are mandatory in `task`-returned dictionary.
729
+ If no value provided, the experiment won't have any scoring metrics.
730
+
731
+ verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
732
+ 0 - no outputs, 1 - outputs are enabled (default).
733
+
734
+ nb_samples: number of samples to evaluate. If no value is provided, all samples in the dataset will be evaluated.
735
+
736
+ task_threads: number of thread workers to run tasks. If set to 1, no additional
737
+ threads are created, all tasks executed in the current thread sequentially.
738
+ are executed sequentially in the current thread.
739
+ Use more than 1 worker if your task object is compatible with sharing across threads.
740
+
741
+ prompt: Prompt object to link with experiment. Deprecated, use `prompts` argument instead.
742
+
743
+ prompts: A list of Prompt objects to link with experiment.
744
+
745
+ scoring_key_mapping: A dictionary that allows you to rename keys present in either the dataset item or the task output
746
+ so that they match the keys expected by the scoring metrics. For example if you have a dataset item with the following content:
747
+ {"user_question": "What is Opik ?"} and a scoring metric that expects a key "input", you can use scoring_key_mapping
748
+ `{"input": "user_question"}` to map the "user_question" key to "input".
749
+
750
+ dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
751
+
752
+ dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
753
+ If not provided, all samples in the dataset will be evaluated.
754
+
755
+ trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
756
+
757
+ experiment_scoring_functions: List of callable functions that compute experiment-level scores.
758
+ Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
759
+ These scores are computed after all test results are collected and represent aggregate
760
+ metrics across the entire experiment.
761
+ """
762
+ experiment_scoring_functions = (
763
+ [] if experiment_scoring_functions is None else experiment_scoring_functions
764
+ )
765
+
766
+ if scoring_metrics is None:
767
+ scoring_metrics = []
768
+
769
+ checked_prompts = experiment_helpers.handle_prompt_args(
770
+ prompt=prompt,
771
+ prompts=prompts,
772
+ )
773
+
774
+ # wrap scoring functions if any
775
+ scoring_metrics = _wrap_scoring_functions(
776
+ scoring_functions=scoring_functions,
777
+ scoring_metrics=scoring_metrics,
778
+ project_name=project_name,
779
+ )
780
+
781
+ client = opik_client.get_client_cached()
782
+
783
+ experiment_name = _use_or_create_experiment_name(
784
+ experiment_name=experiment_name,
785
+ experiment_name_prefix=experiment_name_prefix,
786
+ )
787
+
788
+ experiment = client.create_experiment(
789
+ name=experiment_name,
790
+ dataset_name=dataset.name,
791
+ experiment_config=experiment_config,
792
+ prompts=checked_prompts,
793
+ type="trial",
794
+ optimization_id=optimization_id,
795
+ )
796
+
797
+ return _evaluate_task(
798
+ client=client,
799
+ experiment=experiment,
800
+ dataset=dataset,
801
+ task=task,
802
+ scoring_metrics=scoring_metrics,
803
+ project_name=project_name,
804
+ verbose=verbose,
805
+ nb_samples=nb_samples,
806
+ task_threads=task_threads,
807
+ scoring_key_mapping=scoring_key_mapping,
808
+ dataset_item_ids=dataset_item_ids,
809
+ dataset_sampler=dataset_sampler,
810
+ trial_count=trial_count,
811
+ experiment_scoring_functions=experiment_scoring_functions,
812
+ )
813
+
814
+
815
def evaluate_on_dict_items(
    items: List[Dict[str, Any]],
    task: LLMTask,
    scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
    project_name: Optional[str] = None,
    verbose: int = 0,
    scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
    scoring_threads: int = 16,
) -> evaluation_result.EvaluationResultOnDictItems:
    """
    Score an LLM task over a list of plain dictionaries, skipping dataset and
    experiment management entirely.

    Unlike the full evaluation entry points, this helper needs no Dataset
    object and never creates an experiment, while still logging traces. That
    makes it a good fit for optimization loops that must score many candidate
    solutions quickly with Opik's metric infrastructure.

    Args:
        items: Dataset item contents to evaluate, one dictionary per item.

        task: Callable invoked with each item dictionary; the dictionary it
            returns is what the metrics score.

        scoring_metrics: Metrics whose `score(...)` methods are called with
            arguments drawn from the dataset item and the task output.

        scoring_functions: Scorer functions executed during evaluation.
            Each scorer function accepts predefined arguments:
            • dataset_item — a dictionary containing the dataset item content,
            • task_outputs — a dictionary containing the LLM task output.

        project_name: The name of the project for logging traces.

        verbose: Controls evaluation output logs and progress bars.
            0 - no outputs (default), 1 - enable outputs.

        scoring_key_mapping: Renames keys present in either the dataset item
            or the task output so they match the keys the scoring metrics expect.

        scoring_threads: Number of thread workers to run scoring metrics.

    Returns:
        EvaluationResultOnDictItems object containing test results and providing
        methods to aggregate scores, similar to the regular evaluation result.

    Example:
        ```python
        import opik
        from opik.evaluation.metrics import Equals

        items = [
            {"input": "What is 2+2?", "expected_output": "4"},
            {"input": "What is 3+3?", "expected_output": "6"},
        ]

        def my_task(item):
            # Your LLM call here
            question = item["input"]
            # ... call model ...
            return {"output": model_output}

        result = opik.evaluate_on_dict_items(
            items=items,
            task=my_task,
            scoring_metrics=[Equals()],
            scoring_key_mapping={"reference": "expected_output"},
        )

        # Access individual test results
        for test_result in result.test_results:
            print(f"Score: {test_result.score_results[0].value}")

        # Get aggregated statistics
        aggregated = result.aggregate_evaluation_scores()
        print(f"Mean equals score: {aggregated['equals_metric'].mean}")
        ```
    """
    # Fold any scorer functions into the metric list so the engine receives
    # one homogeneous collection of metrics.
    all_metrics = _wrap_scoring_functions(
        scoring_functions=scoring_functions,
        scoring_metrics=scoring_metrics,
        project_name=project_name,
    )

    # Nothing to score -> nothing to run; return an empty result object.
    if not all_metrics:
        LOGGER.warning("No scoring metrics provided for items evaluation")
        return evaluation_result.EvaluationResultOnDictItems(test_results=[])

    opik_ = opik_client.get_client_cached()

    # NOTE(review): the context manager's name suggests async HTTP connections
    # are closed eagerly for the duration of the evaluation — confirm intent.
    with asyncio_support.async_http_connections_expire_immediately():
        scoring_engine = engine.EvaluationEngine(
            client=opik_,
            project_name=project_name,
            scoring_metrics=all_metrics,
            workers=scoring_threads,
            verbose=verbose,
            scoring_key_mapping=scoring_key_mapping,
        )
        results = scoring_engine.evaluate_llm_task_on_dict_items(
            items=items,
            task=task,
        )

    return evaluation_result.EvaluationResultOnDictItems(test_results=results)
927
+
928
+
929
+ def _wrap_scoring_functions(
930
+ scoring_functions: Optional[List[scorer_function.ScorerFunction]],
931
+ scoring_metrics: Optional[List[base_metric.BaseMetric]],
932
+ project_name: Optional[str],
933
+ ) -> List[base_metric.BaseMetric]:
934
+ if scoring_functions:
935
+ function_metrics = scorer_wrapper_metric.wrap_scorer_functions(
936
+ scoring_functions, project_name=project_name
937
+ )
938
+ if scoring_metrics:
939
+ scoring_metrics.extend(function_metrics)
940
+ else:
941
+ scoring_metrics = function_metrics
942
+
943
+ return scoring_metrics if scoring_metrics else []
944
+
945
+
946
+ def _use_or_create_experiment_name(
947
+ experiment_name: Optional[str], experiment_name_prefix: Optional[str]
948
+ ) -> Optional[str]:
949
+ if experiment_name:
950
+ return experiment_name
951
+
952
+ if experiment_name_prefix:
953
+ return experiment_helpers.generate_unique_experiment_name(
954
+ experiment_name_prefix
955
+ )
956
+ else:
957
+ return None