opik 1.9.5__py3-none-any.whl → 1.9.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248)
  1. opik/__init__.py +10 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/dataset/rest_operations.py +5 -0
  9. opik/api_objects/experiment/experiment.py +46 -49
  10. opik/api_objects/experiment/helpers.py +34 -10
  11. opik/api_objects/local_recording.py +8 -3
  12. opik/api_objects/opik_client.py +230 -48
  13. opik/api_objects/opik_query_language.py +9 -0
  14. opik/api_objects/prompt/__init__.py +11 -3
  15. opik/api_objects/prompt/base_prompt.py +69 -0
  16. opik/api_objects/prompt/base_prompt_template.py +29 -0
  17. opik/api_objects/prompt/chat/__init__.py +1 -0
  18. opik/api_objects/prompt/chat/chat_prompt.py +193 -0
  19. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  20. opik/api_objects/prompt/{chat_content_renderer_registry.py → chat/content_renderer_registry.py} +37 -35
  21. opik/api_objects/prompt/client.py +101 -30
  22. opik/api_objects/prompt/text/__init__.py +1 -0
  23. opik/api_objects/prompt/text/prompt.py +174 -0
  24. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  25. opik/api_objects/prompt/types.py +1 -1
  26. opik/cli/export.py +6 -2
  27. opik/cli/usage_report/charts.py +39 -10
  28. opik/cli/usage_report/cli.py +164 -45
  29. opik/cli/usage_report/pdf.py +14 -1
  30. opik/config.py +0 -5
  31. opik/decorator/base_track_decorator.py +37 -40
  32. opik/decorator/context_manager/span_context_manager.py +9 -0
  33. opik/decorator/context_manager/trace_context_manager.py +5 -0
  34. opik/dict_utils.py +3 -3
  35. opik/evaluation/__init__.py +13 -2
  36. opik/evaluation/engine/engine.py +195 -223
  37. opik/evaluation/engine/helpers.py +8 -7
  38. opik/evaluation/engine/metrics_evaluator.py +237 -0
  39. opik/evaluation/evaluation_result.py +35 -1
  40. opik/evaluation/evaluator.py +318 -30
  41. opik/evaluation/models/litellm/util.py +78 -6
  42. opik/evaluation/models/model_capabilities.py +33 -0
  43. opik/evaluation/report.py +14 -2
  44. opik/evaluation/rest_operations.py +36 -33
  45. opik/evaluation/test_case.py +2 -2
  46. opik/evaluation/types.py +9 -1
  47. opik/exceptions.py +17 -0
  48. opik/hooks/__init__.py +17 -1
  49. opik/hooks/anonymizer_hook.py +36 -0
  50. opik/id_helpers.py +18 -0
  51. opik/integrations/adk/helpers.py +16 -7
  52. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  53. opik/integrations/adk/opik_tracer.py +3 -1
  54. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  55. opik/integrations/adk/recursive_callback_injector.py +1 -6
  56. opik/integrations/dspy/callback.py +1 -4
  57. opik/integrations/haystack/opik_connector.py +2 -2
  58. opik/integrations/haystack/opik_tracer.py +2 -4
  59. opik/integrations/langchain/opik_tracer.py +273 -82
  60. opik/integrations/llama_index/callback.py +110 -108
  61. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  62. opik/integrations/openai/opik_tracker.py +1 -1
  63. opik/message_processing/batching/batchers.py +11 -7
  64. opik/message_processing/encoder_helpers.py +79 -0
  65. opik/message_processing/messages.py +25 -1
  66. opik/message_processing/online_message_processor.py +23 -8
  67. opik/opik_context.py +7 -7
  68. opik/rest_api/__init__.py +188 -12
  69. opik/rest_api/client.py +3 -0
  70. opik/rest_api/dashboards/__init__.py +4 -0
  71. opik/rest_api/dashboards/client.py +462 -0
  72. opik/rest_api/dashboards/raw_client.py +648 -0
  73. opik/rest_api/datasets/client.py +893 -89
  74. opik/rest_api/datasets/raw_client.py +1328 -87
  75. opik/rest_api/experiments/client.py +30 -2
  76. opik/rest_api/experiments/raw_client.py +26 -0
  77. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  78. opik/rest_api/optimizations/client.py +302 -0
  79. opik/rest_api/optimizations/raw_client.py +463 -0
  80. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  81. opik/rest_api/prompts/__init__.py +2 -2
  82. opik/rest_api/prompts/client.py +34 -4
  83. opik/rest_api/prompts/raw_client.py +32 -2
  84. opik/rest_api/prompts/types/__init__.py +3 -1
  85. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  86. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  87. opik/rest_api/spans/__init__.py +0 -2
  88. opik/rest_api/spans/client.py +148 -64
  89. opik/rest_api/spans/raw_client.py +210 -83
  90. opik/rest_api/spans/types/__init__.py +0 -2
  91. opik/rest_api/traces/client.py +241 -73
  92. opik/rest_api/traces/raw_client.py +344 -90
  93. opik/rest_api/types/__init__.py +200 -15
  94. opik/rest_api/types/aggregation_data.py +1 -0
  95. opik/rest_api/types/alert_trigger_config_public_type.py +6 -1
  96. opik/rest_api/types/alert_trigger_config_type.py +6 -1
  97. opik/rest_api/types/alert_trigger_config_write_type.py +6 -1
  98. opik/rest_api/types/automation_rule_evaluator.py +23 -1
  99. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  100. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  101. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  102. opik/rest_api/types/{automation_rule_evaluator_object_public.py → automation_rule_evaluator_object_object_public.py} +32 -10
  103. opik/rest_api/types/automation_rule_evaluator_page_public.py +2 -2
  104. opik/rest_api/types/automation_rule_evaluator_public.py +23 -1
  105. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  106. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  107. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  108. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  109. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  110. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  111. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  112. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  113. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  114. opik/rest_api/types/automation_rule_evaluator_update.py +23 -1
  115. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  116. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  117. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  118. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  119. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  120. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  121. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  122. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  123. opik/rest_api/types/automation_rule_evaluator_write.py +23 -1
  124. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  125. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  126. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  127. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  128. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  129. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  130. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  131. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  132. opik/rest_api/types/dashboard_page_public.py +24 -0
  133. opik/rest_api/types/dashboard_public.py +30 -0
  134. opik/rest_api/types/dataset.py +2 -0
  135. opik/rest_api/types/dataset_item.py +2 -0
  136. opik/rest_api/types/dataset_item_compare.py +2 -0
  137. opik/rest_api/types/dataset_item_filter.py +23 -0
  138. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  139. opik/rest_api/types/dataset_item_page_compare.py +1 -0
  140. opik/rest_api/types/dataset_item_page_public.py +1 -0
  141. opik/rest_api/types/dataset_item_public.py +2 -0
  142. opik/rest_api/types/dataset_item_update.py +39 -0
  143. opik/rest_api/types/dataset_item_write.py +1 -0
  144. opik/rest_api/types/dataset_public.py +2 -0
  145. opik/rest_api/types/dataset_public_status.py +5 -0
  146. opik/rest_api/types/dataset_status.py +5 -0
  147. opik/rest_api/types/dataset_version_diff.py +22 -0
  148. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  149. opik/rest_api/types/dataset_version_page_public.py +23 -0
  150. opik/rest_api/types/dataset_version_public.py +49 -0
  151. opik/rest_api/types/experiment.py +2 -0
  152. opik/rest_api/types/experiment_public.py +2 -0
  153. opik/rest_api/types/experiment_score.py +20 -0
  154. opik/rest_api/types/experiment_score_public.py +20 -0
  155. opik/rest_api/types/experiment_score_write.py +20 -0
  156. opik/rest_api/types/feedback.py +20 -1
  157. opik/rest_api/types/feedback_create.py +16 -1
  158. opik/rest_api/types/feedback_object_public.py +22 -1
  159. opik/rest_api/types/feedback_public.py +20 -1
  160. opik/rest_api/types/feedback_score_public.py +4 -0
  161. opik/rest_api/types/feedback_update.py +16 -1
  162. opik/rest_api/types/image_url.py +20 -0
  163. opik/rest_api/types/image_url_public.py +20 -0
  164. opik/rest_api/types/image_url_write.py +20 -0
  165. opik/rest_api/types/llm_as_judge_message.py +5 -1
  166. opik/rest_api/types/llm_as_judge_message_content.py +24 -0
  167. opik/rest_api/types/llm_as_judge_message_content_public.py +24 -0
  168. opik/rest_api/types/llm_as_judge_message_content_write.py +24 -0
  169. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  170. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  171. opik/rest_api/types/llm_as_judge_model_parameters.py +2 -0
  172. opik/rest_api/types/llm_as_judge_model_parameters_public.py +2 -0
  173. opik/rest_api/types/llm_as_judge_model_parameters_write.py +2 -0
  174. opik/rest_api/types/optimization.py +2 -0
  175. opik/rest_api/types/optimization_public.py +2 -0
  176. opik/rest_api/types/optimization_public_status.py +3 -1
  177. opik/rest_api/types/optimization_status.py +3 -1
  178. opik/rest_api/types/optimization_studio_config.py +27 -0
  179. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  180. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  181. opik/rest_api/types/optimization_studio_log.py +22 -0
  182. opik/rest_api/types/optimization_write.py +2 -0
  183. opik/rest_api/types/optimization_write_status.py +3 -1
  184. opik/rest_api/types/prompt.py +6 -0
  185. opik/rest_api/types/prompt_detail.py +6 -0
  186. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  187. opik/rest_api/types/prompt_public.py +6 -0
  188. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  189. opik/rest_api/types/prompt_template_structure.py +5 -0
  190. opik/rest_api/types/prompt_version.py +2 -0
  191. opik/rest_api/types/prompt_version_detail.py +2 -0
  192. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  193. opik/rest_api/types/prompt_version_public.py +2 -0
  194. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  195. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  196. opik/rest_api/types/score_name.py +1 -0
  197. opik/rest_api/types/service_toggles_config.py +6 -0
  198. opik/rest_api/types/span_enrichment_options.py +31 -0
  199. opik/rest_api/types/span_filter.py +23 -0
  200. opik/rest_api/types/span_filter_operator.py +21 -0
  201. opik/rest_api/types/span_filter_write.py +23 -0
  202. opik/rest_api/types/span_filter_write_operator.py +21 -0
  203. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  204. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  205. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  206. opik/rest_api/types/span_update.py +46 -0
  207. opik/rest_api/types/studio_evaluation.py +20 -0
  208. opik/rest_api/types/studio_evaluation_public.py +20 -0
  209. opik/rest_api/types/studio_evaluation_write.py +20 -0
  210. opik/rest_api/types/studio_llm_model.py +21 -0
  211. opik/rest_api/types/studio_llm_model_public.py +21 -0
  212. opik/rest_api/types/studio_llm_model_write.py +21 -0
  213. opik/rest_api/types/studio_message.py +20 -0
  214. opik/rest_api/types/studio_message_public.py +20 -0
  215. opik/rest_api/types/studio_message_write.py +20 -0
  216. opik/rest_api/types/studio_metric.py +21 -0
  217. opik/rest_api/types/studio_metric_public.py +21 -0
  218. opik/rest_api/types/studio_metric_write.py +21 -0
  219. opik/rest_api/types/studio_optimizer.py +21 -0
  220. opik/rest_api/types/studio_optimizer_public.py +21 -0
  221. opik/rest_api/types/studio_optimizer_write.py +21 -0
  222. opik/rest_api/types/studio_prompt.py +20 -0
  223. opik/rest_api/types/studio_prompt_public.py +20 -0
  224. opik/rest_api/types/studio_prompt_write.py +20 -0
  225. opik/rest_api/types/trace.py +6 -0
  226. opik/rest_api/types/trace_public.py +6 -0
  227. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  228. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  229. opik/rest_api/types/trace_thread_update.py +19 -0
  230. opik/rest_api/types/trace_update.py +39 -0
  231. opik/rest_api/types/value_entry.py +2 -0
  232. opik/rest_api/types/value_entry_compare.py +2 -0
  233. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  234. opik/rest_api/types/value_entry_public.py +2 -0
  235. opik/rest_api/types/video_url.py +19 -0
  236. opik/rest_api/types/video_url_public.py +19 -0
  237. opik/rest_api/types/video_url_write.py +19 -0
  238. opik/synchronization.py +5 -6
  239. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  240. {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/METADATA +5 -4
  241. {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/RECORD +246 -151
  242. opik/api_objects/prompt/chat_prompt_template.py +0 -164
  243. opik/api_objects/prompt/prompt.py +0 -131
  244. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  245. {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/WHEEL +0 -0
  246. {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/entry_points.txt +0 -0
  247. {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/licenses/LICENSE +0 -0
  248. {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/top_level.txt +0 -0
opik/evaluation/evaluator.py

@@ -2,10 +2,12 @@ import logging
 import time
 from typing import Any, Callable, Dict, List, Optional, Union, cast

-from .. import Prompt
+from ..api_objects.prompt import base_prompt
 from ..api_objects import opik_client
 from ..api_objects import dataset, experiment
 from ..api_objects.experiment import helpers as experiment_helpers
+from ..api_objects.prompt.chat import chat_prompt_template
+from ..api_objects.prompt import types as prompt_types
 from . import (
     asyncio_support,
     engine,
@@ -14,13 +16,12 @@ from . import (
     rest_operations,
     samplers,
 )
-from .metrics import base_metric
+from .metrics import base_metric, score_result
 from .models import ModelCapabilities, base_model, models_factory
 from .scorers import scorer_function, scorer_wrapper_metric
-from .types import LLMTask, ScoringKeyMappingType
+from . import test_result
+from .types import ExperimentScoreFunction, LLMTask, ScoringKeyMappingType
 from .. import url_helpers
-from opik.api_objects.prompt.chat_prompt_template import ChatPromptTemplate
-from opik.api_objects.prompt.types import SupportedModalities

 LOGGER = logging.getLogger(__name__)
 MODALITY_SUPPORT_DOC_URL = (
@@ -41,23 +42,52 @@ def _try_notifying_about_experiment_completion(
     )


+def _compute_experiment_scores(
+    experiment_scoring_functions: List[ExperimentScoreFunction],
+    test_results: List[test_result.TestResult],
+) -> List[score_result.ScoreResult]:
+    """Compute experiment-level scores from test results."""
+    if not experiment_scoring_functions or not test_results:
+        return []
+
+    all_scores: List[score_result.ScoreResult] = []
+    for score_function in experiment_scoring_functions:
+        try:
+            scores = score_function(test_results)
+            # Handle Union[ScoreResult, List[ScoreResult]]
+            if isinstance(scores, list):
+                all_scores.extend(scores)
+            else:
+                all_scores.append(scores)
+        except Exception as e:
+            LOGGER.warning(
+                "Failed to compute experiment score: %s",
+                e,
+                exc_info=True,
+            )
+
+    return all_scores
+
+
 def evaluate(
     dataset: dataset.Dataset,
     task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[Prompt] = None,
-    prompts: Optional[List[Prompt]] = None,
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
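The new `experiment_scoring_functions` parameter accepts `ExperimentScoreFunction` callables: each receives the full list of `TestResult` objects and returns either a single `ScoreResult` or a list of them, which `_compute_experiment_scores` above flattens and logs. A minimal sketch of such a function, assuming `TestResult` exposes `score_results` with numeric `value` fields (the pass-rate aggregation and its threshold are illustrative, not part of this release):

```python
from typing import List

from opik.evaluation.metrics.score_result import ScoreResult
from opik.evaluation.test_result import TestResult


def pass_rate(test_results: List[TestResult]) -> ScoreResult:
    # Fraction of items whose per-item scores are all >= 0.5 (illustrative threshold).
    passed = sum(
        1
        for result in test_results
        if result.score_results
        and all(score.value >= 0.5 for score in result.score_results)
    )
    return ScoreResult(name="pass_rate", value=passed / max(len(test_results), 1))


# Passed via the new keyword argument; the scores are computed once after all test
# results are collected and logged with experiment.log_experiment_scores():
# evaluate(dataset=..., task=..., scoring_metrics=[...], experiment_scoring_functions=[pass_rate])
```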
@@ -70,6 +100,10 @@ def evaluate(
         task: A callable object that takes dict with dataset item content
             as input and returns dict which will later be used for scoring.

+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: The name of the experiment associated with evaluation run.
             If None, a generated name will be used.

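A call sketch for the new `experiment_name_prefix` argument (the dataset, task, and key mapping below are stand-ins; an Opik workspace is assumed to be configured): when no explicit `experiment_name` is passed, each run gets a grouped name of the form `my-experiment-<unique-random-part>`.

```python
import opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Equals

client = opik.Opik()
dataset = client.get_or_create_dataset(name="demo-dataset")  # stand-in dataset


def echo_task(item: dict) -> dict:
    # Trivial stand-in for an LLM call.
    return {"output": item.get("expected_output", "")}


evaluate(
    dataset=dataset,
    task=echo_task,
    scoring_metrics=[Equals()],
    scoring_key_mapping={"reference": "expected_output"},  # Equals compares output vs. reference
    experiment_name_prefix="my-experiment",  # -> "my-experiment-<unique-random-part>"
)
```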
@@ -117,7 +151,16 @@ def evaluate(
             If not provided, all samples in the dataset will be evaluated.

         trial_count: number of times to run the task and evaluate the task output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
+
     checked_prompts = experiment_helpers.handle_prompt_args(
         prompt=prompt,
         prompts=prompts,
@@ -125,6 +168,11 @@ def evaluate(

     client = opik_client.get_client_cached()

+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -153,6 +201,7 @@ def evaluate(
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
     )


@@ -171,6 +220,7 @@ def _evaluate_task(
     dataset_item_ids: Optional[List[str]],
     dataset_sampler: Optional[samplers.BaseDatasetSampler],
     trial_count: int,
+    experiment_scoring_functions: List[ExperimentScoreFunction],
 ) -> evaluation_result.EvaluationResult:
     start_time = time.time()

@@ -178,25 +228,33 @@ def _evaluate_task(
         evaluation_engine = engine.EvaluationEngine(
             client=client,
             project_name=project_name,
-            experiment_=experiment,
             scoring_metrics=scoring_metrics,
             workers=task_threads,
             verbose=verbose,
             scoring_key_mapping=scoring_key_mapping,
         )
-        test_results = evaluation_engine.evaluate_llm_tasks(
+        test_results = evaluation_engine.evaluate_llm_task_on_dataset(
             dataset_=dataset,
             task=task,
             nb_samples=nb_samples,
             dataset_item_ids=dataset_item_ids,
             dataset_sampler=dataset_sampler,
             trial_count=trial_count,
+            experiment_=experiment,
         )

     total_time = time.time() - start_time

+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
     if verbose >= 1:
-        report.display_experiment_results(dataset.name, total_time, test_results)
+        report.display_experiment_results(
+            dataset.name, total_time, test_results, computed_experiment_scores
+        )

     experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
@@ -210,6 +268,10 @@ def _evaluate_task(

     _try_notifying_about_experiment_completion(experiment)

+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
         dataset_id=dataset.id,
         experiment_id=experiment.id,
@@ -217,6 +279,7 @@ def _evaluate_task(
         test_results=test_results,
         experiment_url=experiment_url,
         trial_count=trial_count,
+        experiment_scores=computed_experiment_scores,
     )

     if verbose >= 2:
@@ -236,6 +299,7 @@ def evaluate_experiment(
     verbose: int = 1,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     experiment_id: Optional[str] = None,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """Update the existing experiment with new evaluation metrics. You can use either `scoring_metrics` or `scorer_functions` to calculate
     evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
@@ -267,7 +331,15 @@ def evaluate_experiment(
             `{"input": "user_question"}` to map the "user_question" key to "input".

         experiment_id: The ID of the experiment to evaluate. If not provided, the experiment will be evaluated based on the experiment name.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
     start_time = time.time()

     client = opik_client.get_client_cached()
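`evaluate_experiment` gains the same hook, so an existing experiment can be re-scored and receive experiment-level aggregates. A hedged sketch, reusing the `pass_rate` function from the earlier example (the experiment name is hypothetical):

```python
from opik.evaluation import evaluate_experiment
from opik.evaluation.metrics import Equals

result = evaluate_experiment(
    experiment_name="my-experiment-abc123",  # hypothetical existing experiment
    scoring_metrics=[Equals()],
    scoring_key_mapping={"reference": "expected_output"},
    experiment_scoring_functions=[pass_rate],  # from the earlier sketch
)
print(result.experiment_scores)  # experiment-level scores added in this release
```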
@@ -280,10 +352,11 @@ def evaluate_experiment(
             client=client, experiment_name=experiment_name
         )

+    dataset_ = client.get_dataset(name=experiment.dataset_name)
+
     test_cases = rest_operations.get_experiment_test_cases(
-        client=client,
-        experiment_id=experiment.id,
-        dataset_id=experiment.dataset_id,
+        experiment_=experiment,
+        dataset_=dataset_,
         scoring_key_mapping=scoring_key_mapping,
     )
     first_trace_id = test_cases[0].trace_id
@@ -302,7 +375,6 @@ def evaluate_experiment(
         evaluation_engine = engine.EvaluationEngine(
             client=client,
             project_name=project_name,
-            experiment_=experiment,
             scoring_metrics=scoring_metrics,
             workers=scoring_threads,
             verbose=verbose,
@@ -314,14 +386,23 @@ def evaluate_experiment(

     total_time = time.time() - start_time

+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
     if verbose >= 1:
         report.display_experiment_results(
-            experiment.dataset_name, total_time, test_results
+            dataset_.name,
+            total_time,
+            test_results,
+            computed_experiment_scores,
         )

     experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
-        dataset_id=experiment.dataset_id,
+        dataset_id=dataset_.id,
         url_override=client.config.url_override,
     )

@@ -329,18 +410,23 @@ def evaluate_experiment(

     _try_notifying_about_experiment_completion(experiment)

+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
-        dataset_id=experiment.dataset_id,
+        dataset_id=dataset_.id,
         experiment_id=experiment.id,
         experiment_name=experiment.name,
         test_results=test_results,
         experiment_url=experiment_url,
         trial_count=1,
+        experiment_scores=computed_experiment_scores,
     )

     if verbose >= 2:
         report.display_evaluation_scores_statistics(
-            dataset_name=experiment.dataset_name,
+            dataset_name=dataset_.name,
             evaluation_results=evaluation_result_,
         )

@@ -351,16 +437,22 @@ def _build_prompt_evaluation_task(
     model: base_model.OpikBaseModel, messages: List[Dict[str, Any]]
 ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
     supported_modalities = cast(
-        SupportedModalities,
+        prompt_types.SupportedModalities,
         {
             "vision": ModelCapabilities.supports_vision(
                 getattr(model, "model_name", None)
-            )
+            ),
+            "video": ModelCapabilities.supports_video(
+                getattr(model, "model_name", None)
+            ),
         },
     )
-    chat_prompt_template = ChatPromptTemplate(messages=messages)
+    # Disable placeholder validation since we pass all dataset item fields to format()
+    chat_prompt_template_ = chat_prompt_template.ChatPromptTemplate(
+        messages=messages, validate_placeholders=False
+    )

-    required_modalities = chat_prompt_template.required_modalities()
+    required_modalities = chat_prompt_template_.required_modalities()
     unsupported_modalities = {
         modality
         for modality in required_modalities
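The capability gate here now covers video as well as vision. A small sketch of the same check outside the evaluator, assuming `ModelCapabilities.supports_vision` / `supports_video` return booleans for a given model name (the model name and required set below are illustrative):

```python
from opik.evaluation.models import ModelCapabilities

model_name = "gpt-4o"  # illustrative model name
supported = {
    "vision": ModelCapabilities.supports_vision(model_name),
    "video": ModelCapabilities.supports_video(model_name),
}
# e.g. what chat_prompt_template_.required_modalities() might report for a template
required = {"vision", "video"}
unsupported = {m for m in required if not supported.get(m, False)}
if unsupported:
    print(f"{model_name} does not support: {', '.join(sorted(unsupported))}")
```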
@@ -379,7 +471,7 @@ def _build_prompt_evaluation_task(

     def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
         template_type_override = prompt_variables.get("type")
-        processed_messages = chat_prompt_template.format(
+        processed_messages = chat_prompt_template_.format(
             variables=prompt_variables,
             supported_modalities=supported_modalities,
             template_type=template_type_override,
@@ -402,16 +494,18 @@ def evaluate_prompt(
     model: Optional[Union[str, base_model.OpikBaseModel]] = None,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[Prompt] = None,
+    prompt: Optional[base_prompt.BasePrompt] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs prompt evaluation on a given dataset.
@@ -433,6 +527,10 @@ def evaluate_prompt(
             • task_outputs — a dictionary containing the LLM task output.
             • task_span - the data collected during the LLM task execution [optional].

+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: name of the experiment.

         project_name: The name of the project to log data
@@ -453,7 +551,15 @@ def evaluate_prompt(
             If not provided, all samples in the dataset will be evaluated.

         trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
     if isinstance(model, str):
         opik_model = models_factory.get(model_name=model)
     elif not isinstance(model, base_model.OpikBaseModel):
@@ -477,6 +583,11 @@ def evaluate_prompt(

     prompts = [prompt] if prompt else None

+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -497,25 +608,33 @@ def evaluate_prompt(
         evaluation_engine = engine.EvaluationEngine(
             client=client,
             project_name=project_name,
-            experiment_=experiment,
             scoring_metrics=scoring_metrics,
             workers=task_threads,
             verbose=verbose,
             scoring_key_mapping=None,
         )
-        test_results = evaluation_engine.evaluate_llm_tasks(
+        test_results = evaluation_engine.evaluate_llm_task_on_dataset(
             dataset_=dataset,
             task=_build_prompt_evaluation_task(model=opik_model, messages=messages),
             nb_samples=nb_samples,
             dataset_item_ids=dataset_item_ids,
             dataset_sampler=dataset_sampler,
             trial_count=trial_count,
+            experiment_=experiment,
         )

     total_time = time.time() - start_time

+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
     if verbose >= 1:
-        report.display_experiment_results(dataset.name, total_time, test_results)
+        report.display_experiment_results(
+            dataset.name, total_time, test_results, computed_experiment_scores
+        )

     experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
@@ -529,6 +648,10 @@ def evaluate_prompt(

     _try_notifying_about_experiment_completion(experiment)

+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
         experiment_id=experiment.id,
         dataset_id=dataset.id,
@@ -536,6 +659,7 @@ def evaluate_prompt(
         test_results=test_results,
         experiment_url=experiment_url,
         trial_count=trial_count,
+        experiment_scores=computed_experiment_scores,
     )

     if verbose >= 2:
@@ -552,18 +676,21 @@ def evaluate_optimization_trial(
     dataset: dataset.Dataset,
     task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[Prompt] = None,
-    prompts: Optional[List[Prompt]] = None,
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset.
@@ -576,6 +703,17 @@ def evaluate_optimization_trial(
         task: A callable object that takes dict with dataset item content
             as input and returns dict which will later be used for scoring.

+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function includes a scoring method that accepts predefined
+            arguments supplied by the evaluation engine:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+            • task_span - the data collected during the LLM task execution [optional].
+
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: The name of the experiment associated with evaluation run.
             If None, a generated name will be used.

@@ -615,7 +753,16 @@ def evaluate_optimization_trial(
             If not provided, all samples in the dataset will be evaluated.

         trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
+
     if scoring_metrics is None:
         scoring_metrics = []

@@ -624,8 +771,20 @@ def evaluate_optimization_trial(
         prompts=prompts,
     )

+    # wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
     client = opik_client.get_client_cached()

+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -649,13 +808,128 @@ def evaluate_optimization_trial(
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
     )


-def _wrap_scoring_functions(
+def evaluate_on_dict_items(
+    items: List[Dict[str, Any]],
+    task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
     project_name: Optional[str] = None,
+    verbose: int = 0,
+    scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
+    scoring_threads: int = 16,
+) -> evaluation_result.EvaluationResultOnDictItems:
+    """
+    Lightweight evaluation function that evaluates a task on dataset items (as dictionaries)
+    without requiring a Dataset object or creating an experiment.
+
+    This function is useful for optimization scenarios where you need to evaluate many
+    candidate solutions quickly using Opik's metric infrastructure. It creates traces for
+    tracking but doesn't require experiment setup or dataset management.
+
+    Args:
+        items: List of dataset item contents (dictionaries with the data to evaluate).
+
+        task: A callable object that takes dict with dataset item content
+            as input and returns dict which will later be used for scoring.
+
+        scoring_metrics: List of metrics to calculate during evaluation.
+            Each metric's `score(...)` method will be called with arguments taken from
+            the dataset item and task output.
+
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function accepts predefined arguments:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+
+        project_name: The name of the project for logging traces.
+
+        verbose: Controls evaluation output logs and progress bars.
+            0 - no outputs (default), 1 - enable outputs.
+
+        scoring_key_mapping: A dictionary that allows you to rename keys present in either
+            the dataset item or the task output to match the keys expected by scoring metrics.
+
+        scoring_threads: Number of thread workers to run scoring metrics.
+
+    Returns:
+        EvaluationResultOnDictItems object containing test results and providing methods
+        to aggregate scores, similar to the regular evaluation result.
+
+    Example:
+        ```python
+        import opik
+        from opik.evaluation.metrics import Equals
+
+        items = [
+            {"input": "What is 2+2?", "expected_output": "4"},
+            {"input": "What is 3+3?", "expected_output": "6"},
+        ]
+
+        def my_task(item):
+            # Your LLM call here
+            question = item["input"]
+            # ... call model ...
+            return {"output": model_output}
+
+        result = opik.evaluate_on_dict_items(
+            items=items,
+            task=my_task,
+            scoring_metrics=[Equals()],
+            scoring_key_mapping={"reference": "expected_output"},
+        )
+
+        # Access individual test results
+        for test_result in result.test_results:
+            print(f"Score: {test_result.score_results[0].value}")
+
+        # Get aggregated statistics
+        aggregated = result.aggregate_evaluation_scores()
+        print(f"Mean equals score: {aggregated['equals_metric'].mean}")
+        ```
+    """
+    # Wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
+    if not scoring_metrics:
+        LOGGER.warning("No scoring metrics provided for items evaluation")
+        return evaluation_result.EvaluationResultOnDictItems(test_results=[])
+
+    client = opik_client.get_client_cached()
+
+    # Create evaluation engine
+    with asyncio_support.async_http_connections_expire_immediately():
+        evaluation_engine = engine.EvaluationEngine(
+            client=client,
+            project_name=project_name,
+            scoring_metrics=scoring_metrics,
+            workers=scoring_threads,
+            verbose=verbose,
+            scoring_key_mapping=scoring_key_mapping,
+        )
+
+        # Use the new evaluate_items method
+        test_results = evaluation_engine.evaluate_llm_task_on_dict_items(
+            items=items,
+            task=task,
+        )
+
+    return evaluation_result.EvaluationResultOnDictItems(
+        test_results=test_results,
+    )
+
+
+def _wrap_scoring_functions(
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]],
+    scoring_metrics: Optional[List[base_metric.BaseMetric]],
+    project_name: Optional[str],
 ) -> List[base_metric.BaseMetric]:
     if scoring_functions:
         function_metrics = scorer_wrapper_metric.wrap_scorer_functions(
@@ -667,3 +941,17 @@ def _wrap_scoring_functions(
             scoring_metrics = function_metrics

     return scoring_metrics if scoring_metrics else []
+
+
+def _use_or_create_experiment_name(
+    experiment_name: Optional[str], experiment_name_prefix: Optional[str]
+) -> Optional[str]:
+    if experiment_name:
+        return experiment_name
+
+    if experiment_name_prefix:
+        return experiment_helpers.generate_unique_experiment_name(
+            experiment_name_prefix
+        )
+    else:
+        return None
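For reference, the precedence implemented by `_use_or_create_experiment_name` just above (a private helper, shown only to illustrate the behavior):

```python
# An explicit experiment_name always wins; the prefix applies only when no name
# is given; with neither, None is returned and a name is generated downstream.
_use_or_create_experiment_name("exact-name", "ignored-prefix")  # -> "exact-name"
_use_or_create_experiment_name(None, "my-experiment")           # -> "my-experiment-<unique-random-part>"
_use_or_create_experiment_name(None, None)                      # -> None
```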