opik 1.9.26__py3-none-any.whl → 1.9.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. opik/__init__.py +10 -3
  2. opik/api_objects/dataset/rest_operations.py +2 -0
  3. opik/api_objects/experiment/experiment.py +31 -5
  4. opik/api_objects/experiment/helpers.py +34 -10
  5. opik/api_objects/local_recording.py +8 -3
  6. opik/api_objects/opik_client.py +218 -46
  7. opik/api_objects/opik_query_language.py +9 -0
  8. opik/api_objects/prompt/__init__.py +11 -3
  9. opik/api_objects/prompt/base_prompt.py +69 -0
  10. opik/api_objects/prompt/base_prompt_template.py +29 -0
  11. opik/api_objects/prompt/chat/__init__.py +1 -0
  12. opik/api_objects/prompt/chat/chat_prompt.py +193 -0
  13. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  14. opik/api_objects/prompt/{chat_content_renderer_registry.py → chat/content_renderer_registry.py} +31 -34
  15. opik/api_objects/prompt/client.py +101 -30
  16. opik/api_objects/prompt/text/__init__.py +1 -0
  17. opik/api_objects/prompt/{prompt.py → text/prompt.py} +55 -32
  18. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +8 -5
  19. opik/cli/export.py +6 -2
  20. opik/config.py +0 -5
  21. opik/decorator/base_track_decorator.py +37 -40
  22. opik/evaluation/__init__.py +13 -2
  23. opik/evaluation/engine/engine.py +195 -223
  24. opik/evaluation/engine/helpers.py +8 -7
  25. opik/evaluation/engine/metrics_evaluator.py +237 -0
  26. opik/evaluation/evaluation_result.py +35 -1
  27. opik/evaluation/evaluator.py +309 -23
  28. opik/evaluation/models/litellm/util.py +78 -6
  29. opik/evaluation/report.py +14 -2
  30. opik/evaluation/rest_operations.py +6 -9
  31. opik/evaluation/test_case.py +2 -2
  32. opik/evaluation/types.py +9 -1
  33. opik/exceptions.py +17 -0
  34. opik/id_helpers.py +18 -0
  35. opik/integrations/adk/helpers.py +16 -7
  36. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  37. opik/integrations/adk/opik_tracer.py +3 -1
  38. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  39. opik/integrations/dspy/callback.py +1 -4
  40. opik/integrations/haystack/opik_connector.py +2 -2
  41. opik/integrations/haystack/opik_tracer.py +2 -4
  42. opik/integrations/langchain/opik_tracer.py +1 -4
  43. opik/integrations/llama_index/callback.py +2 -4
  44. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  45. opik/integrations/openai/opik_tracker.py +1 -1
  46. opik/opik_context.py +7 -7
  47. opik/rest_api/__init__.py +127 -11
  48. opik/rest_api/dashboards/client.py +65 -2
  49. opik/rest_api/dashboards/raw_client.py +82 -0
  50. opik/rest_api/datasets/client.py +538 -2
  51. opik/rest_api/datasets/raw_client.py +1347 -441
  52. opik/rest_api/experiments/client.py +30 -2
  53. opik/rest_api/experiments/raw_client.py +26 -0
  54. opik/rest_api/optimizations/client.py +302 -0
  55. opik/rest_api/optimizations/raw_client.py +463 -0
  56. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  57. opik/rest_api/prompts/__init__.py +2 -2
  58. opik/rest_api/prompts/client.py +34 -4
  59. opik/rest_api/prompts/raw_client.py +32 -2
  60. opik/rest_api/prompts/types/__init__.py +3 -1
  61. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  62. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  63. opik/rest_api/traces/client.py +6 -6
  64. opik/rest_api/traces/raw_client.py +4 -4
  65. opik/rest_api/types/__init__.py +125 -11
  66. opik/rest_api/types/aggregation_data.py +1 -0
  67. opik/rest_api/types/automation_rule_evaluator.py +23 -1
  68. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  69. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  70. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  71. opik/rest_api/types/{automation_rule_evaluator_object_public.py → automation_rule_evaluator_object_object_public.py} +32 -10
  72. opik/rest_api/types/automation_rule_evaluator_page_public.py +2 -2
  73. opik/rest_api/types/automation_rule_evaluator_public.py +23 -1
  74. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  75. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  76. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  77. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  78. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  79. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  80. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  81. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  82. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  83. opik/rest_api/types/automation_rule_evaluator_update.py +23 -1
  84. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  85. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  86. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  87. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  88. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  89. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  90. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  91. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  92. opik/rest_api/types/automation_rule_evaluator_write.py +23 -1
  93. opik/rest_api/types/dashboard_page_public.py +1 -0
  94. opik/rest_api/types/dataset.py +4 -0
  95. opik/rest_api/types/dataset_item.py +1 -0
  96. opik/rest_api/types/dataset_item_compare.py +1 -0
  97. opik/rest_api/types/dataset_item_page_compare.py +1 -0
  98. opik/rest_api/types/dataset_item_page_public.py +1 -0
  99. opik/rest_api/types/dataset_item_public.py +1 -0
  100. opik/rest_api/types/dataset_public.py +4 -0
  101. opik/rest_api/types/dataset_public_status.py +5 -0
  102. opik/rest_api/types/dataset_status.py +5 -0
  103. opik/rest_api/types/dataset_version_diff.py +22 -0
  104. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  105. opik/rest_api/types/dataset_version_page_public.py +23 -0
  106. opik/rest_api/types/dataset_version_public.py +54 -0
  107. opik/rest_api/types/dataset_version_summary.py +41 -0
  108. opik/rest_api/types/dataset_version_summary_public.py +41 -0
  109. opik/rest_api/types/experiment.py +2 -0
  110. opik/rest_api/types/experiment_public.py +2 -0
  111. opik/rest_api/types/experiment_score.py +20 -0
  112. opik/rest_api/types/experiment_score_public.py +20 -0
  113. opik/rest_api/types/experiment_score_write.py +20 -0
  114. opik/rest_api/types/feedback_score_public.py +4 -0
  115. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  116. opik/rest_api/types/optimization.py +2 -0
  117. opik/rest_api/types/optimization_public.py +2 -0
  118. opik/rest_api/types/optimization_public_status.py +3 -1
  119. opik/rest_api/types/optimization_status.py +3 -1
  120. opik/rest_api/types/optimization_studio_config.py +27 -0
  121. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  122. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  123. opik/rest_api/types/optimization_studio_log.py +22 -0
  124. opik/rest_api/types/optimization_write.py +2 -0
  125. opik/rest_api/types/optimization_write_status.py +3 -1
  126. opik/rest_api/types/prompt.py +6 -0
  127. opik/rest_api/types/prompt_detail.py +6 -0
  128. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  129. opik/rest_api/types/prompt_public.py +6 -0
  130. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  131. opik/rest_api/types/prompt_template_structure.py +5 -0
  132. opik/rest_api/types/prompt_version.py +2 -0
  133. opik/rest_api/types/prompt_version_detail.py +2 -0
  134. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  135. opik/rest_api/types/prompt_version_public.py +2 -0
  136. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  137. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  138. opik/rest_api/types/score_name.py +1 -0
  139. opik/rest_api/types/service_toggles_config.py +5 -0
  140. opik/rest_api/types/span_filter.py +23 -0
  141. opik/rest_api/types/span_filter_operator.py +21 -0
  142. opik/rest_api/types/span_filter_write.py +23 -0
  143. opik/rest_api/types/span_filter_write_operator.py +21 -0
  144. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  145. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  146. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  147. opik/rest_api/types/studio_evaluation.py +20 -0
  148. opik/rest_api/types/studio_evaluation_public.py +20 -0
  149. opik/rest_api/types/studio_evaluation_write.py +20 -0
  150. opik/rest_api/types/studio_llm_model.py +21 -0
  151. opik/rest_api/types/studio_llm_model_public.py +21 -0
  152. opik/rest_api/types/studio_llm_model_write.py +21 -0
  153. opik/rest_api/types/studio_message.py +20 -0
  154. opik/rest_api/types/studio_message_public.py +20 -0
  155. opik/rest_api/types/studio_message_write.py +20 -0
  156. opik/rest_api/types/studio_metric.py +21 -0
  157. opik/rest_api/types/studio_metric_public.py +21 -0
  158. opik/rest_api/types/studio_metric_write.py +21 -0
  159. opik/rest_api/types/studio_optimizer.py +21 -0
  160. opik/rest_api/types/studio_optimizer_public.py +21 -0
  161. opik/rest_api/types/studio_optimizer_write.py +21 -0
  162. opik/rest_api/types/studio_prompt.py +20 -0
  163. opik/rest_api/types/studio_prompt_public.py +20 -0
  164. opik/rest_api/types/studio_prompt_write.py +20 -0
  165. opik/rest_api/types/trace.py +6 -0
  166. opik/rest_api/types/trace_public.py +6 -0
  167. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  168. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  169. opik/rest_api/types/value_entry.py +2 -0
  170. opik/rest_api/types/value_entry_compare.py +2 -0
  171. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  172. opik/rest_api/types/value_entry_public.py +2 -0
  173. opik/synchronization.py +5 -6
  174. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  175. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/METADATA +4 -3
  176. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/RECORD +180 -120
  177. opik/api_objects/prompt/chat_prompt_template.py +0 -200
  178. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/WHEEL +0 -0
  179. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/entry_points.txt +0 -0
  180. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/licenses/LICENSE +0 -0
  181. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/top_level.txt +0 -0
opik/evaluation/evaluator.py
@@ -2,10 +2,12 @@ import logging
 import time
 from typing import Any, Callable, Dict, List, Optional, Union, cast
 
-from .. import Prompt
+from ..api_objects.prompt import base_prompt
 from ..api_objects import opik_client
 from ..api_objects import dataset, experiment
 from ..api_objects.experiment import helpers as experiment_helpers
+from ..api_objects.prompt.chat import chat_prompt_template
+from ..api_objects.prompt import types as prompt_types
 from . import (
     asyncio_support,
     engine,
@@ -14,13 +16,12 @@ from . import (
     rest_operations,
     samplers,
 )
-from .metrics import base_metric
+from .metrics import base_metric, score_result
 from .models import ModelCapabilities, base_model, models_factory
 from .scorers import scorer_function, scorer_wrapper_metric
-from .types import LLMTask, ScoringKeyMappingType
+from . import test_result
+from .types import ExperimentScoreFunction, LLMTask, ScoringKeyMappingType
 from .. import url_helpers
-from opik.api_objects.prompt.chat_prompt_template import ChatPromptTemplate
-from opik.api_objects.prompt.types import SupportedModalities
 
 LOGGER = logging.getLogger(__name__)
 MODALITY_SUPPORT_DOC_URL = (
@@ -41,23 +42,52 @@ def _try_notifying_about_experiment_completion(
     )
 
 
+def _compute_experiment_scores(
+    experiment_scoring_functions: List[ExperimentScoreFunction],
+    test_results: List[test_result.TestResult],
+) -> List[score_result.ScoreResult]:
+    """Compute experiment-level scores from test results."""
+    if not experiment_scoring_functions or not test_results:
+        return []
+
+    all_scores: List[score_result.ScoreResult] = []
+    for score_function in experiment_scoring_functions:
+        try:
+            scores = score_function(test_results)
+            # Handle Union[ScoreResult, List[ScoreResult]]
+            if isinstance(scores, list):
+                all_scores.extend(scores)
+            else:
+                all_scores.append(scores)
+        except Exception as e:
+            LOGGER.warning(
+                "Failed to compute experiment score: %s",
+                e,
+                exc_info=True,
+            )
+
+    return all_scores
+
+
 def evaluate(
     dataset: dataset.Dataset,
     task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[Prompt] = None,
-    prompts: Optional[List[Prompt]] = None,
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
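The helper above fixes the contract for the new `ExperimentScoreFunction` type (imported from `opik.evaluation.types`): each callable receives the full list of `TestResult` objects and returns a single `ScoreResult` or a list of them, and any exception it raises is logged rather than propagated. A minimal sketch of such a function follows; the `pass_rate` name and the 0.5 threshold are illustrative, not part of the package.

```python
from typing import List

from opik.evaluation import test_result
from opik.evaluation.metrics import score_result


def pass_rate(test_results: List[test_result.TestResult]) -> score_result.ScoreResult:
    # Experiment-level aggregate: fraction of items whose first metric scored above 0.5.
    passed = sum(
        1
        for result in test_results
        if result.score_results and result.score_results[0].value > 0.5
    )
    return score_result.ScoreResult(
        name="pass_rate", value=passed / max(len(test_results), 1)
    )
```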
@@ -70,6 +100,10 @@ def evaluate(
         task: A callable object that takes dict with dataset item content
             as input and returns dict which will later be used for scoring.
 
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: The name of the experiment associated with evaluation run.
             If None, a generated name will be used.
 
@@ -117,7 +151,16 @@ def evaluate(
             If not provided, all samples in the dataset will be evaluated.
 
         trial_count: number of times to run the task and evaluate the task output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
+
     checked_prompts = experiment_helpers.handle_prompt_args(
         prompt=prompt,
         prompts=prompts,
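Putting the two new `evaluate()` parameters together, a call might look like the sketch below. The dataset name, task body, and aggregate function are illustrative assumptions; only the parameter names and the `experiment_scores` field on the result come from this diff.

```python
from opik import Opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Equals, score_result

client = Opik()
dataset = client.get_or_create_dataset(name="my-dataset")  # hypothetical dataset


def task(item):
    # Replace with a real LLM call; echoing the input keeps the sketch runnable.
    return {"output": item["input"]}


def mean_equals(test_results):
    # Experiment-level aggregate over the per-item Equals scores.
    values = [s.value for tr in test_results for s in tr.score_results]
    return score_result.ScoreResult(
        name="mean_equals", value=sum(values) / max(len(values), 1)
    )


result = evaluate(
    dataset=dataset,
    task=task,
    scoring_metrics=[Equals()],
    scoring_key_mapping={"reference": "expected_output"},
    experiment_name_prefix="my-experiment",  # names become "my-experiment-<unique-random-part>"
    experiment_scoring_functions=[mean_equals],
)
print(result.experiment_scores)
```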
@@ -125,6 +168,11 @@ def evaluate(
     )
     client = opik_client.get_client_cached()
 
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -153,6 +201,7 @@ def evaluate(
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
     )
 
 
@@ -171,6 +220,7 @@ def _evaluate_task(
     dataset_item_ids: Optional[List[str]],
     dataset_sampler: Optional[samplers.BaseDatasetSampler],
     trial_count: int,
+    experiment_scoring_functions: List[ExperimentScoreFunction],
 ) -> evaluation_result.EvaluationResult:
     start_time = time.time()
 
@@ -178,25 +228,33 @@ def _evaluate_task(
         evaluation_engine = engine.EvaluationEngine(
             client=client,
             project_name=project_name,
-            experiment_=experiment,
             scoring_metrics=scoring_metrics,
             workers=task_threads,
             verbose=verbose,
             scoring_key_mapping=scoring_key_mapping,
         )
-        test_results = evaluation_engine.evaluate_llm_tasks(
+        test_results = evaluation_engine.evaluate_llm_task_on_dataset(
            dataset_=dataset,
            task=task,
            nb_samples=nb_samples,
            dataset_item_ids=dataset_item_ids,
            dataset_sampler=dataset_sampler,
            trial_count=trial_count,
+           experiment_=experiment,
        )
 
    total_time = time.time() - start_time
 
+   # Compute experiment scores
+   computed_experiment_scores = _compute_experiment_scores(
+       experiment_scoring_functions=experiment_scoring_functions,
+       test_results=test_results,
+   )
+
    if verbose >= 1:
-       report.display_experiment_results(dataset.name, total_time, test_results)
+       report.display_experiment_results(
+           dataset.name, total_time, test_results, computed_experiment_scores
+       )
 
    experiment_url = url_helpers.get_experiment_url_by_id(
        experiment_id=experiment.id,
@@ -210,6 +268,10 @@
 
    _try_notifying_about_experiment_completion(experiment)
 
+   # Log experiment scores to backend
+   if computed_experiment_scores:
+       experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
    evaluation_result_ = evaluation_result.EvaluationResult(
        dataset_id=dataset.id,
        experiment_id=experiment.id,
@@ -217,6 +279,7 @@
        test_results=test_results,
        experiment_url=experiment_url,
        trial_count=trial_count,
+       experiment_scores=computed_experiment_scores,
    )
 
    if verbose >= 2:
@@ -236,6 +299,7 @@ def evaluate_experiment(
    verbose: int = 1,
    scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
    experiment_id: Optional[str] = None,
+   experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
    """Update the existing experiment with new evaluation metrics. You can use either `scoring_metrics` or `scorer_functions` to calculate
    evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
@@ -267,7 +331,15 @@ def evaluate_experiment(
            `{"input": "user_question"}` to map the "user_question" key to "input".
 
        experiment_id: The ID of the experiment to evaluate. If not provided, the experiment will be evaluated based on the experiment name.
+
+       experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+           Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+           These scores are computed after all test results are collected and represent aggregate
+           metrics across the entire experiment.
    """
+   experiment_scoring_functions = (
+       [] if experiment_scoring_functions is None else experiment_scoring_functions
+   )
    start_time = time.time()
 
    client = opik_client.get_client_cached()
@@ -303,7 +375,6 @@
        evaluation_engine = engine.EvaluationEngine(
            client=client,
            project_name=project_name,
-           experiment_=experiment,
            scoring_metrics=scoring_metrics,
            workers=scoring_threads,
            verbose=verbose,
@@ -315,8 +386,19 @@
 
    total_time = time.time() - start_time
 
+   # Compute experiment scores
+   computed_experiment_scores = _compute_experiment_scores(
+       experiment_scoring_functions=experiment_scoring_functions,
+       test_results=test_results,
+   )
+
    if verbose >= 1:
-       report.display_experiment_results(dataset_.name, total_time, test_results)
+       report.display_experiment_results(
+           dataset_.name,
+           total_time,
+           test_results,
+           computed_experiment_scores,
+       )
 
    experiment_url = url_helpers.get_experiment_url_by_id(
        experiment_id=experiment.id,
@@ -328,6 +410,10 @@
 
    _try_notifying_about_experiment_completion(experiment)
 
+   # Log experiment scores to backend
+   if computed_experiment_scores:
+       experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
    evaluation_result_ = evaluation_result.EvaluationResult(
        dataset_id=dataset_.id,
        experiment_id=experiment.id,
@@ -335,6 +421,7 @@
        test_results=test_results,
        experiment_url=experiment_url,
        trial_count=1,
+       experiment_scores=computed_experiment_scores,
    )
 
    if verbose >= 2:
@@ -350,7 +437,7 @@ def _build_prompt_evaluation_task(
    model: base_model.OpikBaseModel, messages: List[Dict[str, Any]]
 ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
    supported_modalities = cast(
-       SupportedModalities,
+       prompt_types.SupportedModalities,
        {
            "vision": ModelCapabilities.supports_vision(
                getattr(model, "model_name", None)
@@ -360,9 +447,12 @@
            ),
        },
    )
-   chat_prompt_template = ChatPromptTemplate(messages=messages)
+   # Disable placeholder validation since we pass all dataset item fields to format()
+   chat_prompt_template_ = chat_prompt_template.ChatPromptTemplate(
+       messages=messages, validate_placeholders=False
+   )
 
-   required_modalities = chat_prompt_template.required_modalities()
+   required_modalities = chat_prompt_template_.required_modalities()
    unsupported_modalities = {
        modality
        for modality in required_modalities
@@ -381,7 +471,7 @@
 
    def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
        template_type_override = prompt_variables.get("type")
-       processed_messages = chat_prompt_template.format(
+       processed_messages = chat_prompt_template_.format(
            variables=prompt_variables,
            supported_modalities=supported_modalities,
            template_type=template_type_override,
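For reference, the pattern `_build_prompt_evaluation_task` now relies on can be reproduced directly against the relocated chat prompt module: placeholders in the messages are resolved from whatever fields the dataset item provides, so strict placeholder validation is switched off. The module path and keyword arguments below come from this diff; the message content and the mustache-style `{{question}}` placeholder are illustrative.

```python
from opik.api_objects.prompt.chat import chat_prompt_template

template = chat_prompt_template.ChatPromptTemplate(
    messages=[{"role": "user", "content": "Answer briefly: {{question}}"}],
    # Dataset items may carry extra fields that are not placeholders, and vice versa.
    validate_placeholders=False,
)
print(template.required_modalities())
```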
@@ -404,16 +494,18 @@ def evaluate_prompt(
    model: Optional[Union[str, base_model.OpikBaseModel]] = None,
    scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+   experiment_name_prefix: Optional[str] = None,
    experiment_name: Optional[str] = None,
    project_name: Optional[str] = None,
    experiment_config: Optional[Dict[str, Any]] = None,
    verbose: int = 1,
    nb_samples: Optional[int] = None,
    task_threads: int = 16,
-   prompt: Optional[Prompt] = None,
+   prompt: Optional[base_prompt.BasePrompt] = None,
    dataset_item_ids: Optional[List[str]] = None,
    dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
    trial_count: int = 1,
+   experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
    """
    Performs prompt evaluation on a given dataset.
@@ -435,6 +527,10 @@ def evaluate_prompt(
            • task_outputs — a dictionary containing the LLM task output.
            • task_span - the data collected during the LLM task execution [optional].
 
+       experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+           but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+           the first experiment created will be named `my-experiment-<unique-random-part>`.
+
        experiment_name: name of the experiment.
 
        project_name: The name of the project to log data
@@ -455,7 +551,15 @@
            If not provided, all samples in the dataset will be evaluated.
 
        trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+       experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+           Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+           These scores are computed after all test results are collected and represent aggregate
+           metrics across the entire experiment.
    """
+   experiment_scoring_functions = (
+       [] if experiment_scoring_functions is None else experiment_scoring_functions
+   )
    if isinstance(model, str):
        opik_model = models_factory.get(model_name=model)
    elif not isinstance(model, base_model.OpikBaseModel):
@@ -479,6 +583,11 @@
 
    prompts = [prompt] if prompt else None
 
+   experiment_name = _use_or_create_experiment_name(
+       experiment_name=experiment_name,
+       experiment_name_prefix=experiment_name_prefix,
+   )
+
    experiment = client.create_experiment(
        name=experiment_name,
        dataset_name=dataset.name,
@@ -499,25 +608,33 @@
        evaluation_engine = engine.EvaluationEngine(
            client=client,
            project_name=project_name,
-           experiment_=experiment,
            scoring_metrics=scoring_metrics,
            workers=task_threads,
            verbose=verbose,
            scoring_key_mapping=None,
        )
-       test_results = evaluation_engine.evaluate_llm_tasks(
+       test_results = evaluation_engine.evaluate_llm_task_on_dataset(
            dataset_=dataset,
            task=_build_prompt_evaluation_task(model=opik_model, messages=messages),
            nb_samples=nb_samples,
            dataset_item_ids=dataset_item_ids,
            dataset_sampler=dataset_sampler,
            trial_count=trial_count,
+           experiment_=experiment,
        )
 
    total_time = time.time() - start_time
 
+   # Compute experiment scores
+   computed_experiment_scores = _compute_experiment_scores(
+       experiment_scoring_functions=experiment_scoring_functions,
+       test_results=test_results,
+   )
+
    if verbose >= 1:
-       report.display_experiment_results(dataset.name, total_time, test_results)
+       report.display_experiment_results(
+           dataset.name, total_time, test_results, computed_experiment_scores
+       )
 
    experiment_url = url_helpers.get_experiment_url_by_id(
        experiment_id=experiment.id,
@@ -531,6 +648,10 @@
 
    _try_notifying_about_experiment_completion(experiment)
 
+   # Log experiment scores to backend
+   if computed_experiment_scores:
+       experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
    evaluation_result_ = evaluation_result.EvaluationResult(
        experiment_id=experiment.id,
        dataset_id=dataset.id,
@@ -538,6 +659,7 @@
        test_results=test_results,
        experiment_url=experiment_url,
        trial_count=trial_count,
+       experiment_scores=computed_experiment_scores,
    )
 
    if verbose >= 2:
@@ -554,18 +676,21 @@ def evaluate_optimization_trial(
    dataset: dataset.Dataset,
    task: LLMTask,
    scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+   scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+   experiment_name_prefix: Optional[str] = None,
    experiment_name: Optional[str] = None,
    project_name: Optional[str] = None,
    experiment_config: Optional[Dict[str, Any]] = None,
    verbose: int = 1,
    nb_samples: Optional[int] = None,
    task_threads: int = 16,
-   prompt: Optional[Prompt] = None,
-   prompts: Optional[List[Prompt]] = None,
+   prompt: Optional[base_prompt.BasePrompt] = None,
+   prompts: Optional[List[base_prompt.BasePrompt]] = None,
    scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
    dataset_item_ids: Optional[List[str]] = None,
    dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
    trial_count: int = 1,
+   experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
    """
    Performs task evaluation on a given dataset.
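`evaluate_optimization_trial()` now accepts the same `scoring_functions` parameter as `evaluate()`; per the docstring that follows, each scorer is called with reserved `dataset_item` and `task_outputs` arguments (plus an optional `task_span`). The sketch below only illustrates that call signature: the key names are made up, the `ScoreResult` return type is an assumption, and this diff does not show how a plain callable is registered as a `ScorerFunction`, so check the `opik.evaluation.scorers` API before relying on this form.

```python
from typing import Any, Dict, Optional

from opik.evaluation.metrics import score_result


def exact_match_scorer(
    dataset_item: Dict[str, Any],
    task_outputs: Dict[str, Any],
    task_span: Optional[Any] = None,
) -> score_result.ScoreResult:
    # "expected_output" and "output" are illustrative key names.
    matched = task_outputs.get("output") == dataset_item.get("expected_output")
    return score_result.ScoreResult(name="exact_match", value=1.0 if matched else 0.0)
```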
@@ -578,6 +703,17 @@ def evaluate_optimization_trial(
        task: A callable object that takes dict with dataset item content
            as input and returns dict which will later be used for scoring.
 
+       scoring_functions: List of scorer functions to be executed during evaluation.
+           Each scorer function includes a scoring method that accepts predefined
+           arguments supplied by the evaluation engine:
+           • dataset_item — a dictionary containing the dataset item content,
+           • task_outputs — a dictionary containing the LLM task output.
+           • task_span - the data collected during the LLM task execution [optional].
+
+       experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+           but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+           the first experiment created will be named `my-experiment-<unique-random-part>`.
+
        experiment_name: The name of the experiment associated with evaluation run.
            If None, a generated name will be used.
 
@@ -617,7 +753,16 @@ def evaluate_optimization_trial(
            If not provided, all samples in the dataset will be evaluated.
 
        trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+       experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+           Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+           These scores are computed after all test results are collected and represent aggregate
+           metrics across the entire experiment.
    """
+   experiment_scoring_functions = (
+       [] if experiment_scoring_functions is None else experiment_scoring_functions
+   )
+
    if scoring_metrics is None:
        scoring_metrics = []
 
@@ -626,8 +771,20 @@
        prompts=prompts,
    )
 
+   # wrap scoring functions if any
+   scoring_metrics = _wrap_scoring_functions(
+       scoring_functions=scoring_functions,
+       scoring_metrics=scoring_metrics,
+       project_name=project_name,
+   )
+
    client = opik_client.get_client_cached()
 
+   experiment_name = _use_or_create_experiment_name(
+       experiment_name=experiment_name,
+       experiment_name_prefix=experiment_name_prefix,
+   )
+
    experiment = client.create_experiment(
        name=experiment_name,
        dataset_name=dataset.name,
@@ -651,13 +808,128 @@
        dataset_item_ids=dataset_item_ids,
        dataset_sampler=dataset_sampler,
        trial_count=trial_count,
+       experiment_scoring_functions=experiment_scoring_functions,
    )
 
 
-def _wrap_scoring_functions(
+def evaluate_on_dict_items(
+   items: List[Dict[str, Any]],
+   task: LLMTask,
    scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
    project_name: Optional[str] = None,
+   verbose: int = 0,
+   scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
+   scoring_threads: int = 16,
+) -> evaluation_result.EvaluationResultOnDictItems:
+   """
+   Lightweight evaluation function that evaluates a task on dataset items (as dictionaries)
+   without requiring a Dataset object or creating an experiment.
+
+   This function is useful for optimization scenarios where you need to evaluate many
+   candidate solutions quickly using Opik's metric infrastructure. It creates traces for
+   tracking but doesn't require experiment setup or dataset management.
+
+   Args:
+       items: List of dataset item contents (dictionaries with the data to evaluate).
+
+       task: A callable object that takes dict with dataset item content
+           as input and returns dict which will later be used for scoring.
+
+       scoring_metrics: List of metrics to calculate during evaluation.
+           Each metric's `score(...)` method will be called with arguments taken from
+           the dataset item and task output.
+
+       scoring_functions: List of scorer functions to be executed during evaluation.
+           Each scorer function accepts predefined arguments:
+           • dataset_item — a dictionary containing the dataset item content,
+           • task_outputs — a dictionary containing the LLM task output.
+
+       project_name: The name of the project for logging traces.
+
+       verbose: Controls evaluation output logs and progress bars.
+           0 - no outputs (default), 1 - enable outputs.
+
+       scoring_key_mapping: A dictionary that allows you to rename keys present in either
+           the dataset item or the task output to match the keys expected by scoring metrics.
+
+       scoring_threads: Number of thread workers to run scoring metrics.
+
+   Returns:
+       EvaluationResultOnDictItems object containing test results and providing methods
+       to aggregate scores, similar to the regular evaluation result.
+
+   Example:
+       ```python
+       import opik
+       from opik.evaluation.metrics import Equals
+
+       items = [
+           {"input": "What is 2+2?", "expected_output": "4"},
+           {"input": "What is 3+3?", "expected_output": "6"},
+       ]
+
+       def my_task(item):
+           # Your LLM call here
+           question = item["input"]
+           # ... call model ...
+           return {"output": model_output}
+
+       result = opik.evaluate_on_dict_items(
+           items=items,
+           task=my_task,
+           scoring_metrics=[Equals()],
+           scoring_key_mapping={"reference": "expected_output"},
+       )
+
+       # Access individual test results
+       for test_result in result.test_results:
+           print(f"Score: {test_result.score_results[0].value}")
+
+       # Get aggregated statistics
+       aggregated = result.aggregate_evaluation_scores()
+       print(f"Mean equals score: {aggregated['equals_metric'].mean}")
+       ```
+   """
+   # Wrap scoring functions if any
+   scoring_metrics = _wrap_scoring_functions(
+       scoring_functions=scoring_functions,
+       scoring_metrics=scoring_metrics,
+       project_name=project_name,
+   )
+
+   if not scoring_metrics:
+       LOGGER.warning("No scoring metrics provided for items evaluation")
+       return evaluation_result.EvaluationResultOnDictItems(test_results=[])
+
+   client = opik_client.get_client_cached()
+
+   # Create evaluation engine
+   with asyncio_support.async_http_connections_expire_immediately():
+       evaluation_engine = engine.EvaluationEngine(
+           client=client,
+           project_name=project_name,
+           scoring_metrics=scoring_metrics,
+           workers=scoring_threads,
+           verbose=verbose,
+           scoring_key_mapping=scoring_key_mapping,
+       )
+
+       # Use the new evaluate_items method
+       test_results = evaluation_engine.evaluate_llm_task_on_dict_items(
+           items=items,
+           task=task,
+       )
+
+   return evaluation_result.EvaluationResultOnDictItems(
+       test_results=test_results,
+   )
+
+
+def _wrap_scoring_functions(
+   scoring_functions: Optional[List[scorer_function.ScorerFunction]],
+   scoring_metrics: Optional[List[base_metric.BaseMetric]],
+   project_name: Optional[str],
 ) -> List[base_metric.BaseMetric]:
    if scoring_functions:
        function_metrics = scorer_wrapper_metric.wrap_scorer_functions(
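Since `evaluate_on_dict_items()` skips dataset and experiment setup, it can run inside a tight candidate-selection loop. The sketch below reuses the item shape and metric from the docstring example above; the candidate prompts and the canned model answer are illustrative.

```python
import opik
from opik.evaluation.metrics import Equals

items = [{"input": "What is 2+2?", "expected_output": "4"}]
candidates = ["Answer tersely: {question}", "Reply with only the number: {question}"]

best_prompt, best_score = None, float("-inf")
for candidate in candidates:

    def task(item, prompt=candidate):
        question = prompt.format(question=item["input"])
        # Replace with a real LLM call on `question`; a canned answer keeps the sketch runnable.
        return {"output": "4"}

    result = opik.evaluate_on_dict_items(
        items=items,
        task=task,
        scoring_metrics=[Equals()],
        scoring_key_mapping={"reference": "expected_output"},
    )
    score = result.aggregate_evaluation_scores()["equals_metric"].mean
    if score > best_score:
        best_prompt, best_score = candidate, score

print(best_prompt, best_score)
```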
@@ -669,3 +941,17 @@ def _wrap_scoring_functions(
        scoring_metrics = function_metrics
 
    return scoring_metrics if scoring_metrics else []
+
+
+def _use_or_create_experiment_name(
+   experiment_name: Optional[str], experiment_name_prefix: Optional[str]
+) -> Optional[str]:
+   if experiment_name:
+       return experiment_name
+
+   if experiment_name_prefix:
+       return experiment_helpers.generate_unique_experiment_name(
+           experiment_name_prefix
+       )
+   else:
+       return None