opik 1.9.26__py3-none-any.whl → 1.9.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. opik/__init__.py +10 -3
  2. opik/api_objects/dataset/rest_operations.py +2 -0
  3. opik/api_objects/experiment/experiment.py +31 -5
  4. opik/api_objects/experiment/helpers.py +34 -10
  5. opik/api_objects/local_recording.py +8 -3
  6. opik/api_objects/opik_client.py +218 -46
  7. opik/api_objects/opik_query_language.py +9 -0
  8. opik/api_objects/prompt/__init__.py +11 -3
  9. opik/api_objects/prompt/base_prompt.py +69 -0
  10. opik/api_objects/prompt/base_prompt_template.py +29 -0
  11. opik/api_objects/prompt/chat/__init__.py +1 -0
  12. opik/api_objects/prompt/chat/chat_prompt.py +193 -0
  13. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  14. opik/api_objects/prompt/{chat_content_renderer_registry.py → chat/content_renderer_registry.py} +31 -34
  15. opik/api_objects/prompt/client.py +101 -30
  16. opik/api_objects/prompt/text/__init__.py +1 -0
  17. opik/api_objects/prompt/{prompt.py → text/prompt.py} +55 -32
  18. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +8 -5
  19. opik/cli/export.py +6 -2
  20. opik/config.py +0 -5
  21. opik/decorator/base_track_decorator.py +37 -40
  22. opik/evaluation/__init__.py +13 -2
  23. opik/evaluation/engine/engine.py +195 -223
  24. opik/evaluation/engine/helpers.py +8 -7
  25. opik/evaluation/engine/metrics_evaluator.py +237 -0
  26. opik/evaluation/evaluation_result.py +35 -1
  27. opik/evaluation/evaluator.py +309 -23
  28. opik/evaluation/models/litellm/util.py +78 -6
  29. opik/evaluation/report.py +14 -2
  30. opik/evaluation/rest_operations.py +6 -9
  31. opik/evaluation/test_case.py +2 -2
  32. opik/evaluation/types.py +9 -1
  33. opik/exceptions.py +17 -0
  34. opik/id_helpers.py +18 -0
  35. opik/integrations/adk/helpers.py +16 -7
  36. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  37. opik/integrations/adk/opik_tracer.py +3 -1
  38. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  39. opik/integrations/dspy/callback.py +1 -4
  40. opik/integrations/haystack/opik_connector.py +2 -2
  41. opik/integrations/haystack/opik_tracer.py +2 -4
  42. opik/integrations/langchain/opik_tracer.py +1 -4
  43. opik/integrations/llama_index/callback.py +2 -4
  44. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  45. opik/integrations/openai/opik_tracker.py +1 -1
  46. opik/opik_context.py +7 -7
  47. opik/rest_api/__init__.py +127 -11
  48. opik/rest_api/dashboards/client.py +65 -2
  49. opik/rest_api/dashboards/raw_client.py +82 -0
  50. opik/rest_api/datasets/client.py +538 -2
  51. opik/rest_api/datasets/raw_client.py +1347 -441
  52. opik/rest_api/experiments/client.py +30 -2
  53. opik/rest_api/experiments/raw_client.py +26 -0
  54. opik/rest_api/optimizations/client.py +302 -0
  55. opik/rest_api/optimizations/raw_client.py +463 -0
  56. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  57. opik/rest_api/prompts/__init__.py +2 -2
  58. opik/rest_api/prompts/client.py +34 -4
  59. opik/rest_api/prompts/raw_client.py +32 -2
  60. opik/rest_api/prompts/types/__init__.py +3 -1
  61. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  62. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  63. opik/rest_api/traces/client.py +6 -6
  64. opik/rest_api/traces/raw_client.py +4 -4
  65. opik/rest_api/types/__init__.py +125 -11
  66. opik/rest_api/types/aggregation_data.py +1 -0
  67. opik/rest_api/types/automation_rule_evaluator.py +23 -1
  68. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  69. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  70. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  71. opik/rest_api/types/{automation_rule_evaluator_object_public.py → automation_rule_evaluator_object_object_public.py} +32 -10
  72. opik/rest_api/types/automation_rule_evaluator_page_public.py +2 -2
  73. opik/rest_api/types/automation_rule_evaluator_public.py +23 -1
  74. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  75. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  76. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  77. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  78. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  79. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  80. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  81. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  82. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  83. opik/rest_api/types/automation_rule_evaluator_update.py +23 -1
  84. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  85. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  86. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  87. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  88. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  89. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  90. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  91. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  92. opik/rest_api/types/automation_rule_evaluator_write.py +23 -1
  93. opik/rest_api/types/dashboard_page_public.py +1 -0
  94. opik/rest_api/types/dataset.py +4 -0
  95. opik/rest_api/types/dataset_item.py +1 -0
  96. opik/rest_api/types/dataset_item_compare.py +1 -0
  97. opik/rest_api/types/dataset_item_page_compare.py +1 -0
  98. opik/rest_api/types/dataset_item_page_public.py +1 -0
  99. opik/rest_api/types/dataset_item_public.py +1 -0
  100. opik/rest_api/types/dataset_public.py +4 -0
  101. opik/rest_api/types/dataset_public_status.py +5 -0
  102. opik/rest_api/types/dataset_status.py +5 -0
  103. opik/rest_api/types/dataset_version_diff.py +22 -0
  104. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  105. opik/rest_api/types/dataset_version_page_public.py +23 -0
  106. opik/rest_api/types/dataset_version_public.py +54 -0
  107. opik/rest_api/types/dataset_version_summary.py +41 -0
  108. opik/rest_api/types/dataset_version_summary_public.py +41 -0
  109. opik/rest_api/types/experiment.py +2 -0
  110. opik/rest_api/types/experiment_public.py +2 -0
  111. opik/rest_api/types/experiment_score.py +20 -0
  112. opik/rest_api/types/experiment_score_public.py +20 -0
  113. opik/rest_api/types/experiment_score_write.py +20 -0
  114. opik/rest_api/types/feedback_score_public.py +4 -0
  115. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  116. opik/rest_api/types/optimization.py +2 -0
  117. opik/rest_api/types/optimization_public.py +2 -0
  118. opik/rest_api/types/optimization_public_status.py +3 -1
  119. opik/rest_api/types/optimization_status.py +3 -1
  120. opik/rest_api/types/optimization_studio_config.py +27 -0
  121. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  122. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  123. opik/rest_api/types/optimization_studio_log.py +22 -0
  124. opik/rest_api/types/optimization_write.py +2 -0
  125. opik/rest_api/types/optimization_write_status.py +3 -1
  126. opik/rest_api/types/prompt.py +6 -0
  127. opik/rest_api/types/prompt_detail.py +6 -0
  128. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  129. opik/rest_api/types/prompt_public.py +6 -0
  130. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  131. opik/rest_api/types/prompt_template_structure.py +5 -0
  132. opik/rest_api/types/prompt_version.py +2 -0
  133. opik/rest_api/types/prompt_version_detail.py +2 -0
  134. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  135. opik/rest_api/types/prompt_version_public.py +2 -0
  136. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  137. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  138. opik/rest_api/types/score_name.py +1 -0
  139. opik/rest_api/types/service_toggles_config.py +5 -0
  140. opik/rest_api/types/span_filter.py +23 -0
  141. opik/rest_api/types/span_filter_operator.py +21 -0
  142. opik/rest_api/types/span_filter_write.py +23 -0
  143. opik/rest_api/types/span_filter_write_operator.py +21 -0
  144. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  145. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  146. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  147. opik/rest_api/types/studio_evaluation.py +20 -0
  148. opik/rest_api/types/studio_evaluation_public.py +20 -0
  149. opik/rest_api/types/studio_evaluation_write.py +20 -0
  150. opik/rest_api/types/studio_llm_model.py +21 -0
  151. opik/rest_api/types/studio_llm_model_public.py +21 -0
  152. opik/rest_api/types/studio_llm_model_write.py +21 -0
  153. opik/rest_api/types/studio_message.py +20 -0
  154. opik/rest_api/types/studio_message_public.py +20 -0
  155. opik/rest_api/types/studio_message_write.py +20 -0
  156. opik/rest_api/types/studio_metric.py +21 -0
  157. opik/rest_api/types/studio_metric_public.py +21 -0
  158. opik/rest_api/types/studio_metric_write.py +21 -0
  159. opik/rest_api/types/studio_optimizer.py +21 -0
  160. opik/rest_api/types/studio_optimizer_public.py +21 -0
  161. opik/rest_api/types/studio_optimizer_write.py +21 -0
  162. opik/rest_api/types/studio_prompt.py +20 -0
  163. opik/rest_api/types/studio_prompt_public.py +20 -0
  164. opik/rest_api/types/studio_prompt_write.py +20 -0
  165. opik/rest_api/types/trace.py +6 -0
  166. opik/rest_api/types/trace_public.py +6 -0
  167. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  168. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  169. opik/rest_api/types/value_entry.py +2 -0
  170. opik/rest_api/types/value_entry_compare.py +2 -0
  171. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  172. opik/rest_api/types/value_entry_public.py +2 -0
  173. opik/synchronization.py +5 -6
  174. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  175. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/METADATA +4 -3
  176. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/RECORD +180 -120
  177. opik/api_objects/prompt/chat_prompt_template.py +0 -200
  178. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/WHEEL +0 -0
  179. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/entry_points.txt +0 -0
  180. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/licenses/LICENSE +0 -0
  181. {opik-1.9.26.dist-info → opik-1.9.41.dist-info}/top_level.txt +0 -0
opik/evaluation/engine/helpers.py
@@ -11,7 +11,7 @@ import opik.context_storage as context_storage
 
 @contextlib.contextmanager
 def evaluate_llm_task_context(
-    experiment: experiment.Experiment,
+    experiment: Optional[experiment.Experiment],
     dataset_item_id: str,
     trace_data: trace.TraceData,
     client: opik_client.Opik,
@@ -36,12 +36,13 @@ def evaluate_llm_task_context(
     client = client if client is not None else opik_client.get_client_cached()
     client.trace(**trace_data.as_parameters)
 
-    experiment_item_ = experiment_item.ExperimentItemReferences(
-        dataset_item_id=dataset_item_id,
-        trace_id=trace_data.id,
-    )
-
-    experiment.insert(experiment_items_references=[experiment_item_])
+    # Only insert experiment item if an experiment is provided
+    if experiment is not None:
+        experiment_item_ = experiment_item.ExperimentItemReferences(
+            dataset_item_id=dataset_item_id,
+            trace_id=trace_data.id,
+        )
+        experiment.insert(experiment_items_references=[experiment_item_])
 
 
 @contextlib.contextmanager
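(Together with the new EvaluationResultOnDictItems type added to evaluation_result.py further down, this guard appears to support running evaluations without an experiment: the trace is still logged, but experiment item references are only inserted when an experiment object is actually supplied.)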
opik/evaluation/engine/metrics_evaluator.py (new file)
@@ -0,0 +1,237 @@
+import inspect
+import logging
+from typing import List, Dict, Any, Optional, Callable, Tuple
+
+import opik.exceptions as exceptions
+import opik.logging_messages as logging_messages
+from opik.evaluation.metrics import (
+    arguments_helpers,
+    base_metric,
+    score_result,
+    arguments_validator,
+)
+from opik.evaluation.scorers import scorer_wrapper_metric
+from opik.evaluation.types import ScoringKeyMappingType
+from opik.message_processing.emulation import models
+
+from . import exception_analyzer
+
+
+LOGGER = logging.getLogger(__name__)
+
+EVALUATION_SPAN_PARAMETER_NAME = "task_span"
+
+
+def _has_evaluation_span_parameter(func: Callable) -> bool:
+    """Check if a scoring function expects the task_span parameter."""
+    try:
+        sig = inspect.signature(func)
+        return EVALUATION_SPAN_PARAMETER_NAME in sig.parameters
+    except (ValueError, TypeError):
+        return False
+
+
+def _compute_metric_scores(
+    scoring_metrics: List[base_metric.BaseMetric],
+    mapped_scoring_inputs: Dict[str, Any],
+    scoring_key_mapping: Optional[ScoringKeyMappingType],
+    dataset_item_content: Dict[str, Any],
+    task_output: Dict[str, Any],
+) -> List[score_result.ScoreResult]:
+    """
+    Compute scores using given metrics.
+
+    Args:
+        scoring_metrics: List of metrics to compute
+        mapped_scoring_inputs: Scoring inputs after key mapping (will be used for regular metrics)
+        scoring_key_mapping: Optional mapping for renaming score arguments
+        dataset_item_content: Dataset item content (will be used for ScorerWrapperMetric)
+        task_output: Task output (will be used for ScorerWrapperMetric)
+
+    Returns:
+        List of computed score results
+    """
+    score_results: List[score_result.ScoreResult] = []
+
+    for metric in scoring_metrics:
+        try:
+            LOGGER.debug("Metric %s score started", metric.name)
+
+            if isinstance(metric, scorer_wrapper_metric.ScorerWrapperMetric):
+                # ScorerWrapperMetric uses original dataset item and task output without mappings
+                if (
+                    task_span := mapped_scoring_inputs.get(
+                        EVALUATION_SPAN_PARAMETER_NAME
+                    )
+                ) is not None:
+                    result = metric.score(
+                        dataset_item=dataset_item_content,
+                        task_outputs=task_output,
+                        task_span=task_span,
+                    )
+                else:
+                    result = metric.score(
+                        dataset_item=dataset_item_content,
+                        task_outputs=task_output,
+                    )
+            else:
+                arguments_validator.validate_score_arguments(
+                    metric=metric,
+                    kwargs=mapped_scoring_inputs,
+                    scoring_key_mapping=scoring_key_mapping,
+                )
+                result = metric.score(**mapped_scoring_inputs)
+
+            LOGGER.debug("Metric %s score ended", metric.name)
+
+            if isinstance(result, list):
+                score_results += result
+            else:
+                score_results.append(result)
+
+        except exceptions.ScoreMethodMissingArguments:
+            raise
+        except Exception as exception:
+            LOGGER.error(
+                "Failed to compute metric %s. Score result will be marked as failed.",
+                metric.name,
+                exc_info=True,
+            )
+
+            if exception_analyzer.is_llm_provider_rate_limit_error(exception):
+                LOGGER.error(
+                    logging_messages.LLM_PROVIDER_RATE_LIMIT_ERROR_DETECTED_IN_EVALUATE_FUNCTION
+                )
+
+            score_results.append(
+                score_result.ScoreResult(
+                    name=metric.name,
+                    value=0.0,
+                    reason=str(exception),
+                    scoring_failed=True,
+                )
+            )
+
+    return score_results
+
+
+class MetricsEvaluator:
+    """
+    Handles metric computation and scoring.
+
+    Separates metrics into:
+    - Regular metrics: Score based on inputs/outputs
+    - Task span metrics: Score based on LLM call metadata (tokens, latency, etc)
+    """
+
+    def __init__(
+        self,
+        scoring_metrics: List[base_metric.BaseMetric],
+        scoring_key_mapping: Optional[ScoringKeyMappingType],
+    ):
+        self._scoring_key_mapping = scoring_key_mapping
+        self._regular_metrics: List[base_metric.BaseMetric] = []
+        self._task_span_metrics: List[base_metric.BaseMetric] = []
+
+        self._analyze_metrics(scoring_metrics)
+
+    @property
+    def has_task_span_metrics(self) -> bool:
+        """Check if any task span scoring metrics are configured."""
+        return len(self._task_span_metrics) > 0
+
+    @property
+    def task_span_metrics(self) -> List[base_metric.BaseMetric]:
+        """Get list of task span scoring metrics."""
+        return self._task_span_metrics
+
+    @property
+    def regular_metrics(self) -> List[base_metric.BaseMetric]:
+        """Get list of regular scoring metrics."""
+        return self._regular_metrics
+
+    def _analyze_metrics(
+        self,
+        scoring_metrics: List[base_metric.BaseMetric],
+    ) -> None:
+        """Separate metrics into regular and task-span categories."""
+        for metric in scoring_metrics:
+            if _has_evaluation_span_parameter(metric.score):
+                self._task_span_metrics.append(metric)
+            else:
+                self._regular_metrics.append(metric)
+
+        if self.has_task_span_metrics:
+            LOGGER.debug(
+                "Detected %d LLM task span scoring metrics.",
+                len(self._task_span_metrics),
+            )
+
+    def compute_regular_scores(
+        self,
+        dataset_item_content: Dict[str, Any],
+        task_output: Dict[str, Any],
+    ) -> Tuple[List[score_result.ScoreResult], Dict[str, Any]]:
+        """
+        Compute scores using regular metrics.
+
+        Args:
+            dataset_item_content: Dataset item content
+            task_output: Task output
+
+        Returns:
+            Tuple of (score results, mapped scoring inputs used for scoring regular non-wrapper metrics)
+        """
+        mapped_scoring_inputs = arguments_helpers.create_scoring_inputs(
+            dataset_item=dataset_item_content,
+            task_output=task_output,
+            scoring_key_mapping=self._scoring_key_mapping,
+        )
+
+        score_results = _compute_metric_scores(
+            scoring_metrics=self._regular_metrics,
+            mapped_scoring_inputs=mapped_scoring_inputs,
+            scoring_key_mapping=self._scoring_key_mapping,
+            dataset_item_content=dataset_item_content,
+            task_output=task_output,
+        )
+
+        return score_results, mapped_scoring_inputs
+
+    def compute_task_span_scores(
+        self,
+        dataset_item_content: Dict[str, Any],
+        task_output: Dict[str, Any],
+        task_span: models.SpanModel,
+    ) -> Tuple[List[score_result.ScoreResult], Dict[str, Any]]:
+        """
+        Compute scores using task span metrics.
+
+        Args:
+            dataset_item_content: Dataset item content
+            task_output: Task output
+            task_span: Span model containing task execution metadata
+
+        Returns:
+            Tuple of (score results, mapped scoring inputs used for scoring regular non-wrapper metrics)
+        """
+        mapped_scoring_inputs = arguments_helpers.create_scoring_inputs(
+            dataset_item=dataset_item_content,
+            task_output=task_output,
+            scoring_key_mapping=self._scoring_key_mapping,
+        )
+
+        mapped_scoring_inputs_with_span = {
+            **mapped_scoring_inputs,
+            EVALUATION_SPAN_PARAMETER_NAME: task_span,
+        }
+
+        score_results = _compute_metric_scores(
+            scoring_metrics=self._task_span_metrics,
+            mapped_scoring_inputs=mapped_scoring_inputs_with_span,
+            scoring_key_mapping=self._scoring_key_mapping,
+            dataset_item_content=dataset_item_content,
+            task_output=task_output,
+        )
+
+        return score_results, mapped_scoring_inputs_with_span
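For orientation, here is a minimal sketch of how the new MetricsEvaluator splits metrics by inspecting their score() signatures. The ExactMatch and SpanPresence metrics below are hypothetical illustrations, not part of the package, and the sketch assumes that base_metric.BaseMetric accepts a name argument and that score_result.ScoreResult takes name/value keyword arguments, as in the current public metrics API:

from typing import Any

from opik.evaluation.engine import metrics_evaluator
from opik.evaluation.metrics import base_metric, score_result


class ExactMatch(base_metric.BaseMetric):
    # Hypothetical "regular" metric: scores only on dataset inputs / task outputs.
    def score(self, output: str, reference: str, **ignored: Any) -> score_result.ScoreResult:
        return score_result.ScoreResult(name=self.name, value=float(output == reference))


class SpanPresence(base_metric.BaseMetric):
    # Hypothetical "task span" metric: declaring a task_span parameter is what
    # routes it to the task-span bucket (see _has_evaluation_span_parameter above).
    def score(self, task_span: Any, **ignored: Any) -> score_result.ScoreResult:
        # In practice this would inspect span metadata (tokens, latency, ...).
        return score_result.ScoreResult(name=self.name, value=float(task_span is not None))


evaluator = metrics_evaluator.MetricsEvaluator(
    scoring_metrics=[ExactMatch(name="exact_match"), SpanPresence(name="span_presence")],
    scoring_key_mapping=None,
)

print(evaluator.has_task_span_metrics)                # True
print([m.name for m in evaluator.regular_metrics])    # ['exact_match']
print([m.name for m in evaluator.task_span_metrics])  # ['span_presence']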
opik/evaluation/evaluation_result.py
@@ -1,10 +1,14 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, TYPE_CHECKING
 from collections import defaultdict
 import logging
 
 import dataclasses
 
 from . import score_statistics, test_result
+from .metrics import score_result
+
+if TYPE_CHECKING:
+    pass
 
 LOGGER = logging.getLogger(__name__)
 
@@ -68,6 +72,9 @@ class EvaluationResult:
     test_results: List[test_result.TestResult]
     experiment_url: Optional[str]
     trial_count: int
+    experiment_scores: List[score_result.ScoreResult] = dataclasses.field(
+        default_factory=list
+    )
 
     def aggregate_evaluation_scores(self) -> EvaluationResultAggregatedScoresView:
         """
@@ -143,3 +150,30 @@ class EvaluationResult:
         )
 
         return dataset_items_results
+
+
+@dataclasses.dataclass
+class EvaluationResultOnDictItems:
+    """
+    Evaluation result for dict items evaluation without experiment tracking.
+
+    This class provides a similar interface to EvaluationResult but is designed
+    for lightweight evaluations that don't require experiment or dataset management.
+    It can aggregate scores across test results just like the regular evaluation.
+
+    Attributes:
+        test_results: Collection of test results from the evaluation.
+    """
+
+    test_results: List[test_result.TestResult]
+
+    def aggregate_evaluation_scores(
+        self,
+    ) -> Dict[str, score_statistics.ScoreStatistics]:
+        """
+        Aggregates evaluation scores from test results.
+
+        Returns:
+            Dictionary mapping score names to their aggregated statistics.
+        """
+        return score_statistics.calculate_aggregated_statistics(self.test_results)
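Similarly, a rough usage sketch for the new lightweight result type. The direct module import is an assumption (it may also be re-exported elsewhere), and the empty test_results list is only a placeholder; in a real run it would be populated with test_result.TestResult objects produced while scoring plain dict items:

from opik.evaluation.evaluation_result import EvaluationResultOnDictItems

# Placeholder: a real run would supply test_result.TestResult objects.
result = EvaluationResultOnDictItems(test_results=[])

# Aggregation mirrors EvaluationResult: score name -> aggregated statistics
# (expected to be an empty mapping for an empty list of test results).
stats = result.aggregate_evaluation_scores()
print(stats)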