opik 1.9.26__py3-none-any.whl → 1.9.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178)
  1. opik/__init__.py +10 -3
  2. opik/api_objects/dataset/rest_operations.py +2 -0
  3. opik/api_objects/experiment/experiment.py +31 -5
  4. opik/api_objects/experiment/helpers.py +34 -10
  5. opik/api_objects/local_recording.py +8 -3
  6. opik/api_objects/opik_client.py +218 -46
  7. opik/api_objects/opik_query_language.py +9 -0
  8. opik/api_objects/prompt/__init__.py +11 -3
  9. opik/api_objects/prompt/base_prompt.py +69 -0
  10. opik/api_objects/prompt/base_prompt_template.py +29 -0
  11. opik/api_objects/prompt/chat/__init__.py +1 -0
  12. opik/api_objects/prompt/chat/chat_prompt.py +193 -0
  13. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  14. opik/api_objects/prompt/{chat_content_renderer_registry.py → chat/content_renderer_registry.py} +31 -34
  15. opik/api_objects/prompt/client.py +101 -30
  16. opik/api_objects/prompt/text/__init__.py +1 -0
  17. opik/api_objects/prompt/{prompt.py → text/prompt.py} +55 -32
  18. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +8 -5
  19. opik/cli/export.py +6 -2
  20. opik/config.py +0 -5
  21. opik/decorator/base_track_decorator.py +37 -40
  22. opik/evaluation/__init__.py +13 -2
  23. opik/evaluation/engine/engine.py +195 -223
  24. opik/evaluation/engine/helpers.py +8 -7
  25. opik/evaluation/engine/metrics_evaluator.py +237 -0
  26. opik/evaluation/evaluation_result.py +35 -1
  27. opik/evaluation/evaluator.py +309 -23
  28. opik/evaluation/models/litellm/util.py +78 -6
  29. opik/evaluation/report.py +14 -2
  30. opik/evaluation/rest_operations.py +6 -9
  31. opik/evaluation/test_case.py +2 -2
  32. opik/evaluation/types.py +9 -1
  33. opik/exceptions.py +17 -0
  34. opik/id_helpers.py +18 -0
  35. opik/integrations/adk/helpers.py +16 -7
  36. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  37. opik/integrations/adk/opik_tracer.py +3 -1
  38. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  39. opik/integrations/dspy/callback.py +1 -4
  40. opik/integrations/haystack/opik_connector.py +2 -2
  41. opik/integrations/haystack/opik_tracer.py +2 -4
  42. opik/integrations/langchain/opik_tracer.py +1 -4
  43. opik/integrations/llama_index/callback.py +2 -4
  44. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  45. opik/integrations/openai/opik_tracker.py +1 -1
  46. opik/opik_context.py +7 -7
  47. opik/rest_api/__init__.py +123 -11
  48. opik/rest_api/dashboards/client.py +65 -2
  49. opik/rest_api/dashboards/raw_client.py +82 -0
  50. opik/rest_api/datasets/client.py +441 -2
  51. opik/rest_api/datasets/raw_client.py +1225 -505
  52. opik/rest_api/experiments/client.py +30 -2
  53. opik/rest_api/experiments/raw_client.py +26 -0
  54. opik/rest_api/optimizations/client.py +302 -0
  55. opik/rest_api/optimizations/raw_client.py +463 -0
  56. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  57. opik/rest_api/prompts/__init__.py +2 -2
  58. opik/rest_api/prompts/client.py +34 -4
  59. opik/rest_api/prompts/raw_client.py +32 -2
  60. opik/rest_api/prompts/types/__init__.py +3 -1
  61. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  62. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  63. opik/rest_api/traces/client.py +6 -6
  64. opik/rest_api/traces/raw_client.py +4 -4
  65. opik/rest_api/types/__init__.py +121 -11
  66. opik/rest_api/types/aggregation_data.py +1 -0
  67. opik/rest_api/types/automation_rule_evaluator.py +23 -1
  68. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  69. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  70. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  71. opik/rest_api/types/{automation_rule_evaluator_object_public.py → automation_rule_evaluator_object_object_public.py} +32 -10
  72. opik/rest_api/types/automation_rule_evaluator_page_public.py +2 -2
  73. opik/rest_api/types/automation_rule_evaluator_public.py +23 -1
  74. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  75. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  76. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  77. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  78. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  79. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  80. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  81. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  82. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  83. opik/rest_api/types/automation_rule_evaluator_update.py +23 -1
  84. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  85. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  86. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  87. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  88. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  89. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  90. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  91. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  92. opik/rest_api/types/automation_rule_evaluator_write.py +23 -1
  93. opik/rest_api/types/dashboard_page_public.py +1 -0
  94. opik/rest_api/types/dataset.py +2 -0
  95. opik/rest_api/types/dataset_item.py +1 -0
  96. opik/rest_api/types/dataset_item_compare.py +1 -0
  97. opik/rest_api/types/dataset_item_page_compare.py +1 -0
  98. opik/rest_api/types/dataset_item_page_public.py +1 -0
  99. opik/rest_api/types/dataset_item_public.py +1 -0
  100. opik/rest_api/types/dataset_public.py +2 -0
  101. opik/rest_api/types/dataset_public_status.py +5 -0
  102. opik/rest_api/types/dataset_status.py +5 -0
  103. opik/rest_api/types/dataset_version_diff.py +22 -0
  104. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  105. opik/rest_api/types/dataset_version_page_public.py +23 -0
  106. opik/rest_api/types/dataset_version_public.py +49 -0
  107. opik/rest_api/types/experiment.py +2 -0
  108. opik/rest_api/types/experiment_public.py +2 -0
  109. opik/rest_api/types/experiment_score.py +20 -0
  110. opik/rest_api/types/experiment_score_public.py +20 -0
  111. opik/rest_api/types/experiment_score_write.py +20 -0
  112. opik/rest_api/types/feedback_score_public.py +4 -0
  113. opik/rest_api/types/optimization.py +2 -0
  114. opik/rest_api/types/optimization_public.py +2 -0
  115. opik/rest_api/types/optimization_public_status.py +3 -1
  116. opik/rest_api/types/optimization_status.py +3 -1
  117. opik/rest_api/types/optimization_studio_config.py +27 -0
  118. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  119. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  120. opik/rest_api/types/optimization_studio_log.py +22 -0
  121. opik/rest_api/types/optimization_write.py +2 -0
  122. opik/rest_api/types/optimization_write_status.py +3 -1
  123. opik/rest_api/types/prompt.py +6 -0
  124. opik/rest_api/types/prompt_detail.py +6 -0
  125. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  126. opik/rest_api/types/prompt_public.py +6 -0
  127. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  128. opik/rest_api/types/prompt_template_structure.py +5 -0
  129. opik/rest_api/types/prompt_version.py +2 -0
  130. opik/rest_api/types/prompt_version_detail.py +2 -0
  131. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  132. opik/rest_api/types/prompt_version_public.py +2 -0
  133. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  134. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  135. opik/rest_api/types/score_name.py +1 -0
  136. opik/rest_api/types/service_toggles_config.py +5 -0
  137. opik/rest_api/types/span_filter.py +23 -0
  138. opik/rest_api/types/span_filter_operator.py +21 -0
  139. opik/rest_api/types/span_filter_write.py +23 -0
  140. opik/rest_api/types/span_filter_write_operator.py +21 -0
  141. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  142. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  143. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  144. opik/rest_api/types/studio_evaluation.py +20 -0
  145. opik/rest_api/types/studio_evaluation_public.py +20 -0
  146. opik/rest_api/types/studio_evaluation_write.py +20 -0
  147. opik/rest_api/types/studio_llm_model.py +21 -0
  148. opik/rest_api/types/studio_llm_model_public.py +21 -0
  149. opik/rest_api/types/studio_llm_model_write.py +21 -0
  150. opik/rest_api/types/studio_message.py +20 -0
  151. opik/rest_api/types/studio_message_public.py +20 -0
  152. opik/rest_api/types/studio_message_write.py +20 -0
  153. opik/rest_api/types/studio_metric.py +21 -0
  154. opik/rest_api/types/studio_metric_public.py +21 -0
  155. opik/rest_api/types/studio_metric_write.py +21 -0
  156. opik/rest_api/types/studio_optimizer.py +21 -0
  157. opik/rest_api/types/studio_optimizer_public.py +21 -0
  158. opik/rest_api/types/studio_optimizer_write.py +21 -0
  159. opik/rest_api/types/studio_prompt.py +20 -0
  160. opik/rest_api/types/studio_prompt_public.py +20 -0
  161. opik/rest_api/types/studio_prompt_write.py +20 -0
  162. opik/rest_api/types/trace.py +6 -0
  163. opik/rest_api/types/trace_public.py +6 -0
  164. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  165. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  166. opik/rest_api/types/value_entry.py +2 -0
  167. opik/rest_api/types/value_entry_compare.py +2 -0
  168. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  169. opik/rest_api/types/value_entry_public.py +2 -0
  170. opik/synchronization.py +5 -6
  171. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  172. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/METADATA +2 -1
  173. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/RECORD +177 -119
  174. opik/api_objects/prompt/chat_prompt_template.py +0 -200
  175. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/WHEEL +0 -0
  176. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/entry_points.txt +0 -0
  177. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/licenses/LICENSE +0 -0
  178. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/top_level.txt +0 -0
opik/decorator/base_track_decorator.py

@@ -14,7 +14,7 @@ from typing import (
     NamedTuple,
 )
 
-from .. import context_storage, logging_messages
+from .. import context_storage, logging_messages, tracing_runtime_config
 from ..api_objects import opik_client, span, trace
 from ..types import DistributedTraceHeadersDict, ErrorInfoDict, SpanType
 from . import (
@@ -24,7 +24,6 @@ from . import (
     inspect_helpers,
     opik_args,
     span_creation_handler,
-    tracing_runtime_config,
 )
 
 LOGGER = logging.getLogger(__name__)
@@ -337,25 +336,24 @@ class BaseTrackDecorator(abc.ABC):
                 )
                 error_info = error_info_collector.collect(exception)
                 func_exception = exception
-            finally:
-                stream_or_stream_manager = self._streams_handler(
-                    result,
-                    track_options.capture_output,
-                    track_options.generations_aggregator,
-                )
-                if stream_or_stream_manager is not None:
-                    return stream_or_stream_manager
-
-                self._after_call(
-                    output=result,
-                    error_info=error_info,
-                    capture_output=track_options.capture_output,
-                    flush=track_options.flush,
-                )
-                if func_exception is not None:
-                    raise func_exception
-                else:
-                    return result
+
+            stream_or_stream_manager = self._streams_handler(
+                result,
+                track_options.capture_output,
+                track_options.generations_aggregator,
+            )
+            if stream_or_stream_manager is not None:
+                return stream_or_stream_manager
+
+            self._after_call(
+                output=result,
+                error_info=error_info,
+                capture_output=track_options.capture_output,
+                flush=track_options.flush,
+            )
+            if func_exception is not None:
+                raise func_exception
+            return result
 
         wrapper.opik_tracked = True  # type: ignore
 
@@ -391,25 +389,24 @@ class BaseTrackDecorator(abc.ABC):
                 )
                 error_info = error_info_collector.collect(exception)
                 func_exception = exception
-            finally:
-                stream_or_stream_manager = self._streams_handler(
-                    result,
-                    track_options.capture_output,
-                    track_options.generations_aggregator,
-                )
-                if stream_or_stream_manager is not None:
-                    return stream_or_stream_manager
-
-                self._after_call(
-                    output=result,
-                    error_info=error_info,
-                    capture_output=track_options.capture_output,
-                    flush=track_options.flush,
-                )
-                if func_exception is not None:
-                    raise func_exception
-                else:
-                    return result
+
+            stream_or_stream_manager = self._streams_handler(
+                result,
+                track_options.capture_output,
+                track_options.generations_aggregator,
+            )
+            if stream_or_stream_manager is not None:
+                return stream_or_stream_manager
+
+            self._after_call(
+                output=result,
+                error_info=error_info,
+                capture_output=track_options.capture_output,
+                flush=track_options.flush,
+            )
+            if func_exception is not None:
+                raise func_exception
+            return result
 
         wrapper.opik_tracked = True  # type: ignore
         return wrapper
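In `opik/decorator/base_track_decorator.py` the post-call handling moves out of the `finally:` block and now runs as straight-line code after the `try`/`except`. The two layouts are close but not identical: a `finally` body also executes while an exception is propagating, and a `return` inside `finally` silently swallows that exception, which is the construct the new layout avoids. A minimal, self-contained sketch of the resulting control flow (illustrative only, not the opik implementation):

```python
# Minimal sketch of the control flow the diff converges on: capture the exception,
# run the post-call handling once, then re-raise or return. Not the opik code itself.
import functools
from typing import Any, Callable, Optional


def tracked(func: Callable[..., Any]) -> Callable[..., Any]:
    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        result: Any = None
        func_exception: Optional[Exception] = None
        try:
            result = func(*args, **kwargs)
        except Exception as exception:
            func_exception = exception  # collected here, not swallowed

        # Post-call handling lives after the try/except instead of inside `finally`,
        # so returning early here never returns out of a `finally` block.
        print(f"after_call: output={result!r}, error={func_exception!r}")

        if func_exception is not None:
            raise func_exception
        return result

    return wrapper


@tracked
def flaky(x: int) -> int:
    if x < 0:
        raise ValueError("negative input")
    return x * 2


print(flaky(3))  # runs the post-call handling, then returns 6
```

With this shape, the exception captured in `except` is re-raised explicitly after the post-call work, so nothing is swallowed implicitly.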
opik/evaluation/__init__.py

@@ -1,4 +1,15 @@
-from .evaluator import evaluate, evaluate_prompt, evaluate_experiment
+from .evaluator import (
+    evaluate,
+    evaluate_prompt,
+    evaluate_experiment,
+    evaluate_on_dict_items,
+)
 from .threads.evaluator import evaluate_threads
 
-__all__ = ["evaluate", "evaluate_prompt", "evaluate_experiment", "evaluate_threads"]
+__all__ = [
+    "evaluate",
+    "evaluate_prompt",
+    "evaluate_experiment",
+    "evaluate_on_dict_items",
+    "evaluate_threads",
+]
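`opik.evaluation` now re-exports `evaluate_on_dict_items` alongside the existing entry points. Its exact public signature is not part of this diff; the sketch below is a hypothetical usage example whose keyword names (`items`, `task`, `scoring_metrics`) are inferred from the engine changes further down and should be treated as assumptions:

```python
# Hypothetical usage sketch only: the exported name is confirmed by the diff above,
# but the keyword arguments are assumptions mirroring evaluate() and the engine code.
from opik.evaluation import evaluate_on_dict_items
from opik.evaluation.metrics import Equals  # existing opik metric


def task(item: dict) -> dict:
    # A trivial "LLM task": echo the reference so the metric passes.
    return {"output": item["reference"]}


results = evaluate_on_dict_items(
    items=[
        {"input": "2+2", "reference": "4"},
        {"input": "capital of France", "reference": "Paris"},
    ],
    task=task,
    scoring_metrics=[Equals()],  # assumed keyword, mirroring evaluate()
)
# results: expected to be a list of TestResult objects, per the engine docstring below.
```

The engine-level docstring (see `evaluate_llm_task_on_dict_items` below) confirms the shape of `items` and `task`: plain dictionaries in, a dictionary of outputs back, with no `Dataset` object or experiment required.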
opik/evaluation/engine/engine.py

@@ -1,13 +1,11 @@
 import functools
-import inspect
 import logging
-from typing import List, Optional, Callable, Any, Dict
+from typing import List, Optional, Any, Dict
 
-import opik.exceptions as exceptions
 import opik.logging_messages as logging_messages
 import opik.opik_context as opik_context
 import opik
-from opik.api_objects import opik_client, trace
+from opik.api_objects import opik_client, trace, local_recording
 from opik.api_objects.dataset import dataset, dataset_item
 from opik.api_objects.experiment import experiment
 from opik.evaluation import (
@@ -18,18 +16,15 @@ from opik.evaluation import (
 )
 from opik.evaluation.types import LLMTask, ScoringKeyMappingType
 
-from . import evaluation_tasks_executor, exception_analyzer, helpers
+from . import evaluation_tasks_executor, exception_analyzer, helpers, metrics_evaluator
 from .types import EvaluationTask
-from ..metrics import arguments_validator, arguments_helpers, base_metric, score_result
-from ..scorers import scorer_wrapper_metric
-from ...message_processing import message_processors_chain
+from ..metrics import base_metric, score_result
 from ...message_processing.emulation import models
 
 
 LOGGER = logging.getLogger(__name__)
 
 EVALUATION_TASK_NAME = "evaluation_task"
-EVALUATION_SPAN_PARAMETER_NAME = "task_span"
 
 
 class EvaluationEngine:
@@ -37,7 +32,6 @@ class EvaluationEngine:
         self,
         client: opik_client.Opik,
         project_name: Optional[str],
-        experiment_: experiment.Experiment,
         scoring_metrics: List[base_metric.BaseMetric],
         workers: int,
         verbose: int,
@@ -45,41 +39,28 @@
     ) -> None:
         self._client = client
         self._project_name = project_name
-        self._experiment = experiment_
         self._workers = workers
         self._verbose = verbose
-        self._scoring_metrics: List[base_metric.BaseMetric] = []
-        self._task_span_scoring_metrics: List[base_metric.BaseMetric] = []
-        self._scoring_key_mapping = scoring_key_mapping
 
-        # Analyze metrics
-        self._analyze_metrics(scoring_metrics)
-
-        if len(self._task_span_scoring_metrics) > 0:
-            LOGGER.info(
-                "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
-                len(self._task_span_scoring_metrics),
-            )
-
-    def _analyze_metrics(self, scoring_metrics: List[base_metric.BaseMetric]) -> None:
-        for metric in scoring_metrics:
-            if _has_evaluation_span_parameter(metric.score):
-                self._task_span_scoring_metrics.append(metric)
-            else:
-                self._scoring_metrics.append(metric)
+        # Delegate metric analysis to MetricsEvaluator
+        self._metrics_evaluator = metrics_evaluator.MetricsEvaluator(
+            scoring_metrics=scoring_metrics,
+            scoring_key_mapping=scoring_key_mapping,
+        )
 
     @opik.track(name="metrics_calculation")  # type: ignore[attr-defined,has-type]
-    def _evaluate_test_case(
+    def _compute_test_result_for_test_case(
         self,
         test_case_: test_case.TestCase,
         trial_id: int = 0,
     ) -> test_result.TestResult:
-        score_results = _scores_by_metrics(
-            scoring_metrics=self._scoring_metrics,
-            score_kwargs=test_case_.scoring_inputs,
-            scoring_key_mapping=self._scoring_key_mapping,
-            test_case_=test_case_,
+        score_results, mapped_scoring_inputs = (
+            self._metrics_evaluator.compute_regular_scores(
+                dataset_item_content=test_case_.dataset_item_content,
+                task_output=test_case_.task_output,
+            )
         )
+        test_case_.mapped_scoring_inputs = mapped_scoring_inputs
 
         test_result_ = test_result.TestResult(
             test_case=test_case_,
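The constructor no longer sorts metrics itself; that logic moves to the new `metrics_evaluator.MetricsEvaluator` (added as `opik/evaluation/engine/metrics_evaluator.py`, +237 lines in the file list). The removed `_analyze_metrics` and `_has_evaluation_span_parameter` helpers (the latter is deleted in the final hunk of this file, below) show the rule being delegated: a metric whose `score` method declares a `task_span` parameter is treated as a task-span metric. A rough sketch of that split, assuming `MetricsEvaluator` keeps the same signature-inspection rule (its source is not shown here):

```python
# Rough sketch of the metric split the removed engine code performed; the real
# MetricsEvaluator class is not shown in this diff, so names here are illustrative.
import inspect
from typing import Any, Callable, List, Tuple

TASK_SPAN_PARAMETER_NAME = "task_span"  # the removed EVALUATION_SPAN_PARAMETER_NAME


def accepts_task_span(func: Callable[..., Any]) -> bool:
    """True if the callable declares a `task_span` parameter (mirrors the removed helper)."""
    try:
        return TASK_SPAN_PARAMETER_NAME in inspect.signature(func).parameters
    except (ValueError, TypeError):
        # Some callables do not expose an inspectable signature; assume no parameter.
        return False


def split_metrics(metrics: List[Any]) -> Tuple[List[Any], List[Any]]:
    """Split metrics into (regular, task_span) buckets based on their score() signature."""
    regular: List[Any] = []
    task_span: List[Any] = []
    for metric in metrics:
        (task_span if accepts_task_span(metric.score) else regular).append(metric)
    return regular, task_span
```

The remaining hunks of `engine.py` continue below.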
@@ -94,11 +75,40 @@
         )
         return test_result_
 
-    def _evaluate_llm_task(
+    @opik.track(  # type: ignore[attr-defined,has-type]
+        name="task_span_metrics_calculation",
+        ignore_arguments=["test_case_"],
+    )
+    def _compute_scores_for_test_case_with_task_span(
+        self,
+        trace_id: str,
+        task_span: models.SpanModel,
+        test_case_: test_case.TestCase,
+    ) -> List[score_result.ScoreResult]:
+        score_results, mapped_scoring_inputs = (
+            self._metrics_evaluator.compute_task_span_scores(
+                dataset_item_content=test_case_.dataset_item_content,
+                task_output=test_case_.task_output,
+                task_span=task_span,
+            )
+        )
+        test_case_.mapped_scoring_inputs = mapped_scoring_inputs
+
+        # log feedback scores
+        rest_operations.log_test_result_feedback_scores(
+            client=self._client,
+            score_results=score_results,
+            trace_id=trace_id,
+            project_name=self._project_name,
+        )
+        return score_results
+
+    def _compute_test_result_for_llm_task(
         self,
         item: dataset_item.DatasetItem,
         task: LLMTask,
         trial_id: int,
+        experiment_: Optional[experiment.Experiment],
     ) -> test_result.TestResult:
         if not hasattr(task, "opik_tracked"):
             name = task.__name__ if hasattr(task, "__name__") else "llm_task"
@@ -113,7 +123,7 @@
         )
 
         with helpers.evaluate_llm_task_context(
-            experiment=self._experiment,
+            experiment=experiment_,
             dataset_item_id=item.id,
             trace_data=trace_data,
             client=self._client,
@@ -132,121 +142,53 @@
 
             opik_context.update_current_trace(output=task_output_)
 
-            scoring_inputs = arguments_helpers.create_scoring_inputs(
-                dataset_item=item_content,
-                task_output=task_output_,
-                scoring_key_mapping=self._scoring_key_mapping,
-            )
-
             test_case_ = test_case.TestCase(
                 trace_id=trace_data.id,
                 dataset_item_id=item.id,
-                scoring_inputs=scoring_inputs,
                 task_output=task_output_,
                 dataset_item_content=item_content,
             )
-            test_result_ = self._evaluate_test_case(
+            test_result_ = self._compute_test_result_for_test_case(
                 test_case_=test_case_,
                 trial_id=trial_id,
            )
 
            return test_result_
 
-    def evaluate_llm_tasks(
+    def _compute_test_results_for_llm_task(
         self,
-        dataset_: dataset.Dataset,
+        dataset_items: List[dataset_item.DatasetItem],
         task: LLMTask,
-        nb_samples: Optional[int],
-        dataset_item_ids: Optional[List[str]],
-        dataset_sampler: Optional[samplers.BaseDatasetSampler],
+        experiment_: Optional[experiment.Experiment],
         trial_count: int,
+        description: str,
     ) -> List[test_result.TestResult]:
-        task_span_scoring_enabled = False
-        if len(self._task_span_scoring_metrics) > 0:
-            message_processors_chain.toggle_local_emulator_message_processor(
-                active=True, chain=self._client._message_processor
-            )
-            task_span_scoring_enabled = True
-
-        dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(
-            nb_samples=nb_samples,
-            dataset_item_ids=dataset_item_ids,
-        )
-
-        if dataset_sampler is not None:
-            dataset_items = dataset_sampler.sample(dataset_items)
-
         test_results: List[test_result.TestResult] = []
 
         for trial_id in range(trial_count):
             evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
                 functools.partial(
-                    self._evaluate_llm_task,
+                    self._compute_test_result_for_llm_task,
                     item=item,
                     task=task,
                     trial_id=trial_id,
+                    experiment_=experiment_,
                 )
                 for item in dataset_items
             ]
 
             test_results += evaluation_tasks_executor.execute(
-                evaluation_tasks,
-                self._workers,
-                self._verbose,
-                desc=f"Evaluation trial {trial_id}"
+                evaluation_tasks=evaluation_tasks,
+                workers=self._workers,
+                verbose=self._verbose,
+                desc=f"{description} trial {trial_id}"
                 if trial_count > 1
-                else "Evaluation",
-            )
-
-        if task_span_scoring_enabled:
-            # flush Opik client to make sure all spans are collected
-            self._client.flush()
-
-            self._evaluate_llm_tasks_spans(test_results)
-
-            LOGGER.info(
-                "Task evaluation span handling is disabled — the evaluation has been completed."
-            )
-            message_processors_chain.toggle_local_emulator_message_processor(
-                active=False, chain=self._client._message_processor
+                else description,
             )
 
         return test_results
 
-    def _evaluate_llm_tasks_spans(
-        self, test_results: List[test_result.TestResult]
-    ) -> None:
-        local = message_processors_chain.get_local_emulator_message_processor(
-            chain=self._client._message_processor
-        )
-        if local is None:
-            LOGGER.warning("Local emulator message processor not found in the chain.")
-            return
-
-        # get trace trees from a local emulator
-        trace_trees = local.trace_trees
-        if len(trace_trees) == 0:
-            LOGGER.warning("No trace trees found in the local emulator.")
-            return
-
-        # create span evaluation tasks from LLM tasks evaluation results and evaluate them in parallel
-        span_evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
-            functools.partial(
-                self._evaluate_llm_task_result_span,
-                evaluation_task_result=test_result_,
-                trace_trees=trace_trees,
-            )
-            for test_result_ in test_results
-        ]
-
-        evaluation_tasks_executor.execute(
-            span_evaluation_tasks,
-            self._workers,
-            self._verbose,
-            desc="LLM task spans evaluation",
-        )
-
-    def _evaluate_llm_task_result_span(
+    def _update_test_result_with_task_span_metrics(
         self,
         evaluation_task_result: test_result.TestResult,
         trace_trees: List[models.TraceModel],
@@ -288,7 +230,7 @@
             ),
             client=self._client,
         ):
-            score_results = self._score_llm_task_result_span(
+            score_results = self._compute_scores_for_test_case_with_task_span(
                 trace_id=trace_id,
                 task_span=evaluation_span,
                 test_case_=evaluation_task_result.test_case,
@@ -297,129 +239,159 @@
             evaluation_task_result.score_results += score_results
         return evaluation_task_result
 
-    @opik.track(  # type: ignore[attr-defined,has-type]
-        name="task_span_metrics_calculation",
-        ignore_arguments=["test_case_"],
-    )
-    def _score_llm_task_result_span(
+    def _update_test_results_with_task_span_metrics(
         self,
-        trace_id: str,
-        task_span: models.SpanModel,
-        test_case_: test_case.TestCase,
-    ) -> List[score_result.ScoreResult]:
-        score_kwargs = {
-            **test_case_.scoring_inputs,
-            EVALUATION_SPAN_PARAMETER_NAME: task_span,
-        }
-
-        score_results = _scores_by_metrics(
-            scoring_metrics=self._task_span_scoring_metrics,
-            score_kwargs=score_kwargs,
-            scoring_key_mapping=self._scoring_key_mapping,
-            test_case_=test_case_,
+        test_results: List[test_result.TestResult],
+        recording: local_recording._LocalRecordingHandle,
+    ) -> None:
+        """Evaluate task spans from a local recording."""
+        # Get trace trees from the recording (this flushes automatically)
+        trace_trees = recording.trace_trees
+        if len(trace_trees) == 0:
+            LOGGER.warning("No trace trees found in the local recording.")
+            return
+
+        # Create span evaluation tasks from LLM tasks evaluation results and evaluate them in parallel
+        span_evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
+            functools.partial(
+                self._update_test_result_with_task_span_metrics,
+                evaluation_task_result=test_result_,
+                trace_trees=trace_trees,
+            )
+            for test_result_ in test_results
+        ]
+
+        evaluation_tasks_executor.execute(
+            evaluation_tasks=span_evaluation_tasks,
+            workers=self._workers,
+            verbose=self._verbose,
+            desc="LLM task spans evaluation",
        )
 
-        # log feedback scores
-        rest_operations.log_test_result_feedback_scores(
-            client=self._client,
-            score_results=score_results,
-            trace_id=trace_id,
-            project_name=self._project_name,
+        LOGGER.debug(
+            "Task evaluation span handling is disabled — the evaluation has been completed."
        )
-        return score_results
 
-    def evaluate_test_cases(
+    def evaluate_llm_task_on_dataset(
         self,
-        test_cases: List[test_case.TestCase],
+        dataset_: dataset.Dataset,
+        task: LLMTask,
+        nb_samples: Optional[int],
+        dataset_item_ids: Optional[List[str]],
+        dataset_sampler: Optional[samplers.BaseDatasetSampler],
+        trial_count: int,
+        experiment_: Optional[experiment.Experiment],
     ) -> List[test_result.TestResult]:
-        evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
-            functools.partial(
-                self._evaluate_test_case,
-                test_case_=test_case_,
+        dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(
+            nb_samples=nb_samples,
+            dataset_item_ids=dataset_item_ids,
+        )
+
+        if dataset_sampler is not None:
+            dataset_items = dataset_sampler.sample(dataset_items)
+
+        if not self._metrics_evaluator.has_task_span_metrics:
+            return self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=experiment_,
+                trial_count=trial_count,
+                description="Evaluation",
             )
-            for test_case_ in test_cases
-        ]
 
-        test_results = evaluation_tasks_executor.execute(
-            evaluation_tasks, self._workers, self._verbose
+        LOGGER.debug(
+            "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
+            len(self._metrics_evaluator.task_span_metrics),
        )
 
+        with local_recording.record_traces_locally(client=self._client) as recording:
+            test_results = self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=experiment_,
+                trial_count=trial_count,
+                description="Evaluation",
+            )
+            self._update_test_results_with_task_span_metrics(
+                test_results=test_results,
+                recording=recording,
+            )
+
         return test_results
 
+    def evaluate_llm_task_on_dict_items(
+        self,
+        items: List[Dict[str, Any]],
+        task: LLMTask,
+    ) -> List[test_result.TestResult]:
+        """
+        Evaluate an LLM task on a list of dict items.
+
+        This method creates traces for each evaluation but doesn't require a Dataset object
+        or experiment. It's useful for optimization scenarios where you have items in memory
+        and want to evaluate them with a task function.
+
+        Args:
+            items: List of dataset item contents (dictionaries).
+            task: A callable that takes a dataset item dict and returns a dict with outputs.
+
+        Returns:
+            List of TestResult objects containing scores for each item.
+        """
+        # Convert raw items to DatasetItem objects for compatibility
+        dataset_items = [
+            dataset_item.DatasetItem(
+                id=f"temp_item_{idx}",
+                **item,
+            )
+            for idx, item in enumerate(items)
+        ]
 
-def _scores_by_metrics(
-    scoring_metrics: List[base_metric.BaseMetric],
-    score_kwargs: Dict[str, Any],
-    scoring_key_mapping: Optional[ScoringKeyMappingType],
-    test_case_: test_case.TestCase,
-) -> List[score_result.ScoreResult]:
-    score_results: List[score_result.ScoreResult] = []
-    for metric in scoring_metrics:
-        try:
-            LOGGER.debug("Metric %s score started", metric.name)
-
-            if isinstance(metric, scorer_wrapper_metric.ScorerWrapperMetric):
-                # use original dataset item content without any mappings applied
-                if (
-                    task_span := score_kwargs.get(EVALUATION_SPAN_PARAMETER_NAME)
-                ) is not None:
-                    result = metric.score(
-                        dataset_item=test_case_.dataset_item_content,
-                        task_outputs=test_case_.task_output,
-                        task_span=task_span,
-                    )
-                else:
-                    result = metric.score(
-                        dataset_item=test_case_.dataset_item_content,
-                        task_outputs=test_case_.task_output,
-                    )
-            else:
-                arguments_validator.validate_score_arguments(
-                    metric=metric,
-                    kwargs=score_kwargs,
-                    scoring_key_mapping=scoring_key_mapping,
-                )
-                result = metric.score(**score_kwargs)
-
-            LOGGER.debug("Metric %s score ended", metric.name)
-
-            if isinstance(result, list):
-                score_results += result
-            else:
-                score_results.append(result)
-        except exceptions.ScoreMethodMissingArguments:
-            raise
-        except Exception as exception:
-            # This can be problematic if the metric returns a list of strings as we will not know the name of the metrics that have failed
-            LOGGER.error(
-                "Failed to compute metric %s. Score result will be marked as failed.",
-                metric.name,
-                exc_info=True,
+        if not self._metrics_evaluator.has_task_span_metrics:
+            return self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=None,
+                trial_count=1,
+                description="Items evaluation",
            )
 
-            if exception_analyzer.is_llm_provider_rate_limit_error(exception):
-                LOGGER.error(
-                    logging_messages.LLM_PROVIDER_RATE_LIMIT_ERROR_DETECTED_IN_EVALUATE_FUNCTION
-                )
+        LOGGER.debug(
+            "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
+            len(self._metrics_evaluator.task_span_metrics),
+        )
 
-            score_results.append(
-                score_result.ScoreResult(
-                    name=metric.name,
-                    value=0.0,
-                    reason=str(exception),
-                    scoring_failed=True,
-                )
+        with local_recording.record_traces_locally(client=self._client) as recording:
+            test_results = self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=None,
+                trial_count=1,
+                description="Items evaluation",
+            )
+            self._update_test_results_with_task_span_metrics(
+                test_results=test_results,
+                recording=recording,
            )
 
-    return score_results
+        return test_results
 
+    def evaluate_test_cases(
+        self,
+        test_cases: List[test_case.TestCase],
+    ) -> List[test_result.TestResult]:
+        evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
+            functools.partial(
+                self._compute_test_result_for_test_case,
+                test_case_=test_case_,
+            )
+            for test_case_ in test_cases
+        ]
 
-def _has_evaluation_span_parameter(func: Callable) -> bool:
-    try:
-        sig = inspect.signature(func)
-        has_param = EVALUATION_SPAN_PARAMETER_NAME in sig.parameters
-    except (ValueError, TypeError):
-        # If we can't inspect the signature, assume no parameter
-        has_param = False
+        test_results = evaluation_tasks_executor.execute(
+            evaluation_tasks=evaluation_tasks,
+            workers=self._workers,
+            verbose=self._verbose,
+        )
 
-    return has_param
+        return test_results
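The engine's new flow replaces the manual `toggle_local_emulator_message_processor` / `get_local_emulator_message_processor` calls with a `local_recording.record_traces_locally(client=...)` context manager whose handle exposes `trace_trees` for span-level scoring after the tasks have run. The sketch below illustrates that record-then-postprocess pattern only; the real `local_recording` module and its `client=` wiring are not part of this diff, so the names here are illustrative:

```python
# Generic sketch of the "record locally, then post-process" pattern the engine
# switches to; not the opik local_recording implementation.
import contextlib
from typing import Callable, Iterator, List


class Recorder:
    """Captures emitted trace trees while a recording context is active."""

    def __init__(self) -> None:
        self.trace_trees: List[dict] = []

    def emit(self, trace_tree: dict) -> None:
        self.trace_trees.append(trace_tree)


@contextlib.contextmanager
def record_traces_locally(emitters: List[Callable[[dict], None]]) -> Iterator[Recorder]:
    recorder = Recorder()
    emitters.append(recorder.emit)  # start capturing
    try:
        yield recorder
    finally:
        emitters.remove(recorder.emit)  # always stop capturing on exit


# Usage mirroring the new engine flow: run the tasks inside the context,
# then score the captured trace trees afterwards.
emitters: List[Callable[[dict], None]] = []


def run_task() -> None:
    for emit in list(emitters):
        emit({"name": "llm_task", "spans": [{"name": "llm_call"}]})


with record_traces_locally(emitters) as recording:
    run_task()
    captured = recording.trace_trees

print(len(captured))  # 1: trees are available for span-level scoring
```

Scoping the capture to a context manager also guarantees the recorder is detached even if a task raises, which the previous explicit toggle on/off sequence did not.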