opik 1.9.26__py3-none-any.whl → 1.9.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178)
  1. opik/__init__.py +10 -3
  2. opik/api_objects/dataset/rest_operations.py +2 -0
  3. opik/api_objects/experiment/experiment.py +31 -5
  4. opik/api_objects/experiment/helpers.py +34 -10
  5. opik/api_objects/local_recording.py +8 -3
  6. opik/api_objects/opik_client.py +218 -46
  7. opik/api_objects/opik_query_language.py +9 -0
  8. opik/api_objects/prompt/__init__.py +11 -3
  9. opik/api_objects/prompt/base_prompt.py +69 -0
  10. opik/api_objects/prompt/base_prompt_template.py +29 -0
  11. opik/api_objects/prompt/chat/__init__.py +1 -0
  12. opik/api_objects/prompt/chat/chat_prompt.py +193 -0
  13. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  14. opik/api_objects/prompt/{chat_content_renderer_registry.py → chat/content_renderer_registry.py} +31 -34
  15. opik/api_objects/prompt/client.py +101 -30
  16. opik/api_objects/prompt/text/__init__.py +1 -0
  17. opik/api_objects/prompt/{prompt.py → text/prompt.py} +55 -32
  18. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +8 -5
  19. opik/cli/export.py +6 -2
  20. opik/config.py +0 -5
  21. opik/decorator/base_track_decorator.py +37 -40
  22. opik/evaluation/__init__.py +13 -2
  23. opik/evaluation/engine/engine.py +195 -223
  24. opik/evaluation/engine/helpers.py +8 -7
  25. opik/evaluation/engine/metrics_evaluator.py +237 -0
  26. opik/evaluation/evaluation_result.py +35 -1
  27. opik/evaluation/evaluator.py +309 -23
  28. opik/evaluation/models/litellm/util.py +78 -6
  29. opik/evaluation/report.py +14 -2
  30. opik/evaluation/rest_operations.py +6 -9
  31. opik/evaluation/test_case.py +2 -2
  32. opik/evaluation/types.py +9 -1
  33. opik/exceptions.py +17 -0
  34. opik/id_helpers.py +18 -0
  35. opik/integrations/adk/helpers.py +16 -7
  36. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  37. opik/integrations/adk/opik_tracer.py +3 -1
  38. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  39. opik/integrations/dspy/callback.py +1 -4
  40. opik/integrations/haystack/opik_connector.py +2 -2
  41. opik/integrations/haystack/opik_tracer.py +2 -4
  42. opik/integrations/langchain/opik_tracer.py +1 -4
  43. opik/integrations/llama_index/callback.py +2 -4
  44. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  45. opik/integrations/openai/opik_tracker.py +1 -1
  46. opik/opik_context.py +7 -7
  47. opik/rest_api/__init__.py +123 -11
  48. opik/rest_api/dashboards/client.py +65 -2
  49. opik/rest_api/dashboards/raw_client.py +82 -0
  50. opik/rest_api/datasets/client.py +441 -2
  51. opik/rest_api/datasets/raw_client.py +1225 -505
  52. opik/rest_api/experiments/client.py +30 -2
  53. opik/rest_api/experiments/raw_client.py +26 -0
  54. opik/rest_api/optimizations/client.py +302 -0
  55. opik/rest_api/optimizations/raw_client.py +463 -0
  56. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  57. opik/rest_api/prompts/__init__.py +2 -2
  58. opik/rest_api/prompts/client.py +34 -4
  59. opik/rest_api/prompts/raw_client.py +32 -2
  60. opik/rest_api/prompts/types/__init__.py +3 -1
  61. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  62. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  63. opik/rest_api/traces/client.py +6 -6
  64. opik/rest_api/traces/raw_client.py +4 -4
  65. opik/rest_api/types/__init__.py +121 -11
  66. opik/rest_api/types/aggregation_data.py +1 -0
  67. opik/rest_api/types/automation_rule_evaluator.py +23 -1
  68. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  69. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  70. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  71. opik/rest_api/types/{automation_rule_evaluator_object_public.py → automation_rule_evaluator_object_object_public.py} +32 -10
  72. opik/rest_api/types/automation_rule_evaluator_page_public.py +2 -2
  73. opik/rest_api/types/automation_rule_evaluator_public.py +23 -1
  74. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  75. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  76. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  77. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  78. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  79. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  80. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  81. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  82. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  83. opik/rest_api/types/automation_rule_evaluator_update.py +23 -1
  84. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  85. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  86. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  87. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  88. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  89. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  90. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  91. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  92. opik/rest_api/types/automation_rule_evaluator_write.py +23 -1
  93. opik/rest_api/types/dashboard_page_public.py +1 -0
  94. opik/rest_api/types/dataset.py +2 -0
  95. opik/rest_api/types/dataset_item.py +1 -0
  96. opik/rest_api/types/dataset_item_compare.py +1 -0
  97. opik/rest_api/types/dataset_item_page_compare.py +1 -0
  98. opik/rest_api/types/dataset_item_page_public.py +1 -0
  99. opik/rest_api/types/dataset_item_public.py +1 -0
  100. opik/rest_api/types/dataset_public.py +2 -0
  101. opik/rest_api/types/dataset_public_status.py +5 -0
  102. opik/rest_api/types/dataset_status.py +5 -0
  103. opik/rest_api/types/dataset_version_diff.py +22 -0
  104. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  105. opik/rest_api/types/dataset_version_page_public.py +23 -0
  106. opik/rest_api/types/dataset_version_public.py +49 -0
  107. opik/rest_api/types/experiment.py +2 -0
  108. opik/rest_api/types/experiment_public.py +2 -0
  109. opik/rest_api/types/experiment_score.py +20 -0
  110. opik/rest_api/types/experiment_score_public.py +20 -0
  111. opik/rest_api/types/experiment_score_write.py +20 -0
  112. opik/rest_api/types/feedback_score_public.py +4 -0
  113. opik/rest_api/types/optimization.py +2 -0
  114. opik/rest_api/types/optimization_public.py +2 -0
  115. opik/rest_api/types/optimization_public_status.py +3 -1
  116. opik/rest_api/types/optimization_status.py +3 -1
  117. opik/rest_api/types/optimization_studio_config.py +27 -0
  118. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  119. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  120. opik/rest_api/types/optimization_studio_log.py +22 -0
  121. opik/rest_api/types/optimization_write.py +2 -0
  122. opik/rest_api/types/optimization_write_status.py +3 -1
  123. opik/rest_api/types/prompt.py +6 -0
  124. opik/rest_api/types/prompt_detail.py +6 -0
  125. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  126. opik/rest_api/types/prompt_public.py +6 -0
  127. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  128. opik/rest_api/types/prompt_template_structure.py +5 -0
  129. opik/rest_api/types/prompt_version.py +2 -0
  130. opik/rest_api/types/prompt_version_detail.py +2 -0
  131. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  132. opik/rest_api/types/prompt_version_public.py +2 -0
  133. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  134. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  135. opik/rest_api/types/score_name.py +1 -0
  136. opik/rest_api/types/service_toggles_config.py +5 -0
  137. opik/rest_api/types/span_filter.py +23 -0
  138. opik/rest_api/types/span_filter_operator.py +21 -0
  139. opik/rest_api/types/span_filter_write.py +23 -0
  140. opik/rest_api/types/span_filter_write_operator.py +21 -0
  141. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  142. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  143. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  144. opik/rest_api/types/studio_evaluation.py +20 -0
  145. opik/rest_api/types/studio_evaluation_public.py +20 -0
  146. opik/rest_api/types/studio_evaluation_write.py +20 -0
  147. opik/rest_api/types/studio_llm_model.py +21 -0
  148. opik/rest_api/types/studio_llm_model_public.py +21 -0
  149. opik/rest_api/types/studio_llm_model_write.py +21 -0
  150. opik/rest_api/types/studio_message.py +20 -0
  151. opik/rest_api/types/studio_message_public.py +20 -0
  152. opik/rest_api/types/studio_message_write.py +20 -0
  153. opik/rest_api/types/studio_metric.py +21 -0
  154. opik/rest_api/types/studio_metric_public.py +21 -0
  155. opik/rest_api/types/studio_metric_write.py +21 -0
  156. opik/rest_api/types/studio_optimizer.py +21 -0
  157. opik/rest_api/types/studio_optimizer_public.py +21 -0
  158. opik/rest_api/types/studio_optimizer_write.py +21 -0
  159. opik/rest_api/types/studio_prompt.py +20 -0
  160. opik/rest_api/types/studio_prompt_public.py +20 -0
  161. opik/rest_api/types/studio_prompt_write.py +20 -0
  162. opik/rest_api/types/trace.py +6 -0
  163. opik/rest_api/types/trace_public.py +6 -0
  164. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  165. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  166. opik/rest_api/types/value_entry.py +2 -0
  167. opik/rest_api/types/value_entry_compare.py +2 -0
  168. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  169. opik/rest_api/types/value_entry_public.py +2 -0
  170. opik/synchronization.py +5 -6
  171. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  172. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/METADATA +2 -1
  173. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/RECORD +177 -119
  174. opik/api_objects/prompt/chat_prompt_template.py +0 -200
  175. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/WHEEL +0 -0
  176. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/entry_points.txt +0 -0
  177. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/licenses/LICENSE +0 -0
  178. {opik-1.9.26.dist-info → opik-1.9.39.dist-info}/top_level.txt +0 -0
opik/decorator/base_track_decorator.py

@@ -14,7 +14,7 @@ from typing import (
     NamedTuple,
 )
 
-from .. import context_storage, logging_messages
+from .. import context_storage, logging_messages, tracing_runtime_config
 from ..api_objects import opik_client, span, trace
 from ..types import DistributedTraceHeadersDict, ErrorInfoDict, SpanType
 from . import (
@@ -24,7 +24,6 @@ from . import (
     inspect_helpers,
     opik_args,
     span_creation_handler,
-    tracing_runtime_config,
 )
 
 LOGGER = logging.getLogger(__name__)
@@ -337,25 +336,24 @@ class BaseTrackDecorator(abc.ABC):
                 )
                 error_info = error_info_collector.collect(exception)
                 func_exception = exception
-            finally:
-                stream_or_stream_manager = self._streams_handler(
-                    result,
-                    track_options.capture_output,
-                    track_options.generations_aggregator,
-                )
-                if stream_or_stream_manager is not None:
-                    return stream_or_stream_manager
-
-                self._after_call(
-                    output=result,
-                    error_info=error_info,
-                    capture_output=track_options.capture_output,
-                    flush=track_options.flush,
-                )
-                if func_exception is not None:
-                    raise func_exception
-                else:
-                    return result
+
+            stream_or_stream_manager = self._streams_handler(
+                result,
+                track_options.capture_output,
+                track_options.generations_aggregator,
+            )
+            if stream_or_stream_manager is not None:
+                return stream_or_stream_manager
+
+            self._after_call(
+                output=result,
+                error_info=error_info,
+                capture_output=track_options.capture_output,
+                flush=track_options.flush,
+            )
+            if func_exception is not None:
+                raise func_exception
+            return result
 
         wrapper.opik_tracked = True  # type: ignore
 
@@ -391,25 +389,24 @@ class BaseTrackDecorator(abc.ABC):
                 )
                 error_info = error_info_collector.collect(exception)
                 func_exception = exception
-            finally:
-                stream_or_stream_manager = self._streams_handler(
-                    result,
-                    track_options.capture_output,
-                    track_options.generations_aggregator,
-                )
-                if stream_or_stream_manager is not None:
-                    return stream_or_stream_manager
-
-                self._after_call(
-                    output=result,
-                    error_info=error_info,
-                    capture_output=track_options.capture_output,
-                    flush=track_options.flush,
-                )
-                if func_exception is not None:
-                    raise func_exception
-                else:
-                    return result
+
+            stream_or_stream_manager = self._streams_handler(
+                result,
+                track_options.capture_output,
+                track_options.generations_aggregator,
+            )
+            if stream_or_stream_manager is not None:
+                return stream_or_stream_manager
+
+            self._after_call(
+                output=result,
+                error_info=error_info,
+                capture_output=track_options.capture_output,
+                flush=track_options.flush,
+            )
+            if func_exception is not None:
+                raise func_exception
+            return result
 
         wrapper.opik_tracked = True  # type: ignore
         return wrapper
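In `opik/decorator/base_track_decorator.py` the post-call handling moves out of the `finally:` block and now runs as straight-line code after the `try`/`except`. The two layouts are close but not identical: a `finally` body also executes while an exception is propagating, and a `return` inside `finally` silently swallows that exception, which is the construct the new layout avoids. A minimal, self-contained sketch of the resulting control flow (illustrative only, not the opik implementation):

```python
# Minimal sketch of the control flow the diff converges on: capture the exception,
# run the post-call handling once, then re-raise or return. Not the opik code itself.
import functools
from typing import Any, Callable, Optional


def tracked(func: Callable[..., Any]) -> Callable[..., Any]:
    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        result: Any = None
        func_exception: Optional[Exception] = None
        try:
            result = func(*args, **kwargs)
        except Exception as exception:
            func_exception = exception  # collected here, not swallowed

        # Post-call handling lives after the try/except instead of inside `finally`,
        # so returning early here never returns out of a `finally` block.
        print(f"after_call: output={result!r}, error={func_exception!r}")

        if func_exception is not None:
            raise func_exception
        return result

    return wrapper


@tracked
def flaky(x: int) -> int:
    if x < 0:
        raise ValueError("negative input")
    return x * 2


print(flaky(3))  # runs the post-call handling, then returns 6
```

With this shape, the exception captured in `except` is re-raised explicitly after the post-call work, so nothing is swallowed implicitly.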
opik/evaluation/__init__.py

@@ -1,4 +1,15 @@
-from .evaluator import evaluate, evaluate_prompt, evaluate_experiment
+from .evaluator import (
+    evaluate,
+    evaluate_prompt,
+    evaluate_experiment,
+    evaluate_on_dict_items,
+)
 from .threads.evaluator import evaluate_threads
 
-__all__ = ["evaluate", "evaluate_prompt", "evaluate_experiment", "evaluate_threads"]
+__all__ = [
+    "evaluate",
+    "evaluate_prompt",
+    "evaluate_experiment",
+    "evaluate_on_dict_items",
+    "evaluate_threads",
+]
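`opik.evaluation` now re-exports `evaluate_on_dict_items` alongside the existing entry points. Its exact public signature is not part of this diff; the sketch below is a hypothetical usage example whose keyword names (`items`, `task`, `scoring_metrics`) are inferred from the engine changes further down and should be treated as assumptions:

```python
# Hypothetical usage sketch only: the exported name is confirmed by the diff above,
# but the keyword arguments are assumptions mirroring evaluate() and the engine code.
from opik.evaluation import evaluate_on_dict_items
from opik.evaluation.metrics import Equals  # existing opik metric


def task(item: dict) -> dict:
    # A trivial "LLM task": echo the reference so the metric passes.
    return {"output": item["reference"]}


results = evaluate_on_dict_items(
    items=[
        {"input": "2+2", "reference": "4"},
        {"input": "capital of France", "reference": "Paris"},
    ],
    task=task,
    scoring_metrics=[Equals()],  # assumed keyword, mirroring evaluate()
)
# results: expected to be a list of TestResult objects, per the engine docstring below.
```

The engine-level docstring (see `evaluate_llm_task_on_dict_items` below) confirms the shape of `items` and `task`: plain dictionaries in, a dictionary of outputs back, with no `Dataset` object or experiment required.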
opik/evaluation/engine/engine.py

@@ -1,13 +1,11 @@
 import functools
-import inspect
 import logging
-from typing import List, Optional, Callable, Any, Dict
+from typing import List, Optional, Any, Dict
 
-import opik.exceptions as exceptions
 import opik.logging_messages as logging_messages
 import opik.opik_context as opik_context
 import opik
-from opik.api_objects import opik_client, trace
+from opik.api_objects import opik_client, trace, local_recording
 from opik.api_objects.dataset import dataset, dataset_item
 from opik.api_objects.experiment import experiment
 from opik.evaluation import (
@@ -18,18 +16,15 @@ from opik.evaluation import (
 )
 from opik.evaluation.types import LLMTask, ScoringKeyMappingType
 
-from . import evaluation_tasks_executor, exception_analyzer, helpers
+from . import evaluation_tasks_executor, exception_analyzer, helpers, metrics_evaluator
 from .types import EvaluationTask
-from ..metrics import arguments_validator, arguments_helpers, base_metric, score_result
-from ..scorers import scorer_wrapper_metric
-from ...message_processing import message_processors_chain
+from ..metrics import base_metric, score_result
 from ...message_processing.emulation import models
 
 
 LOGGER = logging.getLogger(__name__)
 
 EVALUATION_TASK_NAME = "evaluation_task"
-EVALUATION_SPAN_PARAMETER_NAME = "task_span"
 
 
 class EvaluationEngine:
@@ -37,7 +32,6 @@ class EvaluationEngine:
         self,
         client: opik_client.Opik,
         project_name: Optional[str],
-        experiment_: experiment.Experiment,
         scoring_metrics: List[base_metric.BaseMetric],
         workers: int,
         verbose: int,
@@ -45,41 +39,28 @@
     ) -> None:
         self._client = client
         self._project_name = project_name
-        self._experiment = experiment_
         self._workers = workers
         self._verbose = verbose
-        self._scoring_metrics: List[base_metric.BaseMetric] = []
-        self._task_span_scoring_metrics: List[base_metric.BaseMetric] = []
-        self._scoring_key_mapping = scoring_key_mapping
 
-        # Analyze metrics
-        self._analyze_metrics(scoring_metrics)
-
-        if len(self._task_span_scoring_metrics) > 0:
-            LOGGER.info(
-                "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
-                len(self._task_span_scoring_metrics),
-            )
-
-    def _analyze_metrics(self, scoring_metrics: List[base_metric.BaseMetric]) -> None:
-        for metric in scoring_metrics:
-            if _has_evaluation_span_parameter(metric.score):
-                self._task_span_scoring_metrics.append(metric)
-            else:
-                self._scoring_metrics.append(metric)
+        # Delegate metric analysis to MetricsEvaluator
+        self._metrics_evaluator = metrics_evaluator.MetricsEvaluator(
+            scoring_metrics=scoring_metrics,
+            scoring_key_mapping=scoring_key_mapping,
+        )
 
     @opik.track(name="metrics_calculation")  # type: ignore[attr-defined,has-type]
-    def _evaluate_test_case(
+    def _compute_test_result_for_test_case(
         self,
         test_case_: test_case.TestCase,
         trial_id: int = 0,
     ) -> test_result.TestResult:
-        score_results = _scores_by_metrics(
-            scoring_metrics=self._scoring_metrics,
-            score_kwargs=test_case_.scoring_inputs,
-            scoring_key_mapping=self._scoring_key_mapping,
-            test_case_=test_case_,
+        score_results, mapped_scoring_inputs = (
+            self._metrics_evaluator.compute_regular_scores(
+                dataset_item_content=test_case_.dataset_item_content,
+                task_output=test_case_.task_output,
+            )
         )
+        test_case_.mapped_scoring_inputs = mapped_scoring_inputs
 
         test_result_ = test_result.TestResult(
             test_case=test_case_,
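The constructor no longer sorts metrics itself; that logic moves to the new `metrics_evaluator.MetricsEvaluator` (added as `opik/evaluation/engine/metrics_evaluator.py`, +237 lines in the file list). The removed `_analyze_metrics` and `_has_evaluation_span_parameter` helpers (the latter is deleted in the final hunk of this file, below) show the rule being delegated: a metric whose `score` method declares a `task_span` parameter is treated as a task-span metric. A rough sketch of that split, assuming `MetricsEvaluator` keeps the same signature-inspection rule (its source is not shown here):

```python
# Rough sketch of the metric split the removed engine code performed; the real
# MetricsEvaluator class is not shown in this diff, so names here are illustrative.
import inspect
from typing import Any, Callable, List, Tuple

TASK_SPAN_PARAMETER_NAME = "task_span"  # the removed EVALUATION_SPAN_PARAMETER_NAME


def accepts_task_span(func: Callable[..., Any]) -> bool:
    """True if the callable declares a `task_span` parameter (mirrors the removed helper)."""
    try:
        return TASK_SPAN_PARAMETER_NAME in inspect.signature(func).parameters
    except (ValueError, TypeError):
        # Some callables do not expose an inspectable signature; assume no parameter.
        return False


def split_metrics(metrics: List[Any]) -> Tuple[List[Any], List[Any]]:
    """Split metrics into (regular, task_span) buckets based on their score() signature."""
    regular: List[Any] = []
    task_span: List[Any] = []
    for metric in metrics:
        (task_span if accepts_task_span(metric.score) else regular).append(metric)
    return regular, task_span
```

The remaining hunks of `engine.py` continue below.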
@@ -94,11 +75,40 @@
         )
         return test_result_
 
-    def _evaluate_llm_task(
+    @opik.track(  # type: ignore[attr-defined,has-type]
+        name="task_span_metrics_calculation",
+        ignore_arguments=["test_case_"],
+    )
+    def _compute_scores_for_test_case_with_task_span(
+        self,
+        trace_id: str,
+        task_span: models.SpanModel,
+        test_case_: test_case.TestCase,
+    ) -> List[score_result.ScoreResult]:
+        score_results, mapped_scoring_inputs = (
+            self._metrics_evaluator.compute_task_span_scores(
+                dataset_item_content=test_case_.dataset_item_content,
+                task_output=test_case_.task_output,
+                task_span=task_span,
+            )
+        )
+        test_case_.mapped_scoring_inputs = mapped_scoring_inputs
+
+        # log feedback scores
+        rest_operations.log_test_result_feedback_scores(
+            client=self._client,
+            score_results=score_results,
+            trace_id=trace_id,
+            project_name=self._project_name,
+        )
+        return score_results
+
+    def _compute_test_result_for_llm_task(
         self,
         item: dataset_item.DatasetItem,
         task: LLMTask,
         trial_id: int,
+        experiment_: Optional[experiment.Experiment],
     ) -> test_result.TestResult:
         if not hasattr(task, "opik_tracked"):
             name = task.__name__ if hasattr(task, "__name__") else "llm_task"
@@ -113,7 +123,7 @@
         )
 
         with helpers.evaluate_llm_task_context(
-            experiment=self._experiment,
+            experiment=experiment_,
             dataset_item_id=item.id,
             trace_data=trace_data,
             client=self._client,
@@ -132,121 +142,53 @@
 
             opik_context.update_current_trace(output=task_output_)
 
-            scoring_inputs = arguments_helpers.create_scoring_inputs(
-                dataset_item=item_content,
-                task_output=task_output_,
-                scoring_key_mapping=self._scoring_key_mapping,
-            )
-
             test_case_ = test_case.TestCase(
                 trace_id=trace_data.id,
                 dataset_item_id=item.id,
-                scoring_inputs=scoring_inputs,
                 task_output=task_output_,
                 dataset_item_content=item_content,
             )
-            test_result_ = self._evaluate_test_case(
+            test_result_ = self._compute_test_result_for_test_case(
                 test_case_=test_case_,
                 trial_id=trial_id,
            )
 
            return test_result_
 
-    def evaluate_llm_tasks(
+    def _compute_test_results_for_llm_task(
         self,
-        dataset_: dataset.Dataset,
+        dataset_items: List[dataset_item.DatasetItem],
         task: LLMTask,
-        nb_samples: Optional[int],
-        dataset_item_ids: Optional[List[str]],
-        dataset_sampler: Optional[samplers.BaseDatasetSampler],
+        experiment_: Optional[experiment.Experiment],
         trial_count: int,
+        description: str,
     ) -> List[test_result.TestResult]:
-        task_span_scoring_enabled = False
-        if len(self._task_span_scoring_metrics) > 0:
-            message_processors_chain.toggle_local_emulator_message_processor(
-                active=True, chain=self._client._message_processor
-            )
-            task_span_scoring_enabled = True
-
-        dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(
-            nb_samples=nb_samples,
-            dataset_item_ids=dataset_item_ids,
-        )
-
-        if dataset_sampler is not None:
-            dataset_items = dataset_sampler.sample(dataset_items)
-
         test_results: List[test_result.TestResult] = []
 
         for trial_id in range(trial_count):
             evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
                 functools.partial(
-                    self._evaluate_llm_task,
+                    self._compute_test_result_for_llm_task,
                     item=item,
                     task=task,
                     trial_id=trial_id,
+                    experiment_=experiment_,
                 )
                 for item in dataset_items
             ]
 
             test_results += evaluation_tasks_executor.execute(
-                evaluation_tasks,
-                self._workers,
-                self._verbose,
-                desc=f"Evaluation trial {trial_id}"
+                evaluation_tasks=evaluation_tasks,
+                workers=self._workers,
+                verbose=self._verbose,
+                desc=f"{description} trial {trial_id}"
                 if trial_count > 1
-                else "Evaluation",
-            )
-
-        if task_span_scoring_enabled:
-            # flush Opik client to make sure all spans are collected
-            self._client.flush()
-
-            self._evaluate_llm_tasks_spans(test_results)
-
-            LOGGER.info(
-                "Task evaluation span handling is disabled — the evaluation has been completed."
-            )
-            message_processors_chain.toggle_local_emulator_message_processor(
-                active=False, chain=self._client._message_processor
+                else description,
             )
 
         return test_results
 
-    def _evaluate_llm_tasks_spans(
-        self, test_results: List[test_result.TestResult]
-    ) -> None:
-        local = message_processors_chain.get_local_emulator_message_processor(
-            chain=self._client._message_processor
-        )
-        if local is None:
-            LOGGER.warning("Local emulator message processor not found in the chain.")
-            return
-
-        # get trace trees from a local emulator
-        trace_trees = local.trace_trees
-        if len(trace_trees) == 0:
-            LOGGER.warning("No trace trees found in the local emulator.")
-            return
-
-        # create span evaluation tasks from LLM tasks evaluation results and evaluate them in parallel
-        span_evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
-            functools.partial(
-                self._evaluate_llm_task_result_span,
-                evaluation_task_result=test_result_,
-                trace_trees=trace_trees,
-            )
-            for test_result_ in test_results
-        ]
-
-        evaluation_tasks_executor.execute(
-            span_evaluation_tasks,
-            self._workers,
-            self._verbose,
-            desc="LLM task spans evaluation",
-        )
-
-    def _evaluate_llm_task_result_span(
+    def _update_test_result_with_task_span_metrics(
         self,
         evaluation_task_result: test_result.TestResult,
         trace_trees: List[models.TraceModel],
@@ -288,7 +230,7 @@
             ),
             client=self._client,
         ):
-            score_results = self._score_llm_task_result_span(
+            score_results = self._compute_scores_for_test_case_with_task_span(
                 trace_id=trace_id,
                 task_span=evaluation_span,
                 test_case_=evaluation_task_result.test_case,
@@ -297,129 +239,159 @@
             evaluation_task_result.score_results += score_results
         return evaluation_task_result
 
-    @opik.track(  # type: ignore[attr-defined,has-type]
-        name="task_span_metrics_calculation",
-        ignore_arguments=["test_case_"],
-    )
-    def _score_llm_task_result_span(
+    def _update_test_results_with_task_span_metrics(
         self,
-        trace_id: str,
-        task_span: models.SpanModel,
-        test_case_: test_case.TestCase,
-    ) -> List[score_result.ScoreResult]:
-        score_kwargs = {
-            **test_case_.scoring_inputs,
-            EVALUATION_SPAN_PARAMETER_NAME: task_span,
-        }
-
-        score_results = _scores_by_metrics(
-            scoring_metrics=self._task_span_scoring_metrics,
-            score_kwargs=score_kwargs,
-            scoring_key_mapping=self._scoring_key_mapping,
-            test_case_=test_case_,
+        test_results: List[test_result.TestResult],
+        recording: local_recording._LocalRecordingHandle,
+    ) -> None:
+        """Evaluate task spans from a local recording."""
+        # Get trace trees from the recording (this flushes automatically)
+        trace_trees = recording.trace_trees
+        if len(trace_trees) == 0:
+            LOGGER.warning("No trace trees found in the local recording.")
+            return
+
+        # Create span evaluation tasks from LLM tasks evaluation results and evaluate them in parallel
+        span_evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
+            functools.partial(
+                self._update_test_result_with_task_span_metrics,
+                evaluation_task_result=test_result_,
+                trace_trees=trace_trees,
+            )
+            for test_result_ in test_results
+        ]
+
+        evaluation_tasks_executor.execute(
+            evaluation_tasks=span_evaluation_tasks,
+            workers=self._workers,
+            verbose=self._verbose,
+            desc="LLM task spans evaluation",
        )
 
-        # log feedback scores
-        rest_operations.log_test_result_feedback_scores(
-            client=self._client,
-            score_results=score_results,
-            trace_id=trace_id,
-            project_name=self._project_name,
+        LOGGER.debug(
+            "Task evaluation span handling is disabled — the evaluation has been completed."
        )
-        return score_results
 
-    def evaluate_test_cases(
+    def evaluate_llm_task_on_dataset(
         self,
-        test_cases: List[test_case.TestCase],
+        dataset_: dataset.Dataset,
+        task: LLMTask,
+        nb_samples: Optional[int],
+        dataset_item_ids: Optional[List[str]],
+        dataset_sampler: Optional[samplers.BaseDatasetSampler],
+        trial_count: int,
+        experiment_: Optional[experiment.Experiment],
     ) -> List[test_result.TestResult]:
-        evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
-            functools.partial(
-                self._evaluate_test_case,
-                test_case_=test_case_,
+        dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(
+            nb_samples=nb_samples,
+            dataset_item_ids=dataset_item_ids,
+        )
+
+        if dataset_sampler is not None:
+            dataset_items = dataset_sampler.sample(dataset_items)
+
+        if not self._metrics_evaluator.has_task_span_metrics:
+            return self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=experiment_,
+                trial_count=trial_count,
+                description="Evaluation",
             )
-            for test_case_ in test_cases
-        ]
 
-        test_results = evaluation_tasks_executor.execute(
-            evaluation_tasks, self._workers, self._verbose
+        LOGGER.debug(
+            "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
+            len(self._metrics_evaluator.task_span_metrics),
        )
 
+        with local_recording.record_traces_locally(client=self._client) as recording:
+            test_results = self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=experiment_,
+                trial_count=trial_count,
+                description="Evaluation",
+            )
+            self._update_test_results_with_task_span_metrics(
+                test_results=test_results,
+                recording=recording,
+            )
+
         return test_results
 
+    def evaluate_llm_task_on_dict_items(
+        self,
+        items: List[Dict[str, Any]],
+        task: LLMTask,
+    ) -> List[test_result.TestResult]:
+        """
+        Evaluate an LLM task on a list of dict items.
+
+        This method creates traces for each evaluation but doesn't require a Dataset object
+        or experiment. It's useful for optimization scenarios where you have items in memory
+        and want to evaluate them with a task function.
+
+        Args:
+            items: List of dataset item contents (dictionaries).
+            task: A callable that takes a dataset item dict and returns a dict with outputs.
+
+        Returns:
+            List of TestResult objects containing scores for each item.
+        """
+        # Convert raw items to DatasetItem objects for compatibility
+        dataset_items = [
+            dataset_item.DatasetItem(
+                id=f"temp_item_{idx}",
+                **item,
+            )
+            for idx, item in enumerate(items)
+        ]
 
-def _scores_by_metrics(
-    scoring_metrics: List[base_metric.BaseMetric],
-    score_kwargs: Dict[str, Any],
-    scoring_key_mapping: Optional[ScoringKeyMappingType],
-    test_case_: test_case.TestCase,
-) -> List[score_result.ScoreResult]:
-    score_results: List[score_result.ScoreResult] = []
-    for metric in scoring_metrics:
-        try:
-            LOGGER.debug("Metric %s score started", metric.name)
-
-            if isinstance(metric, scorer_wrapper_metric.ScorerWrapperMetric):
-                # use original dataset item content without any mappings applied
-                if (
-                    task_span := score_kwargs.get(EVALUATION_SPAN_PARAMETER_NAME)
-                ) is not None:
-                    result = metric.score(
-                        dataset_item=test_case_.dataset_item_content,
-                        task_outputs=test_case_.task_output,
-                        task_span=task_span,
-                    )
-                else:
-                    result = metric.score(
-                        dataset_item=test_case_.dataset_item_content,
-                        task_outputs=test_case_.task_output,
-                    )
-            else:
-                arguments_validator.validate_score_arguments(
-                    metric=metric,
-                    kwargs=score_kwargs,
-                    scoring_key_mapping=scoring_key_mapping,
-                )
-                result = metric.score(**score_kwargs)
-
-            LOGGER.debug("Metric %s score ended", metric.name)
-
-            if isinstance(result, list):
-                score_results += result
-            else:
-                score_results.append(result)
-        except exceptions.ScoreMethodMissingArguments:
-            raise
-        except Exception as exception:
-            # This can be problematic if the metric returns a list of strings as we will not know the name of the metrics that have failed
-            LOGGER.error(
-                "Failed to compute metric %s. Score result will be marked as failed.",
-                metric.name,
-                exc_info=True,
+        if not self._metrics_evaluator.has_task_span_metrics:
+            return self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=None,
+                trial_count=1,
+                description="Items evaluation",
            )
 
-            if exception_analyzer.is_llm_provider_rate_limit_error(exception):
-                LOGGER.error(
-                    logging_messages.LLM_PROVIDER_RATE_LIMIT_ERROR_DETECTED_IN_EVALUATE_FUNCTION
-                )
+        LOGGER.debug(
+            "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
+            len(self._metrics_evaluator.task_span_metrics),
+        )
 
-            score_results.append(
-                score_result.ScoreResult(
-                    name=metric.name,
-                    value=0.0,
-                    reason=str(exception),
-                    scoring_failed=True,
-                )
+        with local_recording.record_traces_locally(client=self._client) as recording:
+            test_results = self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=None,
+                trial_count=1,
+                description="Items evaluation",
+            )
+            self._update_test_results_with_task_span_metrics(
+                test_results=test_results,
+                recording=recording,
            )
 
-    return score_results
+        return test_results
 
+    def evaluate_test_cases(
+        self,
+        test_cases: List[test_case.TestCase],
+    ) -> List[test_result.TestResult]:
+        evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
+            functools.partial(
+                self._compute_test_result_for_test_case,
+                test_case_=test_case_,
+            )
+            for test_case_ in test_cases
+        ]
 
-def _has_evaluation_span_parameter(func: Callable) -> bool:
-    try:
-        sig = inspect.signature(func)
-        has_param = EVALUATION_SPAN_PARAMETER_NAME in sig.parameters
-    except (ValueError, TypeError):
-        # If we can't inspect the signature, assume no parameter
-        has_param = False
+        test_results = evaluation_tasks_executor.execute(
+            evaluation_tasks=evaluation_tasks,
+            workers=self._workers,
+            verbose=self._verbose,
+        )
 
-    return has_param
+        return test_results
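The engine's new flow replaces the manual `toggle_local_emulator_message_processor` / `get_local_emulator_message_processor` calls with a `local_recording.record_traces_locally(client=...)` context manager whose handle exposes `trace_trees` for span-level scoring after the tasks have run. The sketch below illustrates that record-then-postprocess pattern only; the real `local_recording` module and its `client=` wiring are not part of this diff, so the names here are illustrative:

```python
# Generic sketch of the "record locally, then post-process" pattern the engine
# switches to; not the opik local_recording implementation.
import contextlib
from typing import Callable, Iterator, List


class Recorder:
    """Captures emitted trace trees while a recording context is active."""

    def __init__(self) -> None:
        self.trace_trees: List[dict] = []

    def emit(self, trace_tree: dict) -> None:
        self.trace_trees.append(trace_tree)


@contextlib.contextmanager
def record_traces_locally(emitters: List[Callable[[dict], None]]) -> Iterator[Recorder]:
    recorder = Recorder()
    emitters.append(recorder.emit)  # start capturing
    try:
        yield recorder
    finally:
        emitters.remove(recorder.emit)  # always stop capturing on exit


# Usage mirroring the new engine flow: run the tasks inside the context,
# then score the captured trace trees afterwards.
emitters: List[Callable[[dict], None]] = []


def run_task() -> None:
    for emit in list(emitters):
        emit({"name": "llm_task", "spans": [{"name": "llm_call"}]})


with record_traces_locally(emitters) as recording:
    run_task()
    captured = recording.trace_trees

print(len(captured))  # 1: trees are available for span-level scoring
```

Scoping the capture to a context manager also guarantees the recorder is detached even if a task raises, which the previous explicit toggle on/off sequence did not.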