opik 1.9.5__py3-none-any.whl → 1.9.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/__init__.py +10 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/dataset/rest_operations.py +5 -0
- opik/api_objects/experiment/experiment.py +46 -49
- opik/api_objects/experiment/helpers.py +34 -10
- opik/api_objects/local_recording.py +8 -3
- opik/api_objects/opik_client.py +230 -48
- opik/api_objects/opik_query_language.py +9 -0
- opik/api_objects/prompt/__init__.py +11 -3
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +193 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/{chat_content_renderer_registry.py → chat/content_renderer_registry.py} +37 -35
- opik/api_objects/prompt/client.py +101 -30
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +1 -1
- opik/cli/export.py +6 -2
- opik/cli/usage_report/charts.py +39 -10
- opik/cli/usage_report/cli.py +164 -45
- opik/cli/usage_report/pdf.py +14 -1
- opik/config.py +0 -5
- opik/decorator/base_track_decorator.py +37 -40
- opik/decorator/context_manager/span_context_manager.py +9 -0
- opik/decorator/context_manager/trace_context_manager.py +5 -0
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +195 -223
- opik/evaluation/engine/helpers.py +8 -7
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +35 -1
- opik/evaluation/evaluator.py +318 -30
- opik/evaluation/models/litellm/util.py +78 -6
- opik/evaluation/models/model_capabilities.py +33 -0
- opik/evaluation/report.py +14 -2
- opik/evaluation/rest_operations.py +36 -33
- opik/evaluation/test_case.py +2 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +17 -0
- opik/hooks/__init__.py +17 -1
- opik/hooks/anonymizer_hook.py +36 -0
- opik/id_helpers.py +18 -0
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +3 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +1 -6
- opik/integrations/dspy/callback.py +1 -4
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +2 -4
- opik/integrations/langchain/opik_tracer.py +273 -82
- opik/integrations/llama_index/callback.py +110 -108
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/message_processing/batching/batchers.py +11 -7
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +25 -1
- opik/message_processing/online_message_processor.py +23 -8
- opik/opik_context.py +7 -7
- opik/rest_api/__init__.py +188 -12
- opik/rest_api/client.py +3 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +893 -89
- opik/rest_api/datasets/raw_client.py +1328 -87
- opik/rest_api/experiments/client.py +30 -2
- opik/rest_api/experiments/raw_client.py +26 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/optimizations/client.py +302 -0
- opik/rest_api/optimizations/raw_client.py +463 -0
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +34 -4
- opik/rest_api/prompts/raw_client.py +32 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +148 -64
- opik/rest_api/spans/raw_client.py +210 -83
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +241 -73
- opik/rest_api/traces/raw_client.py +344 -90
- opik/rest_api/types/__init__.py +200 -15
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +6 -1
- opik/rest_api/types/alert_trigger_config_type.py +6 -1
- opik/rest_api/types/alert_trigger_config_write_type.py +6 -1
- opik/rest_api/types/automation_rule_evaluator.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/{automation_rule_evaluator_object_public.py → automation_rule_evaluator_object_object_public.py} +32 -10
- opik/rest_api/types/automation_rule_evaluator_page_public.py +2 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +23 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +2 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +23 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +1 -0
- opik/rest_api/types/dataset_item_page_public.py +1 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +2 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +49 -0
- opik/rest_api/types/experiment.py +2 -0
- opik/rest_api/types/experiment_public.py +2 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/feedback.py +20 -1
- opik/rest_api/types/feedback_create.py +16 -1
- opik/rest_api/types/feedback_object_public.py +22 -1
- opik/rest_api/types/feedback_public.py +20 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +16 -1
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +24 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +24 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +24 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +2 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +2 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +2 -0
- opik/rest_api/types/optimization.py +2 -0
- opik/rest_api/types/optimization_public.py +2 -0
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +2 -0
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +2 -0
- opik/rest_api/types/prompt_version_detail.py +2 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_public.py +2 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +6 -0
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +6 -0
- opik/rest_api/types/trace_public.py +6 -0
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_update.py +19 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/METADATA +5 -4
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/RECORD +246 -151
- opik/api_objects/prompt/chat_prompt_template.py +0 -164
- opik/api_objects/prompt/prompt.py +0 -131
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/WHEEL +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/entry_points.txt +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/licenses/LICENSE +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/top_level.txt +0 -0
opik/evaluation/engine/engine.py
CHANGED
@@ -1,13 +1,11 @@
 import functools
-import inspect
 import logging
-from typing import List, Optional,
+from typing import List, Optional, Any, Dict

-import opik.exceptions as exceptions
 import opik.logging_messages as logging_messages
 import opik.opik_context as opik_context
 import opik
-from opik.api_objects import opik_client, trace
+from opik.api_objects import opik_client, trace, local_recording
 from opik.api_objects.dataset import dataset, dataset_item
 from opik.api_objects.experiment import experiment
 from opik.evaluation import (
@@ -18,18 +16,15 @@ from opik.evaluation import (
 )
 from opik.evaluation.types import LLMTask, ScoringKeyMappingType

-from . import evaluation_tasks_executor, exception_analyzer, helpers
+from . import evaluation_tasks_executor, exception_analyzer, helpers, metrics_evaluator
 from .types import EvaluationTask
-from ..metrics import
-from ..scorers import scorer_wrapper_metric
-from ...message_processing import message_processors_chain
+from ..metrics import base_metric, score_result
 from ...message_processing.emulation import models


 LOGGER = logging.getLogger(__name__)

 EVALUATION_TASK_NAME = "evaluation_task"
-EVALUATION_SPAN_PARAMETER_NAME = "task_span"


 class EvaluationEngine:
@@ -37,7 +32,6 @@ class EvaluationEngine:
         self,
         client: opik_client.Opik,
         project_name: Optional[str],
-        experiment_: experiment.Experiment,
         scoring_metrics: List[base_metric.BaseMetric],
         workers: int,
         verbose: int,
@@ -45,41 +39,28 @@
     ) -> None:
         self._client = client
         self._project_name = project_name
-        self._experiment = experiment_
         self._workers = workers
         self._verbose = verbose
-        self._scoring_metrics: List[base_metric.BaseMetric] = []
-        self._task_span_scoring_metrics: List[base_metric.BaseMetric] = []
-        self._scoring_key_mapping = scoring_key_mapping

-        #
-        self.
-
-
-
-                "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
-                len(self._task_span_scoring_metrics),
-            )
-
-    def _analyze_metrics(self, scoring_metrics: List[base_metric.BaseMetric]) -> None:
-        for metric in scoring_metrics:
-            if _has_evaluation_span_parameter(metric.score):
-                self._task_span_scoring_metrics.append(metric)
-            else:
-                self._scoring_metrics.append(metric)
+        # Delegate metric analysis to MetricsEvaluator
+        self._metrics_evaluator = metrics_evaluator.MetricsEvaluator(
+            scoring_metrics=scoring_metrics,
+            scoring_key_mapping=scoring_key_mapping,
+        )

     @opik.track(name="metrics_calculation")  # type: ignore[attr-defined,has-type]
-    def
+    def _compute_test_result_for_test_case(
         self,
         test_case_: test_case.TestCase,
         trial_id: int = 0,
     ) -> test_result.TestResult:
-        score_results =
-
-
-
-
+        score_results, mapped_scoring_inputs = (
+            self._metrics_evaluator.compute_regular_scores(
+                dataset_item_content=test_case_.dataset_item_content,
+                task_output=test_case_.task_output,
+            )
         )
+        test_case_.mapped_scoring_inputs = mapped_scoring_inputs

         test_result_ = test_result.TestResult(
             test_case=test_case_,
@@ -94,11 +75,40 @@
         )
         return test_result_

-
+    @opik.track(  # type: ignore[attr-defined,has-type]
+        name="task_span_metrics_calculation",
+        ignore_arguments=["test_case_"],
+    )
+    def _compute_scores_for_test_case_with_task_span(
+        self,
+        trace_id: str,
+        task_span: models.SpanModel,
+        test_case_: test_case.TestCase,
+    ) -> List[score_result.ScoreResult]:
+        score_results, mapped_scoring_inputs = (
+            self._metrics_evaluator.compute_task_span_scores(
+                dataset_item_content=test_case_.dataset_item_content,
+                task_output=test_case_.task_output,
+                task_span=task_span,
+            )
+        )
+        test_case_.mapped_scoring_inputs = mapped_scoring_inputs
+
+        # log feedback scores
+        rest_operations.log_test_result_feedback_scores(
+            client=self._client,
+            score_results=score_results,
+            trace_id=trace_id,
+            project_name=self._project_name,
+        )
+        return score_results
+
+    def _compute_test_result_for_llm_task(
         self,
         item: dataset_item.DatasetItem,
         task: LLMTask,
         trial_id: int,
+        experiment_: Optional[experiment.Experiment],
     ) -> test_result.TestResult:
         if not hasattr(task, "opik_tracked"):
             name = task.__name__ if hasattr(task, "__name__") else "llm_task"
@@ -113,7 +123,7 @@
         )

         with helpers.evaluate_llm_task_context(
-            experiment=
+            experiment=experiment_,
             dataset_item_id=item.id,
             trace_data=trace_data,
             client=self._client,
@@ -132,121 +142,53 @@

             opik_context.update_current_trace(output=task_output_)

-            scoring_inputs = arguments_helpers.create_scoring_inputs(
-                dataset_item=item_content,
-                task_output=task_output_,
-                scoring_key_mapping=self._scoring_key_mapping,
-            )
-
             test_case_ = test_case.TestCase(
                 trace_id=trace_data.id,
                 dataset_item_id=item.id,
-                scoring_inputs=scoring_inputs,
                 task_output=task_output_,
                 dataset_item_content=item_content,
             )
-            test_result_ = self.
+            test_result_ = self._compute_test_result_for_test_case(
                 test_case_=test_case_,
                 trial_id=trial_id,
             )

         return test_result_

-    def
+    def _compute_test_results_for_llm_task(
         self,
-
+        dataset_items: List[dataset_item.DatasetItem],
         task: LLMTask,
-
-        dataset_item_ids: Optional[List[str]],
-        dataset_sampler: Optional[samplers.BaseDatasetSampler],
+        experiment_: Optional[experiment.Experiment],
         trial_count: int,
+        description: str,
     ) -> List[test_result.TestResult]:
-        task_span_scoring_enabled = False
-        if len(self._task_span_scoring_metrics) > 0:
-            message_processors_chain.toggle_local_emulator_message_processor(
-                active=True, chain=self._client._message_processor
-            )
-            task_span_scoring_enabled = True
-
-        dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(
-            nb_samples=nb_samples,
-            dataset_item_ids=dataset_item_ids,
-        )
-
-        if dataset_sampler is not None:
-            dataset_items = dataset_sampler.sample(dataset_items)
-
         test_results: List[test_result.TestResult] = []

         for trial_id in range(trial_count):
             evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
                 functools.partial(
-                    self.
+                    self._compute_test_result_for_llm_task,
                     item=item,
                     task=task,
                     trial_id=trial_id,
+                    experiment_=experiment_,
                 )
                 for item in dataset_items
             ]

             test_results += evaluation_tasks_executor.execute(
-                evaluation_tasks,
-                self._workers,
-                self._verbose,
-                desc=f"
+                evaluation_tasks=evaluation_tasks,
+                workers=self._workers,
+                verbose=self._verbose,
+                desc=f"{description} trial {trial_id}"
                 if trial_count > 1
-                else
-            )
-
-        if task_span_scoring_enabled:
-            # flush Opik client to make sure all spans are collected
-            self._client.flush()
-
-            self._evaluate_llm_tasks_spans(test_results)
-
-            LOGGER.info(
-                "Task evaluation span handling is disabled — the evaluation has been completed."
-            )
-            message_processors_chain.toggle_local_emulator_message_processor(
-                active=False, chain=self._client._message_processor
+                else description,
             )

         return test_results

-    def
-        self, test_results: List[test_result.TestResult]
-    ) -> None:
-        local = message_processors_chain.get_local_emulator_message_processor(
-            chain=self._client._message_processor
-        )
-        if local is None:
-            LOGGER.warning("Local emulator message processor not found in the chain.")
-            return
-
-        # get trace trees from a local emulator
-        trace_trees = local.trace_trees
-        if len(trace_trees) == 0:
-            LOGGER.warning("No trace trees found in the local emulator.")
-            return
-
-        # create span evaluation tasks from LLM tasks evaluation results and evaluate them in parallel
-        span_evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
-            functools.partial(
-                self._evaluate_llm_task_result_span,
-                evaluation_task_result=test_result_,
-                trace_trees=trace_trees,
-            )
-            for test_result_ in test_results
-        ]
-
-        evaluation_tasks_executor.execute(
-            span_evaluation_tasks,
-            self._workers,
-            self._verbose,
-            desc="LLM task spans evaluation",
-        )
-
-    def _evaluate_llm_task_result_span(
+    def _update_test_result_with_task_span_metrics(
         self,
         evaluation_task_result: test_result.TestResult,
         trace_trees: List[models.TraceModel],
@@ -288,7 +230,7 @@
             ),
             client=self._client,
         ):
-            score_results = self.
+            score_results = self._compute_scores_for_test_case_with_task_span(
                 trace_id=trace_id,
                 task_span=evaluation_span,
                 test_case_=evaluation_task_result.test_case,
@@ -297,129 +239,159 @@
             evaluation_task_result.score_results += score_results
         return evaluation_task_result

-
-        name="task_span_metrics_calculation",
-        ignore_arguments=["test_case_"],
-    )
-    def _score_llm_task_result_span(
+    def _update_test_results_with_task_span_metrics(
         self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        test_results: List[test_result.TestResult],
+        recording: local_recording._LocalRecordingHandle,
+    ) -> None:
+        """Evaluate task spans from a local recording."""
+        # Get trace trees from the recording (this flushes automatically)
+        trace_trees = recording.trace_trees
+        if len(trace_trees) == 0:
+            LOGGER.warning("No trace trees found in the local recording.")
+            return
+
+        # Create span evaluation tasks from LLM tasks evaluation results and evaluate them in parallel
+        span_evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
+            functools.partial(
+                self._update_test_result_with_task_span_metrics,
+                evaluation_task_result=test_result_,
+                trace_trees=trace_trees,
+            )
+            for test_result_ in test_results
+        ]
+
+        evaluation_tasks_executor.execute(
+            evaluation_tasks=span_evaluation_tasks,
+            workers=self._workers,
+            verbose=self._verbose,
+            desc="LLM task spans evaluation",
         )

-
-
-            client=self._client,
-            score_results=score_results,
-            trace_id=trace_id,
-            project_name=self._project_name,
+        LOGGER.debug(
+            "Task evaluation span handling is disabled — the evaluation has been completed."
         )
-        return score_results

-    def
+    def evaluate_llm_task_on_dataset(
         self,
-
+        dataset_: dataset.Dataset,
+        task: LLMTask,
+        nb_samples: Optional[int],
+        dataset_item_ids: Optional[List[str]],
+        dataset_sampler: Optional[samplers.BaseDatasetSampler],
+        trial_count: int,
+        experiment_: Optional[experiment.Experiment],
     ) -> List[test_result.TestResult]:
-
-
-
-
+        dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(
+            nb_samples=nb_samples,
+            dataset_item_ids=dataset_item_ids,
+        )
+
+        if dataset_sampler is not None:
+            dataset_items = dataset_sampler.sample(dataset_items)
+
+        if not self._metrics_evaluator.has_task_span_metrics:
+            return self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=experiment_,
+                trial_count=trial_count,
+                description="Evaluation",
            )
-            for test_case_ in test_cases
-        ]

-
-
+        LOGGER.debug(
+            "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
+            len(self._metrics_evaluator.task_span_metrics),
        )

+        with local_recording.record_traces_locally(client=self._client) as recording:
+            test_results = self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=experiment_,
+                trial_count=trial_count,
+                description="Evaluation",
+            )
+            self._update_test_results_with_task_span_metrics(
+                test_results=test_results,
+                recording=recording,
+            )
+
        return test_results

+    def evaluate_llm_task_on_dict_items(
+        self,
+        items: List[Dict[str, Any]],
+        task: LLMTask,
+    ) -> List[test_result.TestResult]:
+        """
+        Evaluate an LLM task on a list of dict items.
+
+        This method creates traces for each evaluation but doesn't require a Dataset object
+        or experiment. It's useful for optimization scenarios where you have items in memory
+        and want to evaluate them with a task function.
+
+        Args:
+            items: List of dataset item contents (dictionaries).
+            task: A callable that takes a dataset item dict and returns a dict with outputs.
+
+        Returns:
+            List of TestResult objects containing scores for each item.
+        """
+        # Convert raw items to DatasetItem objects for compatibility
+        dataset_items = [
+            dataset_item.DatasetItem(
+                id=f"temp_item_{idx}",
+                **item,
+            )
+            for idx, item in enumerate(items)
+        ]

-
-
-
-
-
-
-
-        for metric in scoring_metrics:
-            try:
-                LOGGER.debug("Metric %s score started", metric.name)
-
-                if isinstance(metric, scorer_wrapper_metric.ScorerWrapperMetric):
-                    # use original dataset item content without any mappings applied
-                    if (
-                        task_span := score_kwargs.get(EVALUATION_SPAN_PARAMETER_NAME)
-                    ) is not None:
-                        result = metric.score(
-                            dataset_item=test_case_.dataset_item_content,
-                            task_outputs=test_case_.task_output,
-                            task_span=task_span,
-                        )
-                    else:
-                        result = metric.score(
-                            dataset_item=test_case_.dataset_item_content,
-                            task_outputs=test_case_.task_output,
-                        )
-                else:
-                    arguments_validator.validate_score_arguments(
-                        metric=metric,
-                        kwargs=score_kwargs,
-                        scoring_key_mapping=scoring_key_mapping,
-                    )
-                    result = metric.score(**score_kwargs)
-
-                LOGGER.debug("Metric %s score ended", metric.name)
-
-                if isinstance(result, list):
-                    score_results += result
-                else:
-                    score_results.append(result)
-            except exceptions.ScoreMethodMissingArguments:
-                raise
-            except Exception as exception:
-                # This can be problematic if the metric returns a list of strings as we will not know the name of the metrics that have failed
-                LOGGER.error(
-                    "Failed to compute metric %s. Score result will be marked as failed.",
-                    metric.name,
-                    exc_info=True,
+        if not self._metrics_evaluator.has_task_span_metrics:
+            return self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=None,
+                trial_count=1,
+                description="Items evaluation",
            )

-
-
-
-
+        LOGGER.debug(
+            "Detected %d LLM task span scoring metrics — enabling handling of the LLM task evaluation span.",
+            len(self._metrics_evaluator.task_span_metrics),
+        )

-
-
-
-
-
-
-
+        with local_recording.record_traces_locally(client=self._client) as recording:
+            test_results = self._compute_test_results_for_llm_task(
+                dataset_items=dataset_items,
+                task=task,
+                experiment_=None,
+                trial_count=1,
+                description="Items evaluation",
+            )
+            self._update_test_results_with_task_span_metrics(
+                test_results=test_results,
+                recording=recording,
            )

-
+        return test_results

+    def evaluate_test_cases(
+        self,
+        test_cases: List[test_case.TestCase],
+    ) -> List[test_result.TestResult]:
+        evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
+            functools.partial(
+                self._compute_test_result_for_test_case,
+                test_case_=test_case_,
+            )
+            for test_case_ in test_cases
+        ]

-
-
-
-
-
-        # If we can't inspect the signature, assume no parameter
-        has_param = False
+        test_results = evaluation_tasks_executor.execute(
+            evaluation_tasks=evaluation_tasks,
+            workers=self._workers,
+            verbose=self._verbose,
+        )

-
+        return test_results
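The docstring above describes the new evaluate_llm_task_on_dict_items path, which needs neither a Dataset nor an Experiment. Below is a minimal sketch of how that internal path might be exercised; the class and method names come from the hunks above, while the client setup, the toy metric, the task function, and the exact constructor keywords (notably scoring_key_mapping) are illustrative assumptions rather than part of this release.

# Illustrative sketch only (not part of the wheel diff): the toy metric, the task
# function, and the exact EvaluationEngine keywords are assumptions inferred from
# the hunks above.
from typing import Any, Dict, List

from opik.api_objects import opik_client
from opik.evaluation.engine import engine
from opik.evaluation.metrics import base_metric, score_result


class ExactMatch(base_metric.BaseMetric):
    """Toy metric: 1.0 when the task output equals the expected answer."""

    def __init__(self) -> None:
        super().__init__(name="exact_match")

    def score(self, output: str, expected: str, **ignored: Any) -> score_result.ScoreResult:
        return score_result.ScoreResult(
            name=self.name, value=1.0 if output == expected else 0.0
        )


def task(item: Dict[str, Any]) -> Dict[str, Any]:
    # A real task would call an LLM; the engine only requires dict in, dict out.
    return {"output": item["question"].upper()}


items: List[Dict[str, Any]] = [
    {"question": "hello", "expected": "HELLO"},
    {"question": "world", "expected": "WORLD"},
]

evaluation_engine = engine.EvaluationEngine(
    client=opik_client.get_client_cached(),
    project_name="dict-items-demo",
    scoring_metrics=[ExactMatch()],
    workers=1,
    verbose=1,
    scoring_key_mapping=None,
)
test_results = evaluation_engine.evaluate_llm_task_on_dict_items(items=items, task=task)

Because no task-span metrics are involved here, this would follow the has_task_span_metrics branch that returns early and skips local trace recording.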
opik/evaluation/engine/helpers.py
CHANGED
@@ -11,7 +11,7 @@ import opik.context_storage as context_storage

 @contextlib.contextmanager
 def evaluate_llm_task_context(
-    experiment: experiment.Experiment,
+    experiment: Optional[experiment.Experiment],
     dataset_item_id: str,
     trace_data: trace.TraceData,
     client: opik_client.Opik,
@@ -36,12 +36,13 @@ def evaluate_llm_task_context(
     client = client if client is not None else opik_client.get_client_cached()
     client.trace(**trace_data.as_parameters)

-
-
-
-
-
-
+    # Only insert experiment item if an experiment is provided
+    if experiment is not None:
+        experiment_item_ = experiment_item.ExperimentItemReferences(
+            dataset_item_id=dataset_item_id,
+            trace_id=trace_data.id,
+        )
+        experiment.insert(experiment_items_references=[experiment_item_])


 @contextlib.contextmanager