opik 1.9.5__py3-none-any.whl → 1.9.39__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- opik/__init__.py +10 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/dataset/rest_operations.py +5 -0
- opik/api_objects/experiment/experiment.py +46 -49
- opik/api_objects/experiment/helpers.py +34 -10
- opik/api_objects/local_recording.py +8 -3
- opik/api_objects/opik_client.py +230 -48
- opik/api_objects/opik_query_language.py +9 -0
- opik/api_objects/prompt/__init__.py +11 -3
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +193 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/{chat_content_renderer_registry.py → chat/content_renderer_registry.py} +37 -35
- opik/api_objects/prompt/client.py +101 -30
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +1 -1
- opik/cli/export.py +6 -2
- opik/cli/usage_report/charts.py +39 -10
- opik/cli/usage_report/cli.py +164 -45
- opik/cli/usage_report/pdf.py +14 -1
- opik/config.py +0 -5
- opik/decorator/base_track_decorator.py +37 -40
- opik/decorator/context_manager/span_context_manager.py +9 -0
- opik/decorator/context_manager/trace_context_manager.py +5 -0
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +195 -223
- opik/evaluation/engine/helpers.py +8 -7
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +35 -1
- opik/evaluation/evaluator.py +318 -30
- opik/evaluation/models/litellm/util.py +78 -6
- opik/evaluation/models/model_capabilities.py +33 -0
- opik/evaluation/report.py +14 -2
- opik/evaluation/rest_operations.py +36 -33
- opik/evaluation/test_case.py +2 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +17 -0
- opik/hooks/__init__.py +17 -1
- opik/hooks/anonymizer_hook.py +36 -0
- opik/id_helpers.py +18 -0
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +3 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +1 -6
- opik/integrations/dspy/callback.py +1 -4
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +2 -4
- opik/integrations/langchain/opik_tracer.py +273 -82
- opik/integrations/llama_index/callback.py +110 -108
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/message_processing/batching/batchers.py +11 -7
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +25 -1
- opik/message_processing/online_message_processor.py +23 -8
- opik/opik_context.py +7 -7
- opik/rest_api/__init__.py +188 -12
- opik/rest_api/client.py +3 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +893 -89
- opik/rest_api/datasets/raw_client.py +1328 -87
- opik/rest_api/experiments/client.py +30 -2
- opik/rest_api/experiments/raw_client.py +26 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/optimizations/client.py +302 -0
- opik/rest_api/optimizations/raw_client.py +463 -0
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +34 -4
- opik/rest_api/prompts/raw_client.py +32 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +148 -64
- opik/rest_api/spans/raw_client.py +210 -83
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +241 -73
- opik/rest_api/traces/raw_client.py +344 -90
- opik/rest_api/types/__init__.py +200 -15
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +6 -1
- opik/rest_api/types/alert_trigger_config_type.py +6 -1
- opik/rest_api/types/alert_trigger_config_write_type.py +6 -1
- opik/rest_api/types/automation_rule_evaluator.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/{automation_rule_evaluator_object_public.py → automation_rule_evaluator_object_object_public.py} +32 -10
- opik/rest_api/types/automation_rule_evaluator_page_public.py +2 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +23 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +2 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +23 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +1 -0
- opik/rest_api/types/dataset_item_page_public.py +1 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +2 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +49 -0
- opik/rest_api/types/experiment.py +2 -0
- opik/rest_api/types/experiment_public.py +2 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/feedback.py +20 -1
- opik/rest_api/types/feedback_create.py +16 -1
- opik/rest_api/types/feedback_object_public.py +22 -1
- opik/rest_api/types/feedback_public.py +20 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +16 -1
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +24 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +24 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +24 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +2 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +2 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +2 -0
- opik/rest_api/types/optimization.py +2 -0
- opik/rest_api/types/optimization_public.py +2 -0
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +2 -0
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +2 -0
- opik/rest_api/types/prompt_version_detail.py +2 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_public.py +2 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +6 -0
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +6 -0
- opik/rest_api/types/trace_public.py +6 -0
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_update.py +19 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/METADATA +5 -4
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/RECORD +246 -151
- opik/api_objects/prompt/chat_prompt_template.py +0 -164
- opik/api_objects/prompt/prompt.py +0 -131
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/WHEEL +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/entry_points.txt +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/licenses/LICENSE +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/top_level.txt +0 -0
opik/evaluation/evaluator.py
CHANGED
@@ -2,10 +2,12 @@ import logging
 import time
 from typing import Any, Callable, Dict, List, Optional, Union, cast
 
-from .. import
+from ..api_objects.prompt import base_prompt
 from ..api_objects import opik_client
 from ..api_objects import dataset, experiment
 from ..api_objects.experiment import helpers as experiment_helpers
+from ..api_objects.prompt.chat import chat_prompt_template
+from ..api_objects.prompt import types as prompt_types
 from . import (
     asyncio_support,
     engine,
@@ -14,13 +16,12 @@ from . import (
     rest_operations,
     samplers,
 )
-from .metrics import base_metric
+from .metrics import base_metric, score_result
 from .models import ModelCapabilities, base_model, models_factory
 from .scorers import scorer_function, scorer_wrapper_metric
-from .
+from . import test_result
+from .types import ExperimentScoreFunction, LLMTask, ScoringKeyMappingType
 from .. import url_helpers
-from opik.api_objects.prompt.chat_prompt_template import ChatPromptTemplate
-from opik.api_objects.prompt.types import SupportedModalities
 
 LOGGER = logging.getLogger(__name__)
 MODALITY_SUPPORT_DOC_URL = (
@@ -41,23 +42,52 @@ def _try_notifying_about_experiment_completion(
     )
 
 
+def _compute_experiment_scores(
+    experiment_scoring_functions: List[ExperimentScoreFunction],
+    test_results: List[test_result.TestResult],
+) -> List[score_result.ScoreResult]:
+    """Compute experiment-level scores from test results."""
+    if not experiment_scoring_functions or not test_results:
+        return []
+
+    all_scores: List[score_result.ScoreResult] = []
+    for score_function in experiment_scoring_functions:
+        try:
+            scores = score_function(test_results)
+            # Handle Union[ScoreResult, List[ScoreResult]]
+            if isinstance(scores, list):
+                all_scores.extend(scores)
+            else:
+                all_scores.append(scores)
+        except Exception as e:
+            LOGGER.warning(
+                "Failed to compute experiment score: %s",
+                e,
+                exc_info=True,
+            )
+
+    return all_scores
+
+
 def evaluate(
     dataset: dataset.Dataset,
     task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[
-    prompts: Optional[List[
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
@@ -70,6 +100,10 @@ def evaluate(
         task: A callable object that takes dict with dataset item content
             as input and returns dict which will later be used for scoring.
 
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: The name of the experiment associated with evaluation run.
             If None, a generated name will be used.
 
@@ -117,7 +151,16 @@ def evaluate(
             If not provided, all samples in the dataset will be evaluated.
 
         trial_count: number of times to run the task and evaluate the task output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
+
     checked_prompts = experiment_helpers.handle_prompt_args(
         prompt=prompt,
         prompts=prompts,
@@ -125,6 +168,11 @@ def evaluate(
 
     client = opik_client.get_client_cached()
 
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -153,6 +201,7 @@ def evaluate(
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
     )
 
 
@@ -171,6 +220,7 @@ def _evaluate_task(
     dataset_item_ids: Optional[List[str]],
     dataset_sampler: Optional[samplers.BaseDatasetSampler],
     trial_count: int,
+    experiment_scoring_functions: List[ExperimentScoreFunction],
 ) -> evaluation_result.EvaluationResult:
     start_time = time.time()
 
@@ -178,25 +228,33 @@ def _evaluate_task(
     evaluation_engine = engine.EvaluationEngine(
         client=client,
         project_name=project_name,
-        experiment_=experiment,
         scoring_metrics=scoring_metrics,
         workers=task_threads,
         verbose=verbose,
         scoring_key_mapping=scoring_key_mapping,
     )
-    test_results = evaluation_engine.
+    test_results = evaluation_engine.evaluate_llm_task_on_dataset(
         dataset_=dataset,
         task=task,
         nb_samples=nb_samples,
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_=experiment,
     )
 
     total_time = time.time() - start_time
 
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
     if verbose >= 1:
-        report.display_experiment_results(
+        report.display_experiment_results(
+            dataset.name, total_time, test_results, computed_experiment_scores
+        )
 
     experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
@@ -210,6 +268,10 @@ def _evaluate_task(
 
     _try_notifying_about_experiment_completion(experiment)
 
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
         dataset_id=dataset.id,
         experiment_id=experiment.id,
@@ -217,6 +279,7 @@ def _evaluate_task(
         test_results=test_results,
         experiment_url=experiment_url,
         trial_count=trial_count,
+        experiment_scores=computed_experiment_scores,
     )
 
     if verbose >= 2:
@@ -236,6 +299,7 @@ def evaluate_experiment(
     verbose: int = 1,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     experiment_id: Optional[str] = None,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """Update the existing experiment with new evaluation metrics. You can use either `scoring_metrics` or `scorer_functions` to calculate
     evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
@@ -267,7 +331,15 @@ def evaluate_experiment(
             `{"input": "user_question"}` to map the "user_question" key to "input".
 
         experiment_id: The ID of the experiment to evaluate. If not provided, the experiment will be evaluated based on the experiment name.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
     start_time = time.time()
 
     client = opik_client.get_client_cached()
@@ -280,10 +352,11 @@ def evaluate_experiment(
         client=client, experiment_name=experiment_name
     )
 
+    dataset_ = client.get_dataset(name=experiment.dataset_name)
+
     test_cases = rest_operations.get_experiment_test_cases(
-
-
-        dataset_id=experiment.dataset_id,
+        experiment_=experiment,
+        dataset_=dataset_,
         scoring_key_mapping=scoring_key_mapping,
     )
     first_trace_id = test_cases[0].trace_id
@@ -302,7 +375,6 @@ def evaluate_experiment(
     evaluation_engine = engine.EvaluationEngine(
         client=client,
         project_name=project_name,
-        experiment_=experiment,
         scoring_metrics=scoring_metrics,
         workers=scoring_threads,
         verbose=verbose,
@@ -314,14 +386,23 @@ def evaluate_experiment(
 
     total_time = time.time() - start_time
 
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
     if verbose >= 1:
         report.display_experiment_results(
-
+            dataset_.name,
+            total_time,
+            test_results,
+            computed_experiment_scores,
         )
 
     experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
-        dataset_id=
+        dataset_id=dataset_.id,
         url_override=client.config.url_override,
     )
 
@@ -329,18 +410,23 @@ def evaluate_experiment(
 
     _try_notifying_about_experiment_completion(experiment)
 
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
-        dataset_id=
+        dataset_id=dataset_.id,
         experiment_id=experiment.id,
         experiment_name=experiment.name,
         test_results=test_results,
         experiment_url=experiment_url,
         trial_count=1,
+        experiment_scores=computed_experiment_scores,
     )
 
     if verbose >= 2:
         report.display_evaluation_scores_statistics(
-            dataset_name=
+            dataset_name=dataset_.name,
             evaluation_results=evaluation_result_,
         )
 
@@ -351,16 +437,22 @@ def _build_prompt_evaluation_task(
     model: base_model.OpikBaseModel, messages: List[Dict[str, Any]]
 ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
     supported_modalities = cast(
-        SupportedModalities,
+        prompt_types.SupportedModalities,
         {
             "vision": ModelCapabilities.supports_vision(
                 getattr(model, "model_name", None)
-            )
+            ),
+            "video": ModelCapabilities.supports_video(
+                getattr(model, "model_name", None)
+            ),
         },
     )
-
+    # Disable placeholder validation since we pass all dataset item fields to format()
+    chat_prompt_template_ = chat_prompt_template.ChatPromptTemplate(
+        messages=messages, validate_placeholders=False
+    )
 
-    required_modalities =
+    required_modalities = chat_prompt_template_.required_modalities()
     unsupported_modalities = {
         modality
         for modality in required_modalities
@@ -379,7 +471,7 @@ def _build_prompt_evaluation_task(
 
     def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
         template_type_override = prompt_variables.get("type")
-        processed_messages =
+        processed_messages = chat_prompt_template_.format(
             variables=prompt_variables,
             supported_modalities=supported_modalities,
             template_type=template_type_override,
@@ -402,16 +494,18 @@ def evaluate_prompt(
     model: Optional[Union[str, base_model.OpikBaseModel]] = None,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[
+    prompt: Optional[base_prompt.BasePrompt] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs prompt evaluation on a given dataset.
@@ -433,6 +527,10 @@ def evaluate_prompt(
             • task_outputs — a dictionary containing the LLM task output.
             • task_span - the data collected during the LLM task execution [optional].
 
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: name of the experiment.
 
         project_name: The name of the project to log data
@@ -453,7 +551,15 @@ def evaluate_prompt(
             If not provided, all samples in the dataset will be evaluated.
 
         trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
     if isinstance(model, str):
         opik_model = models_factory.get(model_name=model)
     elif not isinstance(model, base_model.OpikBaseModel):
@@ -477,6 +583,11 @@ def evaluate_prompt(
 
     prompts = [prompt] if prompt else None
 
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -497,25 +608,33 @@ def evaluate_prompt(
     evaluation_engine = engine.EvaluationEngine(
         client=client,
         project_name=project_name,
-        experiment_=experiment,
         scoring_metrics=scoring_metrics,
         workers=task_threads,
         verbose=verbose,
         scoring_key_mapping=None,
     )
-    test_results = evaluation_engine.
+    test_results = evaluation_engine.evaluate_llm_task_on_dataset(
        dataset_=dataset,
         task=_build_prompt_evaluation_task(model=opik_model, messages=messages),
         nb_samples=nb_samples,
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_=experiment,
     )
 
     total_time = time.time() - start_time
 
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
     if verbose >= 1:
-        report.display_experiment_results(
+        report.display_experiment_results(
+            dataset.name, total_time, test_results, computed_experiment_scores
+        )
 
     experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
@@ -529,6 +648,10 @@ def evaluate_prompt(
 
     _try_notifying_about_experiment_completion(experiment)
 
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
         experiment_id=experiment.id,
         dataset_id=dataset.id,
@@ -536,6 +659,7 @@ def evaluate_prompt(
         test_results=test_results,
         experiment_url=experiment_url,
         trial_count=trial_count,
+        experiment_scores=computed_experiment_scores,
     )
 
     if verbose >= 2:
@@ -552,18 +676,21 @@ def evaluate_optimization_trial(
     dataset: dataset.Dataset,
     task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[
-    prompts: Optional[List[
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset.
@@ -576,6 +703,17 @@ def evaluate_optimization_trial(
         task: A callable object that takes dict with dataset item content
             as input and returns dict which will later be used for scoring.
 
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function includes a scoring method that accepts predefined
+            arguments supplied by the evaluation engine:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+            • task_span - the data collected during the LLM task execution [optional].
+
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: The name of the experiment associated with evaluation run.
             If None, a generated name will be used.
 
@@ -615,7 +753,16 @@ def evaluate_optimization_trial(
             If not provided, all samples in the dataset will be evaluated.
 
         trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
+
     if scoring_metrics is None:
         scoring_metrics = []
 
@@ -624,8 +771,20 @@ def evaluate_optimization_trial(
         prompts=prompts,
     )
 
+    # wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
     client = opik_client.get_client_cached()
 
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -649,13 +808,128 @@ def evaluate_optimization_trial(
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
     )
 
 
-def
+def evaluate_on_dict_items(
+    items: List[Dict[str, Any]],
+    task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
     project_name: Optional[str] = None,
+    verbose: int = 0,
+    scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
+    scoring_threads: int = 16,
+) -> evaluation_result.EvaluationResultOnDictItems:
+    """
+    Lightweight evaluation function that evaluates a task on dataset items (as dictionaries)
+    without requiring a Dataset object or creating an experiment.
+
+    This function is useful for optimization scenarios where you need to evaluate many
+    candidate solutions quickly using Opik's metric infrastructure. It creates traces for
+    tracking but doesn't require experiment setup or dataset management.
+
+    Args:
+        items: List of dataset item contents (dictionaries with the data to evaluate).
+
+        task: A callable object that takes dict with dataset item content
+            as input and returns dict which will later be used for scoring.
+
+        scoring_metrics: List of metrics to calculate during evaluation.
+            Each metric's `score(...)` method will be called with arguments taken from
+            the dataset item and task output.
+
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function accepts predefined arguments:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+
+        project_name: The name of the project for logging traces.
+
+        verbose: Controls evaluation output logs and progress bars.
+            0 - no outputs (default), 1 - enable outputs.
+
+        scoring_key_mapping: A dictionary that allows you to rename keys present in either
+            the dataset item or the task output to match the keys expected by scoring metrics.
+
+        scoring_threads: Number of thread workers to run scoring metrics.
+
+    Returns:
+        EvaluationResultOnDictItems object containing test results and providing methods
+        to aggregate scores, similar to the regular evaluation result.
+
+    Example:
+        ```python
+        import opik
+        from opik.evaluation.metrics import Equals
+
+        items = [
+            {"input": "What is 2+2?", "expected_output": "4"},
+            {"input": "What is 3+3?", "expected_output": "6"},
+        ]
+
+        def my_task(item):
+            # Your LLM call here
+            question = item["input"]
+            # ... call model ...
+            return {"output": model_output}
+
+        result = opik.evaluate_on_dict_items(
+            items=items,
+            task=my_task,
+            scoring_metrics=[Equals()],
+            scoring_key_mapping={"reference": "expected_output"},
+        )
+
+        # Access individual test results
+        for test_result in result.test_results:
+            print(f"Score: {test_result.score_results[0].value}")
+
+        # Get aggregated statistics
+        aggregated = result.aggregate_evaluation_scores()
+        print(f"Mean equals score: {aggregated['equals_metric'].mean}")
+        ```
+    """
+    # Wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
+    if not scoring_metrics:
+        LOGGER.warning("No scoring metrics provided for items evaluation")
+        return evaluation_result.EvaluationResultOnDictItems(test_results=[])
+
+    client = opik_client.get_client_cached()
+
+    # Create evaluation engine
+    with asyncio_support.async_http_connections_expire_immediately():
+        evaluation_engine = engine.EvaluationEngine(
+            client=client,
+            project_name=project_name,
+            scoring_metrics=scoring_metrics,
+            workers=scoring_threads,
+            verbose=verbose,
+            scoring_key_mapping=scoring_key_mapping,
+        )
+
+        # Use the new evaluate_items method
+        test_results = evaluation_engine.evaluate_llm_task_on_dict_items(
            items=items,
            task=task,
        )
+
+    return evaluation_result.EvaluationResultOnDictItems(
+        test_results=test_results,
+    )
+
+
+def _wrap_scoring_functions(
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]],
+    scoring_metrics: Optional[List[base_metric.BaseMetric]],
+    project_name: Optional[str],
 ) -> List[base_metric.BaseMetric]:
     if scoring_functions:
         function_metrics = scorer_wrapper_metric.wrap_scorer_functions(
@@ -667,3 +941,17 @@ def _wrap_scoring_functions(
         scoring_metrics = function_metrics
 
     return scoring_metrics if scoring_metrics else []
+
+
+def _use_or_create_experiment_name(
+    experiment_name: Optional[str], experiment_name_prefix: Optional[str]
+) -> Optional[str]:
+    if experiment_name:
+        return experiment_name
+
+    if experiment_name_prefix:
+        return experiment_helpers.generate_unique_experiment_name(
+            experiment_name_prefix
+        )
+    else:
+        return None
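The main addition in this file is the `experiment_scoring_functions` hook on `evaluate`, `evaluate_experiment`, `evaluate_prompt`, and `evaluate_optimization_trial`: each callable receives the full list of `TestResult` objects, returns one `ScoreResult` (or a list of them), and the results are shown in the report and sent to the backend via `experiment.log_experiment_scores`. Below is a minimal usage sketch based only on the signatures and docstrings in this diff; the dataset name, the echo task, and the aggregate scorer are illustrative assumptions, not part of the release.

```python
from typing import List

import opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Equals
from opik.evaluation.metrics.score_result import ScoreResult
from opik.evaluation.test_result import TestResult


def fraction_fully_correct(test_results: List[TestResult]) -> ScoreResult:
    # Experiment-level aggregate: share of items whose per-item scores are all 1.0.
    passed = sum(
        1
        for result in test_results
        if result.score_results and all(s.value == 1.0 for s in result.score_results)
    )
    return ScoreResult(name="fraction_fully_correct", value=passed / len(test_results))


def task(item):
    # Placeholder task for illustration only; a real task would call an LLM here.
    return {"output": item["expected_output"]}


client = opik.Opik()
dataset = client.get_or_create_dataset(name="my-dataset")  # assumed to already contain items

result = evaluate(
    dataset=dataset,
    task=task,
    scoring_metrics=[Equals()],
    scoring_key_mapping={"reference": "expected_output"},
    experiment_name_prefix="scored-run",                    # new in 1.9.39: unique name under a shared prefix
    experiment_scoring_functions=[fraction_fully_correct],  # new in 1.9.39: experiment-level scores
)
print(result.experiment_scores)
```

Per the added `_compute_experiment_scores` helper, a scorer may return either a single `ScoreResult` or a list of them, and a scorer that raises only logs a warning instead of failing the run.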