opik 1.9.5__py3-none-any.whl → 1.9.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/__init__.py +10 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/dataset/rest_operations.py +5 -0
- opik/api_objects/experiment/experiment.py +46 -49
- opik/api_objects/experiment/helpers.py +34 -10
- opik/api_objects/local_recording.py +8 -3
- opik/api_objects/opik_client.py +230 -48
- opik/api_objects/opik_query_language.py +9 -0
- opik/api_objects/prompt/__init__.py +11 -3
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +193 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/{chat_content_renderer_registry.py → chat/content_renderer_registry.py} +37 -35
- opik/api_objects/prompt/client.py +101 -30
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +1 -1
- opik/cli/export.py +6 -2
- opik/cli/usage_report/charts.py +39 -10
- opik/cli/usage_report/cli.py +164 -45
- opik/cli/usage_report/pdf.py +14 -1
- opik/config.py +0 -5
- opik/decorator/base_track_decorator.py +37 -40
- opik/decorator/context_manager/span_context_manager.py +9 -0
- opik/decorator/context_manager/trace_context_manager.py +5 -0
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +195 -223
- opik/evaluation/engine/helpers.py +8 -7
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +35 -1
- opik/evaluation/evaluator.py +318 -30
- opik/evaluation/models/litellm/util.py +78 -6
- opik/evaluation/models/model_capabilities.py +33 -0
- opik/evaluation/report.py +14 -2
- opik/evaluation/rest_operations.py +36 -33
- opik/evaluation/test_case.py +2 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +17 -0
- opik/hooks/__init__.py +17 -1
- opik/hooks/anonymizer_hook.py +36 -0
- opik/id_helpers.py +18 -0
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +3 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +1 -6
- opik/integrations/dspy/callback.py +1 -4
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +2 -4
- opik/integrations/langchain/opik_tracer.py +273 -82
- opik/integrations/llama_index/callback.py +110 -108
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/message_processing/batching/batchers.py +11 -7
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +25 -1
- opik/message_processing/online_message_processor.py +23 -8
- opik/opik_context.py +7 -7
- opik/rest_api/__init__.py +188 -12
- opik/rest_api/client.py +3 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +893 -89
- opik/rest_api/datasets/raw_client.py +1328 -87
- opik/rest_api/experiments/client.py +30 -2
- opik/rest_api/experiments/raw_client.py +26 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/optimizations/client.py +302 -0
- opik/rest_api/optimizations/raw_client.py +463 -0
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +34 -4
- opik/rest_api/prompts/raw_client.py +32 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +148 -64
- opik/rest_api/spans/raw_client.py +210 -83
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +241 -73
- opik/rest_api/traces/raw_client.py +344 -90
- opik/rest_api/types/__init__.py +200 -15
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +6 -1
- opik/rest_api/types/alert_trigger_config_type.py +6 -1
- opik/rest_api/types/alert_trigger_config_write_type.py +6 -1
- opik/rest_api/types/automation_rule_evaluator.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/{automation_rule_evaluator_object_public.py → automation_rule_evaluator_object_object_public.py} +32 -10
- opik/rest_api/types/automation_rule_evaluator_page_public.py +2 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +23 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +2 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +23 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +1 -0
- opik/rest_api/types/dataset_item_page_public.py +1 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +2 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +49 -0
- opik/rest_api/types/experiment.py +2 -0
- opik/rest_api/types/experiment_public.py +2 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/feedback.py +20 -1
- opik/rest_api/types/feedback_create.py +16 -1
- opik/rest_api/types/feedback_object_public.py +22 -1
- opik/rest_api/types/feedback_public.py +20 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +16 -1
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +24 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +24 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +24 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +2 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +2 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +2 -0
- opik/rest_api/types/optimization.py +2 -0
- opik/rest_api/types/optimization_public.py +2 -0
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +2 -0
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +2 -0
- opik/rest_api/types/prompt_version_detail.py +2 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_public.py +2 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +6 -0
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +6 -0
- opik/rest_api/types/trace_public.py +6 -0
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_update.py +19 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/METADATA +5 -4
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/RECORD +246 -151
- opik/api_objects/prompt/chat_prompt_template.py +0 -164
- opik/api_objects/prompt/prompt.py +0 -131
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/WHEEL +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/entry_points.txt +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/licenses/LICENSE +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/top_level.txt +0 -0
opik/evaluation/engine/metrics_evaluator.py (new file)

```diff
@@ -0,0 +1,237 @@
+import inspect
+import logging
+from typing import List, Dict, Any, Optional, Callable, Tuple
+
+import opik.exceptions as exceptions
+import opik.logging_messages as logging_messages
+from opik.evaluation.metrics import (
+    arguments_helpers,
+    base_metric,
+    score_result,
+    arguments_validator,
+)
+from opik.evaluation.scorers import scorer_wrapper_metric
+from opik.evaluation.types import ScoringKeyMappingType
+from opik.message_processing.emulation import models
+
+from . import exception_analyzer
+
+
+LOGGER = logging.getLogger(__name__)
+
+EVALUATION_SPAN_PARAMETER_NAME = "task_span"
+
+
+def _has_evaluation_span_parameter(func: Callable) -> bool:
+    """Check if a scoring function expects the task_span parameter."""
+    try:
+        sig = inspect.signature(func)
+        return EVALUATION_SPAN_PARAMETER_NAME in sig.parameters
+    except (ValueError, TypeError):
+        return False
+
+
+def _compute_metric_scores(
+    scoring_metrics: List[base_metric.BaseMetric],
+    mapped_scoring_inputs: Dict[str, Any],
+    scoring_key_mapping: Optional[ScoringKeyMappingType],
+    dataset_item_content: Dict[str, Any],
+    task_output: Dict[str, Any],
+) -> List[score_result.ScoreResult]:
+    """
+    Compute scores using given metrics.
+
+    Args:
+        scoring_metrics: List of metrics to compute
+        mapped_scoring_inputs: Scoring inputs after key mapping (will be used for regular metrics)
+        scoring_key_mapping: Optional mapping for renaming score arguments
+        dataset_item_content: Dataset item content (will be used for ScorerWrapperMetric)
+        task_output: Task output (will be used for ScorerWrapperMetric)
+
+    Returns:
+        List of computed score results
+    """
+    score_results: List[score_result.ScoreResult] = []
+
+    for metric in scoring_metrics:
+        try:
+            LOGGER.debug("Metric %s score started", metric.name)
+
+            if isinstance(metric, scorer_wrapper_metric.ScorerWrapperMetric):
+                # ScorerWrapperMetric uses original dataset item and task output without mappings
+                if (
+                    task_span := mapped_scoring_inputs.get(
+                        EVALUATION_SPAN_PARAMETER_NAME
+                    )
+                ) is not None:
+                    result = metric.score(
+                        dataset_item=dataset_item_content,
+                        task_outputs=task_output,
+                        task_span=task_span,
+                    )
+                else:
+                    result = metric.score(
+                        dataset_item=dataset_item_content,
+                        task_outputs=task_output,
+                    )
+            else:
+                arguments_validator.validate_score_arguments(
+                    metric=metric,
+                    kwargs=mapped_scoring_inputs,
+                    scoring_key_mapping=scoring_key_mapping,
+                )
+                result = metric.score(**mapped_scoring_inputs)
+
+            LOGGER.debug("Metric %s score ended", metric.name)
+
+            if isinstance(result, list):
+                score_results += result
+            else:
+                score_results.append(result)
+
+        except exceptions.ScoreMethodMissingArguments:
+            raise
+        except Exception as exception:
+            LOGGER.error(
+                "Failed to compute metric %s. Score result will be marked as failed.",
+                metric.name,
+                exc_info=True,
+            )
+
+            if exception_analyzer.is_llm_provider_rate_limit_error(exception):
+                LOGGER.error(
+                    logging_messages.LLM_PROVIDER_RATE_LIMIT_ERROR_DETECTED_IN_EVALUATE_FUNCTION
+                )
+
+            score_results.append(
+                score_result.ScoreResult(
+                    name=metric.name,
+                    value=0.0,
+                    reason=str(exception),
+                    scoring_failed=True,
+                )
+            )
+
+    return score_results
+
+
+class MetricsEvaluator:
+    """
+    Handles metric computation and scoring.
+
+    Separates metrics into:
+    - Regular metrics: Score based on inputs/outputs
+    - Task span metrics: Score based on LLM call metadata (tokens, latency, etc)
+    """
+
+    def __init__(
+        self,
+        scoring_metrics: List[base_metric.BaseMetric],
+        scoring_key_mapping: Optional[ScoringKeyMappingType],
+    ):
+        self._scoring_key_mapping = scoring_key_mapping
+        self._regular_metrics: List[base_metric.BaseMetric] = []
+        self._task_span_metrics: List[base_metric.BaseMetric] = []
+
+        self._analyze_metrics(scoring_metrics)
+
+    @property
+    def has_task_span_metrics(self) -> bool:
+        """Check if any task span scoring metrics are configured."""
+        return len(self._task_span_metrics) > 0
+
+    @property
+    def task_span_metrics(self) -> List[base_metric.BaseMetric]:
+        """Get list of task span scoring metrics."""
+        return self._task_span_metrics
+
+    @property
+    def regular_metrics(self) -> List[base_metric.BaseMetric]:
+        """Get list of regular scoring metrics."""
+        return self._regular_metrics
+
+    def _analyze_metrics(
+        self,
+        scoring_metrics: List[base_metric.BaseMetric],
+    ) -> None:
+        """Separate metrics into regular and task-span categories."""
+        for metric in scoring_metrics:
+            if _has_evaluation_span_parameter(metric.score):
+                self._task_span_metrics.append(metric)
+            else:
+                self._regular_metrics.append(metric)
+
+        if self.has_task_span_metrics:
+            LOGGER.debug(
+                "Detected %d LLM task span scoring metrics.",
+                len(self._task_span_metrics),
+            )
+
+    def compute_regular_scores(
+        self,
+        dataset_item_content: Dict[str, Any],
+        task_output: Dict[str, Any],
+    ) -> Tuple[List[score_result.ScoreResult], Dict[str, Any]]:
+        """
+        Compute scores using regular metrics.
+
+        Args:
+            dataset_item_content: Dataset item content
+            task_output: Task output
+
+        Returns:
+            Tuple of (score results, mapped scoring inputs used for scoring regular non-wrapper metrics)
+        """
+        mapped_scoring_inputs = arguments_helpers.create_scoring_inputs(
+            dataset_item=dataset_item_content,
+            task_output=task_output,
+            scoring_key_mapping=self._scoring_key_mapping,
+        )
+
+        score_results = _compute_metric_scores(
+            scoring_metrics=self._regular_metrics,
+            mapped_scoring_inputs=mapped_scoring_inputs,
+            scoring_key_mapping=self._scoring_key_mapping,
+            dataset_item_content=dataset_item_content,
+            task_output=task_output,
+        )
+
+        return score_results, mapped_scoring_inputs
+
+    def compute_task_span_scores(
+        self,
+        dataset_item_content: Dict[str, Any],
+        task_output: Dict[str, Any],
+        task_span: models.SpanModel,
+    ) -> Tuple[List[score_result.ScoreResult], Dict[str, Any]]:
+        """
+        Compute scores using task span metrics.
+
+        Args:
+            dataset_item_content: Dataset item content
+            task_output: Task output
+            task_span: Span model containing task execution metadata
+
+        Returns:
+            Tuple of (score results, mapped scoring inputs used for scoring regular non-wrapper metrics)
+        """
+        mapped_scoring_inputs = arguments_helpers.create_scoring_inputs(
+            dataset_item=dataset_item_content,
+            task_output=task_output,
+            scoring_key_mapping=self._scoring_key_mapping,
+        )
+
+        mapped_scoring_inputs_with_span = {
+            **mapped_scoring_inputs,
+            EVALUATION_SPAN_PARAMETER_NAME: task_span,
+        }
+
+        score_results = _compute_metric_scores(
+            scoring_metrics=self._task_span_metrics,
+            mapped_scoring_inputs=mapped_scoring_inputs_with_span,
+            scoring_key_mapping=self._scoring_key_mapping,
+            dataset_item_content=dataset_item_content,
+            task_output=task_output,
+        )
+
+        return score_results, mapped_scoring_inputs_with_span
```
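For orientation, a minimal usage sketch of the `MetricsEvaluator` introduced above. It relies only on the constructor, properties, and `compute_regular_scores` visible in this diff; the `Equals` metric and the example dataset/output keys are illustrative assumptions, not something the diff itself asserts.

```python
# Illustrative sketch only (not part of the package diff).
# Assumes opik.evaluation.metrics.Equals exists and scores `output` against `reference`.
from opik.evaluation.engine.metrics_evaluator import MetricsEvaluator
from opik.evaluation.metrics import Equals

evaluator = MetricsEvaluator(
    scoring_metrics=[Equals()],  # no `task_span` parameter -> classified as a regular metric
    scoring_key_mapping=None,
)

# Metrics are split at construction time based on whether their score()
# signature declares a `task_span` parameter.
assert evaluator.regular_metrics and not evaluator.has_task_span_metrics

# compute_regular_scores merges the dataset item and task output into the
# scoring inputs, validates them, and returns the scores plus the mapped inputs.
score_results, mapped_inputs = evaluator.compute_regular_scores(
    dataset_item_content={"input": "What is 2 + 2?", "reference": "4"},
    task_output={"output": "4"},
)
for result in score_results:
    print(result.name, result.value, result.scoring_failed)
```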
opik/evaluation/evaluation_result.py

```diff
@@ -1,10 +1,14 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, TYPE_CHECKING
 from collections import defaultdict
 import logging
 
 import dataclasses
 
 from . import score_statistics, test_result
+from .metrics import score_result
+
+if TYPE_CHECKING:
+    pass
 
 LOGGER = logging.getLogger(__name__)
 
@@ -68,6 +72,9 @@ class EvaluationResult:
     test_results: List[test_result.TestResult]
     experiment_url: Optional[str]
     trial_count: int
+    experiment_scores: List[score_result.ScoreResult] = dataclasses.field(
+        default_factory=list
+    )
 
     def aggregate_evaluation_scores(self) -> EvaluationResultAggregatedScoresView:
         """
@@ -143,3 +150,30 @@ class EvaluationResult:
         )
 
         return dataset_items_results
+
+
+@dataclasses.dataclass
+class EvaluationResultOnDictItems:
+    """
+    Evaluation result for dict items evaluation without experiment tracking.
+
+    This class provides a similar interface to EvaluationResult but is designed
+    for lightweight evaluations that don't require experiment or dataset management.
+    It can aggregate scores across test results just like the regular evaluation.
+
+    Attributes:
+        test_results: Collection of test results from the evaluation.
+    """
+
+    test_results: List[test_result.TestResult]
+
+    def aggregate_evaluation_scores(
+        self,
+    ) -> Dict[str, score_statistics.ScoreStatistics]:
+        """
+        Aggregates evaluation scores from test results.
+
+        Returns:
+            Dictionary mapping score names to their aggregated statistics.
+        """
+        return score_statistics.calculate_aggregated_statistics(self.test_results)
```
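Similarly, a minimal sketch of the new `EvaluationResultOnDictItems` dataclass added to `opik/evaluation/evaluation_result.py`. Only the constructor and `aggregate_evaluation_scores` shown in the diff are used; how `test_results` is normally populated (e.g. from an evaluation run) is an assumption here.

```python
# Illustrative sketch only (not part of the package diff).
from opik.evaluation.evaluation_result import EvaluationResultOnDictItems

# `test_results` would normally come from an evaluation run, e.g. the
# `test_results` attribute of a regular EvaluationResult; an empty list is
# used here only to keep the sketch self-contained.
lightweight = EvaluationResultOnDictItems(test_results=[])

# Returns a dict keyed by score name with aggregated statistics per metric.
aggregated = lightweight.aggregate_evaluation_scores()
print(aggregated)
```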
|