agenta 0.52.6__py3-none-any.whl → 0.63.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenta/__init__.py +12 -3
- agenta/client/__init__.py +4 -4
- agenta/client/backend/__init__.py +4 -4
- agenta/client/backend/api_keys/client.py +2 -2
- agenta/client/backend/billing/client.py +2 -2
- agenta/client/backend/billing/raw_client.py +2 -2
- agenta/client/backend/client.py +56 -48
- agenta/client/backend/core/client_wrapper.py +2 -2
- agenta/client/backend/core/file.py +3 -1
- agenta/client/backend/core/http_client.py +3 -3
- agenta/client/backend/core/pydantic_utilities.py +13 -3
- agenta/client/backend/human_evaluations/client.py +2 -2
- agenta/client/backend/human_evaluations/raw_client.py +2 -2
- agenta/client/backend/organization/client.py +46 -34
- agenta/client/backend/organization/raw_client.py +32 -26
- agenta/client/backend/raw_client.py +26 -26
- agenta/client/backend/testsets/client.py +18 -18
- agenta/client/backend/testsets/raw_client.py +30 -30
- agenta/client/backend/types/__init__.py +4 -4
- agenta/client/backend/types/account_request.py +3 -1
- agenta/client/backend/types/account_response.py +3 -1
- agenta/client/backend/types/agenta_node_dto.py +3 -1
- agenta/client/backend/types/agenta_nodes_response.py +3 -1
- agenta/client/backend/types/agenta_root_dto.py +3 -1
- agenta/client/backend/types/agenta_roots_response.py +3 -1
- agenta/client/backend/types/agenta_tree_dto.py +3 -1
- agenta/client/backend/types/agenta_trees_response.py +3 -1
- agenta/client/backend/types/aggregated_result.py +3 -1
- agenta/client/backend/types/analytics_response.py +3 -1
- agenta/client/backend/types/annotation.py +6 -4
- agenta/client/backend/types/annotation_create.py +3 -1
- agenta/client/backend/types/annotation_edit.py +3 -1
- agenta/client/backend/types/annotation_link.py +3 -1
- agenta/client/backend/types/annotation_link_response.py +3 -1
- agenta/client/backend/types/annotation_query.py +3 -1
- agenta/client/backend/types/annotation_query_request.py +3 -1
- agenta/client/backend/types/annotation_reference.py +3 -1
- agenta/client/backend/types/annotation_references.py +3 -1
- agenta/client/backend/types/annotation_response.py +3 -1
- agenta/client/backend/types/annotations_response.py +3 -1
- agenta/client/backend/types/app.py +3 -1
- agenta/client/backend/types/app_variant_response.py +3 -1
- agenta/client/backend/types/app_variant_revision.py +3 -1
- agenta/client/backend/types/artifact.py +6 -4
- agenta/client/backend/types/base_output.py +3 -1
- agenta/client/backend/types/body_fetch_workflow_revision.py +3 -1
- agenta/client/backend/types/body_import_testset.py +3 -1
- agenta/client/backend/types/bucket_dto.py +3 -1
- agenta/client/backend/types/collect_status_response.py +3 -1
- agenta/client/backend/types/config_db.py +3 -1
- agenta/client/backend/types/config_dto.py +3 -1
- agenta/client/backend/types/config_response_model.py +3 -1
- agenta/client/backend/types/correct_answer.py +3 -1
- agenta/client/backend/types/create_app_output.py +3 -1
- agenta/client/backend/types/custom_model_settings_dto.py +3 -1
- agenta/client/backend/types/custom_provider_dto.py +3 -1
- agenta/client/backend/types/custom_provider_kind.py +1 -1
- agenta/client/backend/types/custom_provider_settings_dto.py +3 -1
- agenta/client/backend/types/delete_evaluation.py +3 -1
- agenta/client/backend/types/environment_output.py +3 -1
- agenta/client/backend/types/environment_output_extended.py +3 -1
- agenta/client/backend/types/environment_revision.py +3 -1
- agenta/client/backend/types/error.py +3 -1
- agenta/client/backend/types/evaluation.py +3 -1
- agenta/client/backend/types/evaluation_scenario.py +3 -1
- agenta/client/backend/types/evaluation_scenario_input.py +3 -1
- agenta/client/backend/types/evaluation_scenario_output.py +3 -1
- agenta/client/backend/types/evaluation_scenario_result.py +3 -1
- agenta/client/backend/types/evaluator.py +6 -4
- agenta/client/backend/types/evaluator_config.py +6 -4
- agenta/client/backend/types/evaluator_flags.py +3 -1
- agenta/client/backend/types/evaluator_mapping_output_interface.py +3 -1
- agenta/client/backend/types/evaluator_output_interface.py +3 -1
- agenta/client/backend/types/evaluator_query.py +3 -1
- agenta/client/backend/types/evaluator_query_request.py +3 -1
- agenta/client/backend/types/evaluator_request.py +3 -1
- agenta/client/backend/types/evaluator_response.py +3 -1
- agenta/client/backend/types/evaluators_response.py +3 -1
- agenta/client/backend/types/exception_dto.py +3 -1
- agenta/client/backend/types/extended_o_tel_tracing_response.py +3 -1
- agenta/client/backend/types/get_config_response.py +3 -1
- agenta/client/backend/types/header.py +3 -1
- agenta/client/backend/types/http_validation_error.py +3 -1
- agenta/client/backend/types/human_evaluation.py +3 -1
- agenta/client/backend/types/human_evaluation_scenario.py +3 -1
- agenta/client/backend/types/human_evaluation_scenario_input.py +3 -1
- agenta/client/backend/types/human_evaluation_scenario_output.py +3 -1
- agenta/client/backend/types/invite_request.py +3 -1
- agenta/client/backend/types/legacy_analytics_response.py +3 -1
- agenta/client/backend/types/legacy_data_point.py +3 -1
- agenta/client/backend/types/legacy_evaluator.py +3 -1
- agenta/client/backend/types/legacy_scope_request.py +3 -1
- agenta/client/backend/types/legacy_scopes_response.py +3 -1
- agenta/client/backend/types/legacy_subscription_request.py +3 -1
- agenta/client/backend/types/legacy_user_request.py +3 -1
- agenta/client/backend/types/legacy_user_response.py +3 -1
- agenta/client/backend/types/lifecycle_dto.py +3 -1
- agenta/client/backend/types/link_dto.py +3 -1
- agenta/client/backend/types/list_api_keys_response.py +3 -1
- agenta/client/backend/types/llm_run_rate_limit.py +3 -1
- agenta/client/backend/types/meta_request.py +3 -1
- agenta/client/backend/types/metrics_dto.py +3 -1
- agenta/client/backend/types/new_testset.py +3 -1
- agenta/client/backend/types/node_dto.py +3 -1
- agenta/client/backend/types/o_tel_context_dto.py +3 -1
- agenta/client/backend/types/o_tel_event.py +6 -4
- agenta/client/backend/types/o_tel_event_dto.py +3 -1
- agenta/client/backend/types/o_tel_extra_dto.py +3 -1
- agenta/client/backend/types/o_tel_flat_span.py +6 -4
- agenta/client/backend/types/o_tel_link.py +6 -4
- agenta/client/backend/types/o_tel_link_dto.py +3 -1
- agenta/client/backend/types/o_tel_links_response.py +3 -1
- agenta/client/backend/types/o_tel_span.py +1 -1
- agenta/client/backend/types/o_tel_span_dto.py +3 -1
- agenta/client/backend/types/o_tel_spans_tree.py +3 -1
- agenta/client/backend/types/o_tel_tracing_data_response.py +3 -1
- agenta/client/backend/types/o_tel_tracing_request.py +3 -1
- agenta/client/backend/types/o_tel_tracing_response.py +3 -1
- agenta/client/backend/types/organization.py +3 -1
- agenta/client/backend/types/organization_details.py +3 -1
- agenta/client/backend/types/organization_membership_request.py +3 -1
- agenta/client/backend/types/organization_output.py +3 -1
- agenta/client/backend/types/organization_request.py +3 -1
- agenta/client/backend/types/parent_dto.py +3 -1
- agenta/client/backend/types/project_membership_request.py +3 -1
- agenta/client/backend/types/project_request.py +3 -1
- agenta/client/backend/types/project_scope.py +3 -1
- agenta/client/backend/types/projects_response.py +3 -1
- agenta/client/backend/types/reference.py +6 -4
- agenta/client/backend/types/reference_dto.py +3 -1
- agenta/client/backend/types/reference_request_model.py +3 -1
- agenta/client/backend/types/result.py +3 -1
- agenta/client/backend/types/root_dto.py +3 -1
- agenta/client/backend/types/scopes_response_model.py +3 -1
- agenta/client/backend/types/secret_dto.py +3 -1
- agenta/client/backend/types/secret_response_dto.py +3 -1
- agenta/client/backend/types/simple_evaluation_output.py +3 -1
- agenta/client/backend/types/span_dto.py +6 -4
- agenta/client/backend/types/standard_provider_dto.py +3 -1
- agenta/client/backend/types/standard_provider_settings_dto.py +3 -1
- agenta/client/backend/types/status_dto.py +3 -1
- agenta/client/backend/types/tags_request.py +3 -1
- agenta/client/backend/types/testcase_response.py +6 -4
- agenta/client/backend/types/testset.py +6 -4
- agenta/client/backend/types/{test_set_output_response.py → testset_output_response.py} +4 -2
- agenta/client/backend/types/testset_request.py +3 -1
- agenta/client/backend/types/testset_response.py +3 -1
- agenta/client/backend/types/{test_set_simple_response.py → testset_simple_response.py} +4 -2
- agenta/client/backend/types/testsets_response.py +3 -1
- agenta/client/backend/types/time_dto.py +3 -1
- agenta/client/backend/types/tree_dto.py +3 -1
- agenta/client/backend/types/update_app_output.py +3 -1
- agenta/client/backend/types/user_request.py +3 -1
- agenta/client/backend/types/validation_error.py +3 -1
- agenta/client/backend/types/workflow_artifact.py +6 -4
- agenta/client/backend/types/workflow_data.py +3 -1
- agenta/client/backend/types/workflow_flags.py +3 -1
- agenta/client/backend/types/workflow_request.py +3 -1
- agenta/client/backend/types/workflow_response.py +3 -1
- agenta/client/backend/types/workflow_revision.py +6 -4
- agenta/client/backend/types/workflow_revision_request.py +3 -1
- agenta/client/backend/types/workflow_revision_response.py +3 -1
- agenta/client/backend/types/workflow_revisions_response.py +3 -1
- agenta/client/backend/types/workflow_variant.py +6 -4
- agenta/client/backend/types/workflow_variant_request.py +3 -1
- agenta/client/backend/types/workflow_variant_response.py +3 -1
- agenta/client/backend/types/workflow_variants_response.py +3 -1
- agenta/client/backend/types/workflows_response.py +3 -1
- agenta/client/backend/types/workspace.py +3 -1
- agenta/client/backend/types/workspace_member_response.py +3 -1
- agenta/client/backend/types/workspace_membership_request.py +3 -1
- agenta/client/backend/types/workspace_permission.py +3 -1
- agenta/client/backend/types/workspace_request.py +3 -1
- agenta/client/backend/types/workspace_response.py +3 -1
- agenta/client/backend/vault/raw_client.py +4 -4
- agenta/client/backend/workspace/client.py +2 -2
- agenta/client/client.py +102 -88
- agenta/sdk/__init__.py +52 -3
- agenta/sdk/agenta_init.py +43 -16
- agenta/sdk/assets.py +23 -15
- agenta/sdk/context/serving.py +20 -8
- agenta/sdk/context/tracing.py +40 -22
- agenta/sdk/contexts/__init__.py +0 -0
- agenta/sdk/contexts/routing.py +38 -0
- agenta/sdk/contexts/running.py +57 -0
- agenta/sdk/contexts/tracing.py +86 -0
- agenta/sdk/decorators/__init__.py +1 -0
- agenta/sdk/decorators/routing.py +284 -0
- agenta/sdk/decorators/running.py +692 -98
- agenta/sdk/decorators/serving.py +20 -21
- agenta/sdk/decorators/tracing.py +176 -131
- agenta/sdk/engines/__init__.py +0 -0
- agenta/sdk/engines/running/__init__.py +0 -0
- agenta/sdk/engines/running/utils.py +17 -0
- agenta/sdk/engines/tracing/__init__.py +1 -0
- agenta/sdk/engines/tracing/attributes.py +185 -0
- agenta/sdk/engines/tracing/conventions.py +49 -0
- agenta/sdk/engines/tracing/exporters.py +130 -0
- agenta/sdk/engines/tracing/inline.py +1154 -0
- agenta/sdk/engines/tracing/processors.py +190 -0
- agenta/sdk/engines/tracing/propagation.py +102 -0
- agenta/sdk/engines/tracing/spans.py +136 -0
- agenta/sdk/engines/tracing/tracing.py +324 -0
- agenta/sdk/evaluations/__init__.py +2 -0
- agenta/sdk/evaluations/metrics.py +37 -0
- agenta/sdk/evaluations/preview/__init__.py +0 -0
- agenta/sdk/evaluations/preview/evaluate.py +765 -0
- agenta/sdk/evaluations/preview/utils.py +861 -0
- agenta/sdk/evaluations/results.py +66 -0
- agenta/sdk/evaluations/runs.py +153 -0
- agenta/sdk/evaluations/scenarios.py +48 -0
- agenta/sdk/litellm/litellm.py +12 -0
- agenta/sdk/litellm/mockllm.py +6 -8
- agenta/sdk/litellm/mocks/__init__.py +5 -5
- agenta/sdk/managers/applications.py +304 -0
- agenta/sdk/managers/config.py +2 -2
- agenta/sdk/managers/evaluations.py +0 -0
- agenta/sdk/managers/evaluators.py +303 -0
- agenta/sdk/managers/secrets.py +161 -24
- agenta/sdk/managers/shared.py +3 -1
- agenta/sdk/managers/testsets.py +441 -0
- agenta/sdk/managers/vault.py +3 -3
- agenta/sdk/middleware/auth.py +0 -176
- agenta/sdk/middleware/config.py +27 -9
- agenta/sdk/middleware/vault.py +204 -9
- agenta/sdk/middlewares/__init__.py +0 -0
- agenta/sdk/middlewares/routing/__init__.py +0 -0
- agenta/sdk/middlewares/routing/auth.py +263 -0
- agenta/sdk/middlewares/routing/cors.py +30 -0
- agenta/sdk/middlewares/routing/otel.py +29 -0
- agenta/sdk/middlewares/running/__init__.py +0 -0
- agenta/sdk/middlewares/running/normalizer.py +321 -0
- agenta/sdk/middlewares/running/resolver.py +161 -0
- agenta/sdk/middlewares/running/vault.py +140 -0
- agenta/sdk/models/__init__.py +0 -0
- agenta/sdk/models/blobs.py +33 -0
- agenta/sdk/models/evaluations.py +119 -0
- agenta/sdk/models/git.py +126 -0
- agenta/sdk/models/shared.py +167 -0
- agenta/sdk/models/testsets.py +163 -0
- agenta/sdk/models/tracing.py +202 -0
- agenta/sdk/models/workflows.py +753 -0
- agenta/sdk/tracing/attributes.py +4 -4
- agenta/sdk/tracing/exporters.py +67 -17
- agenta/sdk/tracing/inline.py +37 -45
- agenta/sdk/tracing/processors.py +97 -0
- agenta/sdk/tracing/propagation.py +3 -1
- agenta/sdk/tracing/spans.py +4 -0
- agenta/sdk/tracing/tracing.py +13 -15
- agenta/sdk/types.py +222 -22
- agenta/sdk/utils/cache.py +1 -1
- agenta/sdk/utils/client.py +38 -0
- agenta/sdk/utils/helpers.py +13 -12
- agenta/sdk/utils/logging.py +18 -78
- agenta/sdk/utils/references.py +23 -0
- agenta/sdk/workflows/builtin.py +600 -0
- agenta/sdk/workflows/configurations.py +22 -0
- agenta/sdk/workflows/errors.py +292 -0
- agenta/sdk/workflows/handlers.py +1791 -0
- agenta/sdk/workflows/interfaces.py +948 -0
- agenta/sdk/workflows/sandbox.py +118 -0
- agenta/sdk/workflows/utils.py +303 -6
- {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/METADATA +37 -33
- agenta-0.63.2.dist-info/RECORD +421 -0
- {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/WHEEL +1 -1
- agenta/sdk/middleware/adapt.py +0 -253
- agenta/sdk/middleware/base.py +0 -40
- agenta/sdk/middleware/flags.py +0 -40
- agenta/sdk/workflows/types.py +0 -472
- agenta-0.52.6.dist-info/RECORD +0 -371
- agenta/sdk/{workflows → engines/running}/registry.py +0 -0
@@ -0,0 +1,861 @@
"""
Utilities for formatting and displaying evaluation results.
Contains helper functions for Rich text formatting and table generation.
"""

import json
from typing import Dict, List, Any, Optional
import asyncio
from uuid import UUID
from dataclasses import dataclass, field

import unicodedata
import re


@dataclass
class EvaluationTestcaseData:
    """
    Data model for a single evaluation testcase.

    Attributes:
        case_id: Unique identifier for the testcase
        inputs: Input data for the testcase
        application_outputs: Outputs from the application under test
        evaluator_outputs: Outputs from evaluators (scores and assertions)
    """

    case_id: str = ""
    inputs: Dict[str, Any] = field(default_factory=dict)
    application_outputs: Dict[str, Any] = field(default_factory=dict)
    evaluator_outputs: Dict[str, Any] = field(default_factory=dict)

    def get_scores(self) -> Dict[str, float]:
        """Extract numeric scores from evaluator outputs."""
        scores = {}
        for key, value in self.evaluator_outputs.items():
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                scores[key] = value
        return scores

    def get_assertions(self) -> Dict[str, Any]:
        """Extract boolean assertions from evaluator outputs."""
        assertions = {}
        for key, value in self.evaluator_outputs.items():
            if isinstance(value, bool):
                assertions[key] = value
            elif isinstance(value, list) and all(isinstance(v, bool) for v in value):
                assertions[key] = value
        return assertions


@dataclass
class EvaluationReport:
    """
    Data model for the complete evaluation report.

    Attributes:
        run_id: Unique identifier for the evaluation run
        cases: List of evaluation case data
        summary: Summary statistics for the evaluation
    """

    run_id: str = ""
    cases: List[EvaluationTestcaseData] = field(default_factory=list)
    summary: Dict[str, Any] = field(default_factory=dict)

    def get_total_cases(self) -> int:
        """Get total number of testcases."""
        return len(self.cases)

    def get_all_evaluator_keys(self) -> set[str]:
        """Get all unique evaluator keys across all cases."""
        all_keys = set()
        for case in self.cases:
            all_keys.update(case.evaluator_outputs.keys())
        return all_keys

    def calculate_averages(self) -> Dict[str, float]:
        """Calculate average scores across all cases."""
        averages = {}
        all_scores = {}

        # Collect all scores
        for case in self.cases:
            case_scores = case.get_scores()
            for key, value in case_scores.items():
                if key not in all_scores:
                    all_scores[key] = []
                all_scores[key].append(value)

        # Calculate averages
        for key, values in all_scores.items():
            if values:
                averages[key] = sum(values) / len(values)

        return averages

    def calculate_assertion_percentage(self) -> float:
        """Calculate overall assertion success percentage."""
        all_assertions = []

        for case in self.cases:
            case_assertions = case.get_assertions()
            for value in case_assertions.values():
                if isinstance(value, bool):
                    all_assertions.append(value)
                elif isinstance(value, list):
                    all_assertions.extend(value)

        if not all_assertions:
            return 0.0

        return (sum(all_assertions) / len(all_assertions)) * 100


# Rich imports for progress tracking
try:
    from rich.progress import track

    RICH_AVAILABLE = True
except ImportError:
    RICH_AVAILABLE = False

    # Use simple iteration when Rich is not available
    def track(iterable, description="Processing..."):
        return iterable


# Try to import Rich for enhanced formatting, fall back to plain text if not available
try:
    from rich.console import Console
    from rich.table import Table
    from rich.text import Text
    from rich import box

    _HAS_RICH = True
except ImportError:
    _HAS_RICH = False

    # Fallback implementations for when Rich is not available
    class Text:
        def __init__(self, text="", style=None):
            self.text = str(text)

        def __str__(self):
            return self.text

        @staticmethod
        def from_markup(text):
            # Remove Rich markup for plain text fallback
            import re

            clean_text = re.sub(r'\[/?\w+(?:\s+\w+="[^"]*")*\]', "", text)
            return Text(clean_text)

    class Table:
        def __init__(self, *args, **kwargs):
            self.rows = []
            self.headers = []

        def add_column(self, header, **kwargs):
            self.headers.append(header)

        def add_row(self, *args):
            self.rows.append([str(arg) for arg in args])

        def add_section(self):
            # Add separator in fallback mode
            pass

    class Console:
        def __init__(self, width=None, **kwargs):
            self.width = width


def smart_format_content(content: Any, max_length: int = 200) -> str:
    """
    Smart content formatting with size awareness and Rich markup support.

    Args:
        content: Content to format (dict, list, str, etc.)
        max_length: Maximum character length before truncation

    Returns:
        Formatted string with optional Rich markup
    """
    if content is None:
        return ""

    if isinstance(content, str):
        if len(content) <= max_length:
            return content
        else:
            return f"{content[: max_length - 3]}..."

    if isinstance(content, (dict, list)):
        try:
            json_str = json.dumps(content, indent=None, separators=(",", ":"))
            if len(json_str) <= max_length:
                return json_str
            else:
                # For large objects, show structure with key-value pairs
                if isinstance(content, dict):
                    items = list(content.items())[:3]
                    item_preview = ", ".join(f'"{k}": "{v}"' for k, v in items)
                    more_indicator = (
                        f" (+{len(content) - len(items)} more)"
                        if len(content) > len(items)
                        else ""
                    )
                    full_preview = f"{{{item_preview}{more_indicator}}}"
                    # Truncate the entire string to fit the column width
                    if len(full_preview) <= max_length:
                        return full_preview
                    else:
                        return f"{full_preview[: max_length - 3]}..."
                else:  # list
                    count = len(content)
                    item_preview = (
                        str(content[0])[:50] + "..."
                        if content and len(str(content[0])) > 50
                        else str(content[0])
                        if content
                        else ""
                    )
                    return (
                        f"[{item_preview}] ({count} items)"
                        if count > 1
                        else f"[{item_preview}]"
                    )
        except (TypeError, ValueError):
            # Fallback for non-serializable objects
            str_repr = str(content)
            return (
                str_repr[: max_length - 3] + "..."
                if len(str_repr) > max_length
                else str_repr
            )

    # For other types
    str_repr = str(content)
    return (
        str_repr[: max_length - 3] + "..." if len(str_repr) > max_length else str_repr
    )


def format_number(value: float, max_precision: int = 3) -> str:
    """
    Format numbers with intelligent precision and comma separators.

    Args:
        value: The numeric value to format
        max_precision: Maximum decimal places to show

    Returns:
        Formatted number string
    """
    if abs(value) >= 1000:
        # Use comma separators for large numbers
        return f"{value:,.{max_precision}f}".rstrip("0").rstrip(".")
    elif abs(value) < 0.001 and value != 0:
        # Use scientific notation for very small numbers
        return f"{value:.{max_precision}e}"
    else:
        # Standard formatting with up to max_precision decimal places
        formatted = f"{value:.{max_precision}f}".rstrip("0").rstrip(".")
        return formatted if formatted else "0"


def format_evaluation_report_rich(
    report_data: List[Dict[str, Any]], console_width: Optional[int] = None
) -> str:
    """Format evaluation results using Rich tables with enhanced styling."""
    if not _HAS_RICH:
        return _format_with_unicode_table(report_data, console_width)

    if not report_data:
        return "No evaluation data available"

    # Create Rich table with responsive design
    table = Table(
        title="Evaluation Results",
        box=box.ROUNDED,
        show_header=True,
        header_style="bold magenta",
        width=console_width,
    )

    # Add columns with responsive widths
    table.add_column("Testcases", style="cyan", width=10)
    table.add_column("Inputs", style="green", width=40, overflow="fold")
    table.add_column("Outputs", style="blue", width=40, overflow="fold")
    table.add_column("Scores", style="yellow", width=40)
    table.add_column("Assertions", style="red", width=10)

    # Collect totals for summary
    total_scores = {}
    total_assertions = []

    for case_data in report_data:
        case_id = case_data.get("case_id", "unknown")
        inputs = case_data.get("inputs", {})
        outputs = case_data.get("application_outputs", {})

        # Format inputs and outputs with Rich Text for better display
        inputs_text = Text.from_markup(smart_format_content(inputs, 400))
        outputs_text = Text.from_markup(smart_format_content(outputs, 500))

        # Format scores (numeric values). One score per line for readability.
        scores_parts = []
        for key, value in case_data.get("evaluator_outputs", {}).items():

            def _maybe_add(k: str, v: Any):
                if isinstance(v, bool):
                    return
                num: Optional[float] = None
                if isinstance(v, (int, float)):
                    num = float(v)
                elif isinstance(v, str):
                    try:
                        num = float(v)
                    except Exception:
                        num = None
                if num is not None:
                    formatted_value = format_number(num)
                    scores_parts.append(f"{k}: {formatted_value}")
                    if k not in total_scores:
                        total_scores[k] = []
                    total_scores[k].append(num)

            if isinstance(value, list):
                for idx, v in enumerate(value):
                    _maybe_add(key, v)
            else:
                _maybe_add(key, value)
        scores_text = Text("\n".join(scores_parts))

        # Format assertions (boolean values) - show each evaluator's result
        assertions_parts = []
        for key, value in case_data.get("evaluator_outputs", {}).items():
            if isinstance(value, bool):
                symbol = "[green]✔[/green]" if value else "[red]✗[/red]"
                assertions_parts.append(symbol)
                total_assertions.append(value)
            elif isinstance(value, list) and all(isinstance(v, bool) for v in value):
                # Handle multiple evaluators with same key name
                for v in value:
                    symbol = "[green]✔[/green]" if v else "[red]✗[/red]"
                    assertions_parts.append(symbol)
                    total_assertions.append(v)
        # Join with spaces to show multiple assertions clearly
        assertions_text = Text.from_markup(
            " ".join(assertions_parts) if assertions_parts else ""
        )

        table.add_row(case_id, inputs_text, outputs_text, scores_text, assertions_text)
        # Add a separator after each data row for readability
        table.add_section()

    # Add a separator line before averages
    table.add_section()

    # Add averages row
    avg_scores_parts = []
    for key, values in total_scores.items():
        avg = sum(values) / len(values) if values else 0
        avg_scores_parts.append(f"{key}: {format_number(avg)}")

    assertion_pct = (
        (sum(total_assertions) / len(total_assertions) * 100) if total_assertions else 0
    )
    assertion_summary = f"{assertion_pct:.1f}%"

    table.add_row(
        "[bold italic]Averages[/bold italic]",
        "",
        "",
        Text("\n".join(avg_scores_parts)),
        Text(assertion_summary),
    )

    # Render the table
    console = Console(width=console_width)
    from io import StringIO

    string_buffer = StringIO()
    console.file = string_buffer
    console.print(table)
    return string_buffer.getvalue()


def _format_with_unicode_table(
    report_data: List[Dict[str, Any]], console_width: Optional[int]
) -> str:
    """Fallback Unicode table formatting (enhanced version)"""
    if not report_data:
        return "No evaluation data available"

    # Enhanced table formatting helpers
    def make_border(widths, left="┏", mid="┳", right="┓", fill="━"):
        return left + mid.join(fill * w for w in widths) + right

    def make_separator(widths, left="├", mid="┼", right="┤", fill="─"):
        return left + mid.join(fill * w for w in widths) + right

    def make_row(values, widths, left="┃", mid="┃", right="┃"):
        formatted = []
        for val, width in zip(values, widths):
            # Handle multi-line content better
            val_str = str(val)
            if "\n" in val_str:
                # Take first line for table display
                val_str = val_str.split("\n")[0]
            formatted.append(f" {val_str:<{width - 2}} ")
        return left + mid.join(formatted) + right

    # Responsive column widths
    if console_width and console_width < 120:
        col_widths = [12, 20, 30, 20, 10]  # Compact
    else:
        col_widths = [15, 30, 40, 25, 12]  # Full width

    # Build enhanced table
    lines = []

    # Header with styling
    lines.append(make_border(col_widths))
    lines.append(
        make_row(
            ["Testcase ID", "Inputs", "Outputs", "Scores", "Assertions"], col_widths
        )
    )
    lines.append(make_border(col_widths, "┡", "╇", "┩", "━"))

    # Data rows with improved formatting
    total_scores = {}
    total_assertions = []

    for case_data in report_data:
        case_id = case_data.get("case_id", "unknown")

        # Smart content formatting
        inputs = case_data.get("inputs", {})
        outputs = case_data.get("application_outputs", {})

        inputs_str = smart_format_content(inputs, col_widths[1] - 4)
        outputs_str = smart_format_content(outputs, col_widths[2] - 4)

        # Format scores with proper number formatting, one per line
        scores_parts = []
        for key, value in case_data.get("evaluator_outputs", {}).items():
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                formatted_value = format_number(value)
                scores_parts.append(f"{key}: {formatted_value}")
                if key not in total_scores:
                    total_scores[key] = []
                total_scores[key].append(value)
        # Preserve line breaks for better readability in plain table
        scores_str = "\n".join(scores_parts)

        # Format assertions with colored symbols (fallback) - show each evaluator's result
        assertions_parts = []
        for key, value in case_data.get("evaluator_outputs", {}).items():
            if isinstance(value, bool):
                assertions_parts.append("✔" if value else "✗")
                total_assertions.append(value)
            elif isinstance(value, list) and all(isinstance(v, bool) for v in value):
                # Handle multiple evaluators with same key name
                for v in value:
                    assertions_parts.append("✔" if v else "✗")
                    total_assertions.append(v)
        # Join with spaces to show multiple assertions clearly
        assertions_str = " ".join(assertions_parts) if assertions_parts else ""

        lines.append(
            make_row(
                [case_id, inputs_str, outputs_str, scores_str, assertions_str],
                col_widths,
            )
        )
        lines.append(make_separator(col_widths))

    # Enhanced summary row
    avg_scores_parts = []
    for key, values in total_scores.items():
        avg = sum(values) / len(values) if values else 0
        avg_scores_parts.append(f"{key}: {format_number(avg)}")
    avg_scores_str = smart_format_content(
        ", ".join(avg_scores_parts), col_widths[3] - 4
    )

    assertion_pct = (
        (sum(total_assertions) / len(total_assertions) * 100) if total_assertions else 0
    )
    assertion_summary = f"{assertion_pct:.1f}%"

    # Add separator line before averages for clarity
    lines.append(make_border(col_widths, "┠", "╂", "┨", "━"))
    lines.append(
        make_row(["Averages", "", "", avg_scores_str, assertion_summary], col_widths)
    )
    lines.append(make_border(col_widths, "└", "┴", "┘", "─"))

    return "\n".join(lines)


# Main function that chooses the best available formatting
def format_evaluation_report(
    report_data: List[Dict[str, Any]], console_width: Optional[int] = None
) -> str:
    """Format evaluation results with best available method"""
    return format_evaluation_report_rich(report_data, console_width)


async def display_evaluation_results(
    eval_data, show_detailed_logs=True, console_width=None
):
    """Enhanced display evaluation results with Rich-like formatting and progress tracking"""
    # Give traces a moment to be stored
    print()
    print("⏳ Waiting for traces to be available...")
    await asyncio.sleep(2)

    print()
    print("📊 Processing evaluation results...")
    print(f" run_id={eval_data['run'].id}")  # type:ignore

    # Collect data for the report table with progress tracking
    report_data = []
    scenarios_to_process = eval_data["scenarios"]

    # Use Rich progress bar if available, otherwise simple iteration
    if RICH_AVAILABLE:
        scenario_iterator = track(
            scenarios_to_process, description="📋 Processing scenarios"
        )
    else:
        scenario_iterator = scenarios_to_process
        print(f"📋 Processing {len(scenarios_to_process)} scenarios...")

    for i, scenario in enumerate(scenario_iterator):
        if not RICH_AVAILABLE and show_detailed_logs:
            print(
                f" 📄 scenario {i + 1}/{len(scenarios_to_process)}: {scenario['scenario'].id}"
            )  # type:ignore
        elif show_detailed_logs:
            print(f" scenario_id={scenario['scenario'].id}")  # type:ignore

        case_data = EvaluationTestcaseData().__dict__

        for step_key, result in scenario["results"].items():  # type:ignore
            if result.testcase_id:
                if show_detailed_logs:
                    print(
                        f" step_key={str(step_key).ljust(32)}, testcase_id={result.testcase_id}"
                    )
                # Use a more readable case ID
                testcase_short = str(result.testcase_id)[:8]
                case_data["case_id"] = f"{testcase_short}..."

            elif result.trace_id:
                if show_detailed_logs:
                    print(
                        f" step_key={str(step_key).ljust(32)}, trace_id={result.trace_id}"
                    )

                # Fetch and process trace data using services module
                try:
                    trace_data = await fetch_trace_data(result.trace_id)
                    if trace_data and "spans" in trace_data:
                        for span_key in trace_data["spans"].keys():
                            step_data = extract_trace_step_data(trace_data, span_key)
                            if step_data:
                                inputs = step_data["inputs"]
                                outputs = step_data["outputs"]
                                trace_type = step_data["trace_type"]
                                trace_evaluator_name = step_data.get("evaluator_name")

                                # Store inputs for report
                                if inputs:
                                    case_data["inputs"] = clean_inputs_for_display(
                                        **(inputs if isinstance(inputs, dict) else {})
                                    )
                                    if show_detailed_logs:
                                        print(
                                            f" inputs={inputs}"
                                        )

                                # Determine if this is application or evaluator
                                if outputs:
                                    # Heuristic to classify outputs:
                                    # 1. If outputs is a single string value, it's likely the application output
                                    # 2. If outputs is a dict with keys like 'score', 'myscore', 'success', it's evaluator output
                                    # 3. If we already have application_outputs, everything else is evaluator output

                                    is_application_output = False
                                    if not case_data.get("application_outputs"):
                                        # Check if this looks like a simple application output (single string)
                                        if isinstance(outputs, str):
                                            is_application_output = True
                                        elif (
                                            isinstance(outputs, dict)
                                            and len(outputs) == 0
                                        ):
                                            # Empty dict, skip
                                            is_application_output = False
                                        elif isinstance(outputs, dict):
                                            # If it's a dict with typical evaluator keys, it's an evaluator
                                            evaluator_keys = {
                                                "score",
                                                "myscore",
                                                "success",
                                                "failure",
                                                "passed",
                                                "failed",
                                            }
                                            if any(
                                                key in evaluator_keys
                                                for key in outputs.keys()
                                            ):
                                                is_application_output = False
                                            else:
                                                # Otherwise, it might be application output
                                                is_application_output = True

                                    if is_application_output:
                                        case_data["application_outputs"] = outputs
                                    else:
                                        # This is an evaluator output
                                        # Use the evaluator name from trace data, or fall back to step_key hash
                                        evaluator_name = trace_evaluator_name or (
                                            step_key[:8] if step_key else None
                                        )
                                        process_evaluator_outputs(
                                            case_data,
                                            outputs,
                                            evaluator_name=evaluator_name,
                                        )

                                    if show_detailed_logs:
                                        print(
                                            f" outputs={outputs}"
                                        )
                    else:
                        if show_detailed_logs:
                            print(
                                f" ⚠️ no_trace_data"
                            )
                except Exception as e:
                    if show_detailed_logs:
                        print(
                            f" ❌ trace_fetch_error: {e}"
                        )
            else:
                if show_detailed_logs:
                    print(
                        f" step_key={str(step_key).ljust(32)}, ❌ error={result.error}"
                    )

        if case_data["case_id"]:
            report_data.append(case_data)

        # if show_detailed_logs:
        #     print(
        #         f"📈 metrics={json.dumps(eval_data['metrics'].data, indent=4)}"
        #     )  # type:ignore

    # Display the enhanced formatted report table
    print()
    print("📋 Evaluation Report:")
    print(format_evaluation_report(report_data, console_width))

    # Add summary statistics
    if report_data:
        print()
        print(f"✅ Successfully processed {len(report_data)} testcases")

        # Count total evaluators
        all_evaluator_keys = set()
        for case in report_data:
            all_evaluator_keys.update(case.get("evaluator_outputs", {}).keys())

        if all_evaluator_keys:
            print(
                f"🔍 Evaluated with {len(all_evaluator_keys)} metrics: {', '.join(sorted(all_evaluator_keys))}"
            )
    else:
        print("⚠️ No evaluation data found")


from typing import Callable, Dict, Optional, Any

from agenta.sdk.utils.client import authed_api
import asyncio
import json
from typing import Dict, Any, Optional


async def fetch_trace_data(
    trace_id: str, max_retries: int = 3, delay: float = 1.0
) -> Optional[Dict[str, Any]]:
    """
    Fetch trace data from the API with retry logic.

    Args:
        trace_id: The trace ID to fetch
        max_retries: Maximum number of retry attempts
        delay: Delay between retries in seconds

    Returns:
        Trace data dictionary or None if not found
    """
    for attempt in range(max_retries):
        try:
            response = authed_api()(
                method="GET", endpoint=f"/preview/tracing/traces/{trace_id}"
            )
            response.raise_for_status()
            trace_data = response.json()

            # print(trace_data)

            # Get the traces dictionary
            traces = trace_data.get("traces", {})
            if traces:
                # Get the first (and usually only) trace
                for trace_key, trace_content in traces.items():
                    if (
                        trace_content
                        and "spans" in trace_content
                        and trace_content["spans"]
                    ):
                        return trace_content

            # If no data yet, retry on next iteration
            if attempt < max_retries - 1:
                await asyncio.sleep(delay)

        except Exception as e:
            if attempt < max_retries - 1:
                await asyncio.sleep(delay)
                continue
            else:
                print(f"Error fetching trace data: {e}")
                return None

    print("Failed to fetch trace data after retries")
    return None


def extract_trace_step_data(
    trace_data: Dict[str, Any], step_key: str
) -> Optional[Dict[str, Any]]:
    """
    Extract step data from trace information.

    Args:
        trace_data: The complete trace data
        step_key: The step key to extract data for

    Returns:
        Step data dictionary or None if not found
    """
    if not trace_data:
        return None

    spans = trace_data.get("spans", {})
    if not spans or step_key not in spans:
        return None

    span_info = spans[step_key]
    # Extract the actual evaluation data using the correct data structure
    ag_data = span_info.get("attributes", {}).get("ag", {}).get("data", {})

    if not ag_data:
        return None

    # Try to extract evaluator/application name from span
    # The span_name field contains the workflow/evaluator name
    evaluator_name = span_info.get("span_name") or span_info.get("name")

    return {
        "inputs": ag_data.get("inputs", {}),
        "outputs": ag_data.get("outputs", {}),
        "trace_type": span_info.get("trace_type"),
        "evaluator_name": evaluator_name,
        "span_info": span_info,
    }


def process_evaluator_outputs(
    case_data: Dict[str, Any],
    outputs: Dict[str, Any],
    evaluator_name: Optional[str] = None,
) -> None:
    """
    Process evaluator outputs and handle multiple evaluators with same key names.

    Args:
        case_data: The case data to update
        outputs: The evaluator outputs to process
        evaluator_name: Optional evaluator identifier for labeling
    """
    # Handle multiple evaluators with same key names (like 'success', 'score')
    for key, value in outputs.items():
        # Label numeric scores by evaluator to distinguish between multiple evaluators
        display_key = key

        # If we have an evaluator name and this is a numeric value, prefix it
        if (
            evaluator_name
            and isinstance(value, (int, float))
            and not isinstance(value, bool)
        ):
            display_key = f"{evaluator_name}.{key}"

        # Store the value - if the key already exists, convert to list to preserve all values
        if display_key in case_data["evaluator_outputs"]:
            # Create lists for duplicate keys to preserve all values
            existing = case_data["evaluator_outputs"][display_key]
            if not isinstance(existing, list):
                case_data["evaluator_outputs"][display_key] = [existing]
            case_data["evaluator_outputs"][display_key].append(value)
        else:
            case_data["evaluator_outputs"][display_key] = value


def clean_inputs_for_display(**kwargs) -> Dict[str, Any]:
    """
    Clean inputs by removing internal IDs and trace data for cleaner display.

    Args:
        inputs: Raw inputs dictionary

    Returns:
        Cleaned inputs dictionary with only user-facing testcase fields
    """
    inputs = kwargs.get("inputs")
    if inputs:
        # List of keys to exclude from display
        # - Internal IDs (ending with _id)
        # - Testcase internal fields (starting with testcase_)
        # - Trace data (the 'trace' key which contains the full trace structure)
        excluded_keys = {
            "revision",
            "parameters",
            "testcase",
            # "inputs",
            "trace",
            "outputs",
        }

        clean_inputs = {
            k: v
            for k, v in inputs.items()
            if not k.endswith("_id")
            and not k.startswith("testcase_")
            and k not in excluded_keys
        }
        return clean_inputs or inputs
    return inputs