agenta 0.52.6__py3-none-any.whl → 0.63.2__py3-none-any.whl

This diff shows the content changes between two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (271)
  1. agenta/__init__.py +12 -3
  2. agenta/client/__init__.py +4 -4
  3. agenta/client/backend/__init__.py +4 -4
  4. agenta/client/backend/api_keys/client.py +2 -2
  5. agenta/client/backend/billing/client.py +2 -2
  6. agenta/client/backend/billing/raw_client.py +2 -2
  7. agenta/client/backend/client.py +56 -48
  8. agenta/client/backend/core/client_wrapper.py +2 -2
  9. agenta/client/backend/core/file.py +3 -1
  10. agenta/client/backend/core/http_client.py +3 -3
  11. agenta/client/backend/core/pydantic_utilities.py +13 -3
  12. agenta/client/backend/human_evaluations/client.py +2 -2
  13. agenta/client/backend/human_evaluations/raw_client.py +2 -2
  14. agenta/client/backend/organization/client.py +46 -34
  15. agenta/client/backend/organization/raw_client.py +32 -26
  16. agenta/client/backend/raw_client.py +26 -26
  17. agenta/client/backend/testsets/client.py +18 -18
  18. agenta/client/backend/testsets/raw_client.py +30 -30
  19. agenta/client/backend/types/__init__.py +4 -4
  20. agenta/client/backend/types/account_request.py +3 -1
  21. agenta/client/backend/types/account_response.py +3 -1
  22. agenta/client/backend/types/agenta_node_dto.py +3 -1
  23. agenta/client/backend/types/agenta_nodes_response.py +3 -1
  24. agenta/client/backend/types/agenta_root_dto.py +3 -1
  25. agenta/client/backend/types/agenta_roots_response.py +3 -1
  26. agenta/client/backend/types/agenta_tree_dto.py +3 -1
  27. agenta/client/backend/types/agenta_trees_response.py +3 -1
  28. agenta/client/backend/types/aggregated_result.py +3 -1
  29. agenta/client/backend/types/analytics_response.py +3 -1
  30. agenta/client/backend/types/annotation.py +6 -4
  31. agenta/client/backend/types/annotation_create.py +3 -1
  32. agenta/client/backend/types/annotation_edit.py +3 -1
  33. agenta/client/backend/types/annotation_link.py +3 -1
  34. agenta/client/backend/types/annotation_link_response.py +3 -1
  35. agenta/client/backend/types/annotation_query.py +3 -1
  36. agenta/client/backend/types/annotation_query_request.py +3 -1
  37. agenta/client/backend/types/annotation_reference.py +3 -1
  38. agenta/client/backend/types/annotation_references.py +3 -1
  39. agenta/client/backend/types/annotation_response.py +3 -1
  40. agenta/client/backend/types/annotations_response.py +3 -1
  41. agenta/client/backend/types/app.py +3 -1
  42. agenta/client/backend/types/app_variant_response.py +3 -1
  43. agenta/client/backend/types/app_variant_revision.py +3 -1
  44. agenta/client/backend/types/artifact.py +6 -4
  45. agenta/client/backend/types/base_output.py +3 -1
  46. agenta/client/backend/types/body_fetch_workflow_revision.py +3 -1
  47. agenta/client/backend/types/body_import_testset.py +3 -1
  48. agenta/client/backend/types/bucket_dto.py +3 -1
  49. agenta/client/backend/types/collect_status_response.py +3 -1
  50. agenta/client/backend/types/config_db.py +3 -1
  51. agenta/client/backend/types/config_dto.py +3 -1
  52. agenta/client/backend/types/config_response_model.py +3 -1
  53. agenta/client/backend/types/correct_answer.py +3 -1
  54. agenta/client/backend/types/create_app_output.py +3 -1
  55. agenta/client/backend/types/custom_model_settings_dto.py +3 -1
  56. agenta/client/backend/types/custom_provider_dto.py +3 -1
  57. agenta/client/backend/types/custom_provider_kind.py +1 -1
  58. agenta/client/backend/types/custom_provider_settings_dto.py +3 -1
  59. agenta/client/backend/types/delete_evaluation.py +3 -1
  60. agenta/client/backend/types/environment_output.py +3 -1
  61. agenta/client/backend/types/environment_output_extended.py +3 -1
  62. agenta/client/backend/types/environment_revision.py +3 -1
  63. agenta/client/backend/types/error.py +3 -1
  64. agenta/client/backend/types/evaluation.py +3 -1
  65. agenta/client/backend/types/evaluation_scenario.py +3 -1
  66. agenta/client/backend/types/evaluation_scenario_input.py +3 -1
  67. agenta/client/backend/types/evaluation_scenario_output.py +3 -1
  68. agenta/client/backend/types/evaluation_scenario_result.py +3 -1
  69. agenta/client/backend/types/evaluator.py +6 -4
  70. agenta/client/backend/types/evaluator_config.py +6 -4
  71. agenta/client/backend/types/evaluator_flags.py +3 -1
  72. agenta/client/backend/types/evaluator_mapping_output_interface.py +3 -1
  73. agenta/client/backend/types/evaluator_output_interface.py +3 -1
  74. agenta/client/backend/types/evaluator_query.py +3 -1
  75. agenta/client/backend/types/evaluator_query_request.py +3 -1
  76. agenta/client/backend/types/evaluator_request.py +3 -1
  77. agenta/client/backend/types/evaluator_response.py +3 -1
  78. agenta/client/backend/types/evaluators_response.py +3 -1
  79. agenta/client/backend/types/exception_dto.py +3 -1
  80. agenta/client/backend/types/extended_o_tel_tracing_response.py +3 -1
  81. agenta/client/backend/types/get_config_response.py +3 -1
  82. agenta/client/backend/types/header.py +3 -1
  83. agenta/client/backend/types/http_validation_error.py +3 -1
  84. agenta/client/backend/types/human_evaluation.py +3 -1
  85. agenta/client/backend/types/human_evaluation_scenario.py +3 -1
  86. agenta/client/backend/types/human_evaluation_scenario_input.py +3 -1
  87. agenta/client/backend/types/human_evaluation_scenario_output.py +3 -1
  88. agenta/client/backend/types/invite_request.py +3 -1
  89. agenta/client/backend/types/legacy_analytics_response.py +3 -1
  90. agenta/client/backend/types/legacy_data_point.py +3 -1
  91. agenta/client/backend/types/legacy_evaluator.py +3 -1
  92. agenta/client/backend/types/legacy_scope_request.py +3 -1
  93. agenta/client/backend/types/legacy_scopes_response.py +3 -1
  94. agenta/client/backend/types/legacy_subscription_request.py +3 -1
  95. agenta/client/backend/types/legacy_user_request.py +3 -1
  96. agenta/client/backend/types/legacy_user_response.py +3 -1
  97. agenta/client/backend/types/lifecycle_dto.py +3 -1
  98. agenta/client/backend/types/link_dto.py +3 -1
  99. agenta/client/backend/types/list_api_keys_response.py +3 -1
  100. agenta/client/backend/types/llm_run_rate_limit.py +3 -1
  101. agenta/client/backend/types/meta_request.py +3 -1
  102. agenta/client/backend/types/metrics_dto.py +3 -1
  103. agenta/client/backend/types/new_testset.py +3 -1
  104. agenta/client/backend/types/node_dto.py +3 -1
  105. agenta/client/backend/types/o_tel_context_dto.py +3 -1
  106. agenta/client/backend/types/o_tel_event.py +6 -4
  107. agenta/client/backend/types/o_tel_event_dto.py +3 -1
  108. agenta/client/backend/types/o_tel_extra_dto.py +3 -1
  109. agenta/client/backend/types/o_tel_flat_span.py +6 -4
  110. agenta/client/backend/types/o_tel_link.py +6 -4
  111. agenta/client/backend/types/o_tel_link_dto.py +3 -1
  112. agenta/client/backend/types/o_tel_links_response.py +3 -1
  113. agenta/client/backend/types/o_tel_span.py +1 -1
  114. agenta/client/backend/types/o_tel_span_dto.py +3 -1
  115. agenta/client/backend/types/o_tel_spans_tree.py +3 -1
  116. agenta/client/backend/types/o_tel_tracing_data_response.py +3 -1
  117. agenta/client/backend/types/o_tel_tracing_request.py +3 -1
  118. agenta/client/backend/types/o_tel_tracing_response.py +3 -1
  119. agenta/client/backend/types/organization.py +3 -1
  120. agenta/client/backend/types/organization_details.py +3 -1
  121. agenta/client/backend/types/organization_membership_request.py +3 -1
  122. agenta/client/backend/types/organization_output.py +3 -1
  123. agenta/client/backend/types/organization_request.py +3 -1
  124. agenta/client/backend/types/parent_dto.py +3 -1
  125. agenta/client/backend/types/project_membership_request.py +3 -1
  126. agenta/client/backend/types/project_request.py +3 -1
  127. agenta/client/backend/types/project_scope.py +3 -1
  128. agenta/client/backend/types/projects_response.py +3 -1
  129. agenta/client/backend/types/reference.py +6 -4
  130. agenta/client/backend/types/reference_dto.py +3 -1
  131. agenta/client/backend/types/reference_request_model.py +3 -1
  132. agenta/client/backend/types/result.py +3 -1
  133. agenta/client/backend/types/root_dto.py +3 -1
  134. agenta/client/backend/types/scopes_response_model.py +3 -1
  135. agenta/client/backend/types/secret_dto.py +3 -1
  136. agenta/client/backend/types/secret_response_dto.py +3 -1
  137. agenta/client/backend/types/simple_evaluation_output.py +3 -1
  138. agenta/client/backend/types/span_dto.py +6 -4
  139. agenta/client/backend/types/standard_provider_dto.py +3 -1
  140. agenta/client/backend/types/standard_provider_settings_dto.py +3 -1
  141. agenta/client/backend/types/status_dto.py +3 -1
  142. agenta/client/backend/types/tags_request.py +3 -1
  143. agenta/client/backend/types/testcase_response.py +6 -4
  144. agenta/client/backend/types/testset.py +6 -4
  145. agenta/client/backend/types/{test_set_output_response.py → testset_output_response.py} +4 -2
  146. agenta/client/backend/types/testset_request.py +3 -1
  147. agenta/client/backend/types/testset_response.py +3 -1
  148. agenta/client/backend/types/{test_set_simple_response.py → testset_simple_response.py} +4 -2
  149. agenta/client/backend/types/testsets_response.py +3 -1
  150. agenta/client/backend/types/time_dto.py +3 -1
  151. agenta/client/backend/types/tree_dto.py +3 -1
  152. agenta/client/backend/types/update_app_output.py +3 -1
  153. agenta/client/backend/types/user_request.py +3 -1
  154. agenta/client/backend/types/validation_error.py +3 -1
  155. agenta/client/backend/types/workflow_artifact.py +6 -4
  156. agenta/client/backend/types/workflow_data.py +3 -1
  157. agenta/client/backend/types/workflow_flags.py +3 -1
  158. agenta/client/backend/types/workflow_request.py +3 -1
  159. agenta/client/backend/types/workflow_response.py +3 -1
  160. agenta/client/backend/types/workflow_revision.py +6 -4
  161. agenta/client/backend/types/workflow_revision_request.py +3 -1
  162. agenta/client/backend/types/workflow_revision_response.py +3 -1
  163. agenta/client/backend/types/workflow_revisions_response.py +3 -1
  164. agenta/client/backend/types/workflow_variant.py +6 -4
  165. agenta/client/backend/types/workflow_variant_request.py +3 -1
  166. agenta/client/backend/types/workflow_variant_response.py +3 -1
  167. agenta/client/backend/types/workflow_variants_response.py +3 -1
  168. agenta/client/backend/types/workflows_response.py +3 -1
  169. agenta/client/backend/types/workspace.py +3 -1
  170. agenta/client/backend/types/workspace_member_response.py +3 -1
  171. agenta/client/backend/types/workspace_membership_request.py +3 -1
  172. agenta/client/backend/types/workspace_permission.py +3 -1
  173. agenta/client/backend/types/workspace_request.py +3 -1
  174. agenta/client/backend/types/workspace_response.py +3 -1
  175. agenta/client/backend/vault/raw_client.py +4 -4
  176. agenta/client/backend/workspace/client.py +2 -2
  177. agenta/client/client.py +102 -88
  178. agenta/sdk/__init__.py +52 -3
  179. agenta/sdk/agenta_init.py +43 -16
  180. agenta/sdk/assets.py +23 -15
  181. agenta/sdk/context/serving.py +20 -8
  182. agenta/sdk/context/tracing.py +40 -22
  183. agenta/sdk/contexts/__init__.py +0 -0
  184. agenta/sdk/contexts/routing.py +38 -0
  185. agenta/sdk/contexts/running.py +57 -0
  186. agenta/sdk/contexts/tracing.py +86 -0
  187. agenta/sdk/decorators/__init__.py +1 -0
  188. agenta/sdk/decorators/routing.py +284 -0
  189. agenta/sdk/decorators/running.py +692 -98
  190. agenta/sdk/decorators/serving.py +20 -21
  191. agenta/sdk/decorators/tracing.py +176 -131
  192. agenta/sdk/engines/__init__.py +0 -0
  193. agenta/sdk/engines/running/__init__.py +0 -0
  194. agenta/sdk/engines/running/utils.py +17 -0
  195. agenta/sdk/engines/tracing/__init__.py +1 -0
  196. agenta/sdk/engines/tracing/attributes.py +185 -0
  197. agenta/sdk/engines/tracing/conventions.py +49 -0
  198. agenta/sdk/engines/tracing/exporters.py +130 -0
  199. agenta/sdk/engines/tracing/inline.py +1154 -0
  200. agenta/sdk/engines/tracing/processors.py +190 -0
  201. agenta/sdk/engines/tracing/propagation.py +102 -0
  202. agenta/sdk/engines/tracing/spans.py +136 -0
  203. agenta/sdk/engines/tracing/tracing.py +324 -0
  204. agenta/sdk/evaluations/__init__.py +2 -0
  205. agenta/sdk/evaluations/metrics.py +37 -0
  206. agenta/sdk/evaluations/preview/__init__.py +0 -0
  207. agenta/sdk/evaluations/preview/evaluate.py +765 -0
  208. agenta/sdk/evaluations/preview/utils.py +861 -0
  209. agenta/sdk/evaluations/results.py +66 -0
  210. agenta/sdk/evaluations/runs.py +153 -0
  211. agenta/sdk/evaluations/scenarios.py +48 -0
  212. agenta/sdk/litellm/litellm.py +12 -0
  213. agenta/sdk/litellm/mockllm.py +6 -8
  214. agenta/sdk/litellm/mocks/__init__.py +5 -5
  215. agenta/sdk/managers/applications.py +304 -0
  216. agenta/sdk/managers/config.py +2 -2
  217. agenta/sdk/managers/evaluations.py +0 -0
  218. agenta/sdk/managers/evaluators.py +303 -0
  219. agenta/sdk/managers/secrets.py +161 -24
  220. agenta/sdk/managers/shared.py +3 -1
  221. agenta/sdk/managers/testsets.py +441 -0
  222. agenta/sdk/managers/vault.py +3 -3
  223. agenta/sdk/middleware/auth.py +0 -176
  224. agenta/sdk/middleware/config.py +27 -9
  225. agenta/sdk/middleware/vault.py +204 -9
  226. agenta/sdk/middlewares/__init__.py +0 -0
  227. agenta/sdk/middlewares/routing/__init__.py +0 -0
  228. agenta/sdk/middlewares/routing/auth.py +263 -0
  229. agenta/sdk/middlewares/routing/cors.py +30 -0
  230. agenta/sdk/middlewares/routing/otel.py +29 -0
  231. agenta/sdk/middlewares/running/__init__.py +0 -0
  232. agenta/sdk/middlewares/running/normalizer.py +321 -0
  233. agenta/sdk/middlewares/running/resolver.py +161 -0
  234. agenta/sdk/middlewares/running/vault.py +140 -0
  235. agenta/sdk/models/__init__.py +0 -0
  236. agenta/sdk/models/blobs.py +33 -0
  237. agenta/sdk/models/evaluations.py +119 -0
  238. agenta/sdk/models/git.py +126 -0
  239. agenta/sdk/models/shared.py +167 -0
  240. agenta/sdk/models/testsets.py +163 -0
  241. agenta/sdk/models/tracing.py +202 -0
  242. agenta/sdk/models/workflows.py +753 -0
  243. agenta/sdk/tracing/attributes.py +4 -4
  244. agenta/sdk/tracing/exporters.py +67 -17
  245. agenta/sdk/tracing/inline.py +37 -45
  246. agenta/sdk/tracing/processors.py +97 -0
  247. agenta/sdk/tracing/propagation.py +3 -1
  248. agenta/sdk/tracing/spans.py +4 -0
  249. agenta/sdk/tracing/tracing.py +13 -15
  250. agenta/sdk/types.py +222 -22
  251. agenta/sdk/utils/cache.py +1 -1
  252. agenta/sdk/utils/client.py +38 -0
  253. agenta/sdk/utils/helpers.py +13 -12
  254. agenta/sdk/utils/logging.py +18 -78
  255. agenta/sdk/utils/references.py +23 -0
  256. agenta/sdk/workflows/builtin.py +600 -0
  257. agenta/sdk/workflows/configurations.py +22 -0
  258. agenta/sdk/workflows/errors.py +292 -0
  259. agenta/sdk/workflows/handlers.py +1791 -0
  260. agenta/sdk/workflows/interfaces.py +948 -0
  261. agenta/sdk/workflows/sandbox.py +118 -0
  262. agenta/sdk/workflows/utils.py +303 -6
  263. {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/METADATA +37 -33
  264. agenta-0.63.2.dist-info/RECORD +421 -0
  265. {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/WHEEL +1 -1
  266. agenta/sdk/middleware/adapt.py +0 -253
  267. agenta/sdk/middleware/base.py +0 -40
  268. agenta/sdk/middleware/flags.py +0 -40
  269. agenta/sdk/workflows/types.py +0 -472
  270. agenta-0.52.6.dist-info/RECORD +0 -371
  271. /agenta/sdk/{workflows → engines/running}/registry.py +0 -0
agenta/sdk/evaluations/preview/utils.py (new file, item 208 above)
@@ -0,0 +1,861 @@
1
+ """
2
+ Utilities for formatting and displaying evaluation results.
3
+ Contains helper functions for Rich text formatting and table generation.
4
+ """
5
+
6
+ import json
7
+ from typing import Dict, List, Any, Optional
8
+ import asyncio
9
+ from uuid import UUID
10
+ from dataclasses import dataclass, field
11
+
12
+ import unicodedata
13
+ import re
14
+
15
+
16
+ @dataclass
17
+ class EvaluationTestcaseData:
18
+ """
19
+ Data model for a single evaluation testcase.
20
+
21
+ Attributes:
22
+ case_id: Unique identifier for the testcase
23
+ inputs: Input data for the testcase
24
+ application_outputs: Outputs from the application under test
25
+ evaluator_outputs: Outputs from evaluators (scores and assertions)
26
+ """
27
+
28
+ case_id: str = ""
29
+ inputs: Dict[str, Any] = field(default_factory=dict)
30
+ application_outputs: Dict[str, Any] = field(default_factory=dict)
31
+ evaluator_outputs: Dict[str, Any] = field(default_factory=dict)
32
+
33
+ def get_scores(self) -> Dict[str, float]:
34
+ """Extract numeric scores from evaluator outputs."""
35
+ scores = {}
36
+ for key, value in self.evaluator_outputs.items():
37
+ if isinstance(value, (int, float)) and not isinstance(value, bool):
38
+ scores[key] = value
39
+ return scores
40
+
41
+ def get_assertions(self) -> Dict[str, Any]:
42
+ """Extract boolean assertions from evaluator outputs."""
43
+ assertions = {}
44
+ for key, value in self.evaluator_outputs.items():
45
+ if isinstance(value, bool):
46
+ assertions[key] = value
47
+ elif isinstance(value, list) and all(isinstance(v, bool) for v in value):
48
+ assertions[key] = value
49
+ return assertions
50
+
51
+
52
+ @dataclass
53
+ class EvaluationReport:
54
+ """
55
+ Data model for the complete evaluation report.
56
+
57
+ Attributes:
58
+ run_id: Unique identifier for the evaluation run
59
+ cases: List of evaluation case data
60
+ summary: Summary statistics for the evaluation
61
+ """
62
+
63
+ run_id: str = ""
64
+ cases: List[EvaluationTestcaseData] = field(default_factory=list)
65
+ summary: Dict[str, Any] = field(default_factory=dict)
66
+
67
+ def get_total_cases(self) -> int:
68
+ """Get total number of testcases."""
69
+ return len(self.cases)
70
+
71
+ def get_all_evaluator_keys(self) -> set[str]:
72
+ """Get all unique evaluator keys across all cases."""
73
+ all_keys = set()
74
+ for case in self.cases:
75
+ all_keys.update(case.evaluator_outputs.keys())
76
+ return all_keys
77
+
78
+ def calculate_averages(self) -> Dict[str, float]:
79
+ """Calculate average scores across all cases."""
80
+ averages = {}
81
+ all_scores = {}
82
+
83
+ # Collect all scores
84
+ for case in self.cases:
85
+ case_scores = case.get_scores()
86
+ for key, value in case_scores.items():
87
+ if key not in all_scores:
88
+ all_scores[key] = []
89
+ all_scores[key].append(value)
90
+
91
+ # Calculate averages
92
+ for key, values in all_scores.items():
93
+ if values:
94
+ averages[key] = sum(values) / len(values)
95
+
96
+ return averages
97
+
98
+ def calculate_assertion_percentage(self) -> float:
99
+ """Calculate overall assertion success percentage."""
100
+ all_assertions = []
101
+
102
+ for case in self.cases:
103
+ case_assertions = case.get_assertions()
104
+ for value in case_assertions.values():
105
+ if isinstance(value, bool):
106
+ all_assertions.append(value)
107
+ elif isinstance(value, list):
108
+ all_assertions.extend(value)
109
+
110
+ if not all_assertions:
111
+ return 0.0
112
+
113
+ return (sum(all_assertions) / len(all_assertions)) * 100
114
+
115
+
116
+ # Rich imports for progress tracking
117
+ try:
118
+ from rich.progress import track
119
+
120
+ RICH_AVAILABLE = True
121
+ except ImportError:
122
+ RICH_AVAILABLE = False
123
+
124
+ # Use simple iteration when Rich is not available
125
+ def track(iterable, description="Processing..."):
126
+ return iterable
127
+
128
+
129
+ # Try to import Rich for enhanced formatting, fall back to plain text if not available
130
+ try:
131
+ from rich.console import Console
132
+ from rich.table import Table
133
+ from rich.text import Text
134
+ from rich import box
135
+
136
+ _HAS_RICH = True
137
+ except ImportError:
138
+ _HAS_RICH = False
139
+
140
+ # Fallback implementations for when Rich is not available
141
+ class Text:
142
+ def __init__(self, text="", style=None):
143
+ self.text = str(text)
144
+
145
+ def __str__(self):
146
+ return self.text
147
+
148
+ @staticmethod
149
+ def from_markup(text):
150
+ # Remove Rich markup for plain text fallback
151
+ import re
152
+
153
+ clean_text = re.sub(r'\[/?\w+(?:\s+\w+="[^"]*")*\]', "", text)
154
+ return Text(clean_text)
155
+
156
+ class Table:
157
+ def __init__(self, *args, **kwargs):
158
+ self.rows = []
159
+ self.headers = []
160
+
161
+ def add_column(self, header, **kwargs):
162
+ self.headers.append(header)
163
+
164
+ def add_row(self, *args):
165
+ self.rows.append([str(arg) for arg in args])
166
+
167
+ def add_section(self):
168
+ # Add separator in fallback mode
169
+ pass
170
+
171
+ class Console:
172
+ def __init__(self, width=None, **kwargs):
173
+ self.width = width
174
+
175
+
176
+ def smart_format_content(content: Any, max_length: int = 200) -> str:
177
+ """
178
+ Smart content formatting with size awareness and Rich markup support.
179
+
180
+ Args:
181
+ content: Content to format (dict, list, str, etc.)
182
+ max_length: Maximum character length before truncation
183
+
184
+ Returns:
185
+ Formatted string with optional Rich markup
186
+ """
187
+ if content is None:
188
+ return ""
189
+
190
+ if isinstance(content, str):
191
+ if len(content) <= max_length:
192
+ return content
193
+ else:
194
+ return f"{content[: max_length - 3]}..."
195
+
196
+ if isinstance(content, (dict, list)):
197
+ try:
198
+ json_str = json.dumps(content, indent=None, separators=(",", ":"))
199
+ if len(json_str) <= max_length:
200
+ return json_str
201
+ else:
202
+ # For large objects, show structure with key-value pairs
203
+ if isinstance(content, dict):
204
+ items = list(content.items())[:3]
205
+ item_preview = ", ".join(f'"{k}": "{v}"' for k, v in items)
206
+ more_indicator = (
207
+ f" (+{len(content) - len(items)} more)"
208
+ if len(content) > len(items)
209
+ else ""
210
+ )
211
+ full_preview = f"{{{item_preview}{more_indicator}}}"
212
+ # Truncate the entire string to fit the column width
213
+ if len(full_preview) <= max_length:
214
+ return full_preview
215
+ else:
216
+ return f"{full_preview[: max_length - 3]}..."
217
+ else: # list
218
+ count = len(content)
219
+ item_preview = (
220
+ str(content[0])[:50] + "..."
221
+ if content and len(str(content[0])) > 50
222
+ else str(content[0])
223
+ if content
224
+ else ""
225
+ )
226
+ return (
227
+ f"[{item_preview}] ({count} items)"
228
+ if count > 1
229
+ else f"[{item_preview}]"
230
+ )
231
+ except (TypeError, ValueError):
232
+ # Fallback for non-serializable objects
233
+ str_repr = str(content)
234
+ return (
235
+ str_repr[: max_length - 3] + "..."
236
+ if len(str_repr) > max_length
237
+ else str_repr
238
+ )
239
+
240
+ # For other types
241
+ str_repr = str(content)
242
+ return (
243
+ str_repr[: max_length - 3] + "..." if len(str_repr) > max_length else str_repr
244
+ )
245
+
246
+
247
+ def format_number(value: float, max_precision: int = 3) -> str:
248
+ """
249
+ Format numbers with intelligent precision and comma separators.
250
+
251
+ Args:
252
+ value: The numeric value to format
253
+ max_precision: Maximum decimal places to show
254
+
255
+ Returns:
256
+ Formatted number string
257
+ """
258
+ if abs(value) >= 1000:
259
+ # Use comma separators for large numbers
260
+ return f"{value:,.{max_precision}f}".rstrip("0").rstrip(".")
261
+ elif abs(value) < 0.001 and value != 0:
262
+ # Use scientific notation for very small numbers
263
+ return f"{value:.{max_precision}e}"
264
+ else:
265
+ # Standard formatting with up to max_precision decimal places
266
+ formatted = f"{value:.{max_precision}f}".rstrip("0").rstrip(".")
267
+ return formatted if formatted else "0"
268
+
269
+
270
+ def format_evaluation_report_rich(
271
+ report_data: List[Dict[str, Any]], console_width: Optional[int] = None
272
+ ) -> str:
273
+ """Format evaluation results using Rich tables with enhanced styling."""
274
+ if not _HAS_RICH:
275
+ return _format_with_unicode_table(report_data, console_width)
276
+
277
+ if not report_data:
278
+ return "No evaluation data available"
279
+
280
+ # Create Rich table with responsive design
281
+ table = Table(
282
+ title="Evaluation Results",
283
+ box=box.ROUNDED,
284
+ show_header=True,
285
+ header_style="bold magenta",
286
+ width=console_width,
287
+ )
288
+
289
+ # Add columns with responsive widths
290
+ table.add_column("Testcases", style="cyan", width=10)
291
+ table.add_column("Inputs", style="green", width=40, overflow="fold")
292
+ table.add_column("Outputs", style="blue", width=40, overflow="fold")
293
+ table.add_column("Scores", style="yellow", width=40)
294
+ table.add_column("Assertions", style="red", width=10)
295
+
296
+ # Collect totals for summary
297
+ total_scores = {}
298
+ total_assertions = []
299
+
300
+ for case_data in report_data:
301
+ case_id = case_data.get("case_id", "unknown")
302
+ inputs = case_data.get("inputs", {})
303
+ outputs = case_data.get("application_outputs", {})
304
+
305
+ # Format inputs and outputs with Rich Text for better display
306
+ inputs_text = Text.from_markup(smart_format_content(inputs, 400))
307
+ outputs_text = Text.from_markup(smart_format_content(outputs, 500))
308
+
309
+ # Format scores (numeric values). One score per line for readability.
310
+ scores_parts = []
311
+ for key, value in case_data.get("evaluator_outputs", {}).items():
312
+
313
+ def _maybe_add(k: str, v: Any):
314
+ if isinstance(v, bool):
315
+ return
316
+ num: Optional[float] = None
317
+ if isinstance(v, (int, float)):
318
+ num = float(v)
319
+ elif isinstance(v, str):
320
+ try:
321
+ num = float(v)
322
+ except Exception:
323
+ num = None
324
+ if num is not None:
325
+ formatted_value = format_number(num)
326
+ scores_parts.append(f"{k}: {formatted_value}")
327
+ if k not in total_scores:
328
+ total_scores[k] = []
329
+ total_scores[k].append(num)
330
+
331
+ if isinstance(value, list):
332
+ for idx, v in enumerate(value):
333
+ _maybe_add(key, v)
334
+ else:
335
+ _maybe_add(key, value)
336
+ scores_text = Text("\n".join(scores_parts))
337
+
338
+ # Format assertions (boolean values) - show each evaluator's result
339
+ assertions_parts = []
340
+ for key, value in case_data.get("evaluator_outputs", {}).items():
341
+ if isinstance(value, bool):
342
+ symbol = "[green]✔[/green]" if value else "[red]✗[/red]"
343
+ assertions_parts.append(symbol)
344
+ total_assertions.append(value)
345
+ elif isinstance(value, list) and all(isinstance(v, bool) for v in value):
346
+ # Handle multiple evaluators with same key name
347
+ for v in value:
348
+ symbol = "[green]✔[/green]" if v else "[red]✗[/red]"
349
+ assertions_parts.append(symbol)
350
+ total_assertions.append(v)
351
+ # Join with spaces to show multiple assertions clearly
352
+ assertions_text = Text.from_markup(
353
+ " ".join(assertions_parts) if assertions_parts else ""
354
+ )
355
+
356
+ table.add_row(case_id, inputs_text, outputs_text, scores_text, assertions_text)
357
+ # Add a separator after each data row for readability
358
+ table.add_section()
359
+
360
+ # Add a separator line before averages
361
+ table.add_section()
362
+
363
+ # Add averages row
364
+ avg_scores_parts = []
365
+ for key, values in total_scores.items():
366
+ avg = sum(values) / len(values) if values else 0
367
+ avg_scores_parts.append(f"{key}: {format_number(avg)}")
368
+
369
+ assertion_pct = (
370
+ (sum(total_assertions) / len(total_assertions) * 100) if total_assertions else 0
371
+ )
372
+ assertion_summary = f"{assertion_pct:.1f}%"
373
+
374
+ table.add_row(
375
+ "[bold italic]Averages[/bold italic]",
376
+ "",
377
+ "",
378
+ Text("\n".join(avg_scores_parts)),
379
+ Text(assertion_summary),
380
+ )
381
+
382
+ # Render the table
383
+ console = Console(width=console_width)
384
+ from io import StringIO
385
+
386
+ string_buffer = StringIO()
387
+ console.file = string_buffer
388
+ console.print(table)
389
+ return string_buffer.getvalue()
390
+
391
+
392
+ def _format_with_unicode_table(
393
+ report_data: List[Dict[str, Any]], console_width: Optional[int]
394
+ ) -> str:
395
+ """Fallback Unicode table formatting (enhanced version)"""
396
+ if not report_data:
397
+ return "No evaluation data available"
398
+
399
+ # Enhanced table formatting helpers
400
+ def make_border(widths, left="┏", mid="┳", right="┓", fill="━"):
401
+ return left + mid.join(fill * w for w in widths) + right
402
+
403
+ def make_separator(widths, left="├", mid="┼", right="┤", fill="─"):
404
+ return left + mid.join(fill * w for w in widths) + right
405
+
406
+ def make_row(values, widths, left="┃", mid="┃", right="┃"):
407
+ formatted = []
408
+ for val, width in zip(values, widths):
409
+ # Handle multi-line content better
410
+ val_str = str(val)
411
+ if "\n" in val_str:
412
+ # Take first line for table display
413
+ val_str = val_str.split("\n")[0]
414
+ formatted.append(f" {val_str:<{width - 2}} ")
415
+ return left + mid.join(formatted) + right
416
+
417
+ # Responsive column widths
418
+ if console_width and console_width < 120:
419
+ col_widths = [12, 20, 30, 20, 10] # Compact
420
+ else:
421
+ col_widths = [15, 30, 40, 25, 12] # Full width
422
+
423
+ # Build enhanced table
424
+ lines = []
425
+
426
+ # Header with styling
427
+ lines.append(make_border(col_widths))
428
+ lines.append(
429
+ make_row(
430
+ ["Testcase ID", "Inputs", "Outputs", "Scores", "Assertions"], col_widths
431
+ )
432
+ )
433
+ lines.append(make_border(col_widths, "┡", "╇", "┩", "━"))
434
+
435
+ # Data rows with improved formatting
436
+ total_scores = {}
437
+ total_assertions = []
438
+
439
+ for case_data in report_data:
440
+ case_id = case_data.get("case_id", "unknown")
441
+
442
+ # Smart content formatting
443
+ inputs = case_data.get("inputs", {})
444
+ outputs = case_data.get("application_outputs", {})
445
+
446
+ inputs_str = smart_format_content(inputs, col_widths[1] - 4)
447
+ outputs_str = smart_format_content(outputs, col_widths[2] - 4)
448
+
449
+ # Format scores with proper number formatting, one per line
450
+ scores_parts = []
451
+ for key, value in case_data.get("evaluator_outputs", {}).items():
452
+ if isinstance(value, (int, float)) and not isinstance(value, bool):
453
+ formatted_value = format_number(value)
454
+ scores_parts.append(f"{key}: {formatted_value}")
455
+ if key not in total_scores:
456
+ total_scores[key] = []
457
+ total_scores[key].append(value)
458
+ # Preserve line breaks for better readability in plain table
459
+ scores_str = "\n".join(scores_parts)
460
+
461
+ # Format assertions with colored symbols (fallback) - show each evaluator's result
462
+ assertions_parts = []
463
+ for key, value in case_data.get("evaluator_outputs", {}).items():
464
+ if isinstance(value, bool):
465
+ assertions_parts.append("✔" if value else "✗")
466
+ total_assertions.append(value)
467
+ elif isinstance(value, list) and all(isinstance(v, bool) for v in value):
468
+ # Handle multiple evaluators with same key name
469
+ for v in value:
470
+ assertions_parts.append("✔" if v else "✗")
471
+ total_assertions.append(v)
472
+ # Join with spaces to show multiple assertions clearly
473
+ assertions_str = " ".join(assertions_parts) if assertions_parts else ""
474
+
475
+ lines.append(
476
+ make_row(
477
+ [case_id, inputs_str, outputs_str, scores_str, assertions_str],
478
+ col_widths,
479
+ )
480
+ )
481
+ lines.append(make_separator(col_widths))
482
+
483
+ # Enhanced summary row
484
+ avg_scores_parts = []
485
+ for key, values in total_scores.items():
486
+ avg = sum(values) / len(values) if values else 0
487
+ avg_scores_parts.append(f"{key}: {format_number(avg)}")
488
+ avg_scores_str = smart_format_content(
489
+ ", ".join(avg_scores_parts), col_widths[3] - 4
490
+ )
491
+
492
+ assertion_pct = (
493
+ (sum(total_assertions) / len(total_assertions) * 100) if total_assertions else 0
494
+ )
495
+ assertion_summary = f"{assertion_pct:.1f}%"
496
+
497
+ # Add separator line before averages for clarity
498
+ lines.append(make_border(col_widths, "┠", "╂", "┨", "━"))
499
+ lines.append(
500
+ make_row(["Averages", "", "", avg_scores_str, assertion_summary], col_widths)
501
+ )
502
+ lines.append(make_border(col_widths, "└", "┴", "┘", "─"))
503
+
504
+ return "\n".join(lines)
505
+
506
+
507
+ # Main function that chooses the best available formatting
508
+ def format_evaluation_report(
509
+ report_data: List[Dict[str, Any]], console_width: Optional[int] = None
510
+ ) -> str:
511
+ """Format evaluation results with best available method"""
512
+ return format_evaluation_report_rich(report_data, console_width)
513
+
514
+
515
+ async def display_evaluation_results(
516
+ eval_data, show_detailed_logs=True, console_width=None
517
+ ):
518
+ """Enhanced display evaluation results with Rich-like formatting and progress tracking"""
519
+ # Give traces a moment to be stored
520
+ print()
521
+ print("⏳ Waiting for traces to be available...")
522
+ await asyncio.sleep(2)
523
+
524
+ print()
525
+ print("📊 Processing evaluation results...")
526
+ print(f" run_id={eval_data['run'].id}") # type:ignore
527
+
528
+ # Collect data for the report table with progress tracking
529
+ report_data = []
530
+ scenarios_to_process = eval_data["scenarios"]
531
+
532
+ # Use Rich progress bar if available, otherwise simple iteration
533
+ if RICH_AVAILABLE:
534
+ scenario_iterator = track(
535
+ scenarios_to_process, description="📋 Processing scenarios"
536
+ )
537
+ else:
538
+ scenario_iterator = scenarios_to_process
539
+ print(f"📋 Processing {len(scenarios_to_process)} scenarios...")
540
+
541
+ for i, scenario in enumerate(scenario_iterator):
542
+ if not RICH_AVAILABLE and show_detailed_logs:
543
+ print(
544
+ f" 📄 scenario {i + 1}/{len(scenarios_to_process)}: {scenario['scenario'].id}"
545
+ ) # type:ignore
546
+ elif show_detailed_logs:
547
+ print(f" scenario_id={scenario['scenario'].id}") # type:ignore
548
+
549
+ case_data = EvaluationTestcaseData().__dict__
550
+
551
+ for step_key, result in scenario["results"].items(): # type:ignore
552
+ if result.testcase_id:
553
+ if show_detailed_logs:
554
+ print(
555
+ f" step_key={str(step_key).ljust(32)}, testcase_id={result.testcase_id}"
556
+ )
557
+ # Use a more readable case ID
558
+ testcase_short = str(result.testcase_id)[:8]
559
+ case_data["case_id"] = f"{testcase_short}..."
560
+
561
+ elif result.trace_id:
562
+ if show_detailed_logs:
563
+ print(
564
+ f" step_key={str(step_key).ljust(32)}, trace_id={result.trace_id}"
565
+ )
566
+
567
+ # Fetch and process trace data using services module
568
+ try:
569
+ trace_data = await fetch_trace_data(result.trace_id)
570
+ if trace_data and "spans" in trace_data:
571
+ for span_key in trace_data["spans"].keys():
572
+ step_data = extract_trace_step_data(trace_data, span_key)
573
+ if step_data:
574
+ inputs = step_data["inputs"]
575
+ outputs = step_data["outputs"]
576
+ trace_type = step_data["trace_type"]
577
+ trace_evaluator_name = step_data.get("evaluator_name")
578
+
579
+ # Store inputs for report
580
+ if inputs:
581
+ case_data["inputs"] = clean_inputs_for_display(
582
+ **(inputs if isinstance(inputs, dict) else {})
583
+ )
584
+ if show_detailed_logs:
585
+ print(
586
+ f" inputs={inputs}"
587
+ )
588
+
589
+ # Determine if this is application or evaluator
590
+ if outputs:
591
+ # Heuristic to classify outputs:
592
+ # 1. If outputs is a single string value, it's likely the application output
593
+ # 2. If outputs is a dict with keys like 'score', 'myscore', 'success', it's evaluator output
594
+ # 3. If we already have application_outputs, everything else is evaluator output
595
+
596
+ is_application_output = False
597
+ if not case_data.get("application_outputs"):
598
+ # Check if this looks like a simple application output (single string)
599
+ if isinstance(outputs, str):
600
+ is_application_output = True
601
+ elif (
602
+ isinstance(outputs, dict)
603
+ and len(outputs) == 0
604
+ ):
605
+ # Empty dict, skip
606
+ is_application_output = False
607
+ elif isinstance(outputs, dict):
608
+ # If it's a dict with typical evaluator keys, it's an evaluator
609
+ evaluator_keys = {
610
+ "score",
611
+ "myscore",
612
+ "success",
613
+ "failure",
614
+ "passed",
615
+ "failed",
616
+ }
617
+ if any(
618
+ key in evaluator_keys
619
+ for key in outputs.keys()
620
+ ):
621
+ is_application_output = False
622
+ else:
623
+ # Otherwise, it might be application output
624
+ is_application_output = True
625
+
626
+ if is_application_output:
627
+ case_data["application_outputs"] = outputs
628
+ else:
629
+ # This is an evaluator output
630
+ # Use the evaluator name from trace data, or fall back to step_key hash
631
+ evaluator_name = trace_evaluator_name or (
632
+ step_key[:8] if step_key else None
633
+ )
634
+ process_evaluator_outputs(
635
+ case_data,
636
+ outputs,
637
+ evaluator_name=evaluator_name,
638
+ )
639
+
640
+ if show_detailed_logs:
641
+ print(
642
+ f" outputs={outputs}"
643
+ )
644
+ else:
645
+ if show_detailed_logs:
646
+ print(
647
+ f" ⚠️ no_trace_data"
648
+ )
649
+ except Exception as e:
650
+ if show_detailed_logs:
651
+ print(
652
+ f" ❌ trace_fetch_error: {e}"
653
+ )
654
+ else:
655
+ if show_detailed_logs:
656
+ print(
657
+ f" step_key={str(step_key).ljust(32)}, ❌ error={result.error}"
658
+ )
659
+
660
+ if case_data["case_id"]:
661
+ report_data.append(case_data)
662
+
663
+ # if show_detailed_logs:
664
+ # print(
665
+ # f"📈 metrics={json.dumps(eval_data['metrics'].data, indent=4)}"
666
+ # ) # type:ignore
667
+
668
+ # Display the enhanced formatted report table
669
+ print()
670
+ print("📋 Evaluation Report:")
671
+ print(format_evaluation_report(report_data, console_width))
672
+
673
+ # Add summary statistics
674
+ if report_data:
675
+ print()
676
+ print(f"✅ Successfully processed {len(report_data)} testcases")
677
+
678
+ # Count total evaluators
679
+ all_evaluator_keys = set()
680
+ for case in report_data:
681
+ all_evaluator_keys.update(case.get("evaluator_outputs", {}).keys())
682
+
683
+ if all_evaluator_keys:
684
+ print(
685
+ f"🔍 Evaluated with {len(all_evaluator_keys)} metrics: {', '.join(sorted(all_evaluator_keys))}"
686
+ )
687
+ else:
688
+ print("⚠️ No evaluation data found")
689
+
690
+
691
+ from typing import Callable, Dict, Optional, Any
692
+
693
+ from agenta.sdk.utils.client import authed_api
694
+ import asyncio
695
+ import json
696
+ from typing import Dict, Any, Optional
697
+
698
+
699
+ async def fetch_trace_data(
700
+ trace_id: str, max_retries: int = 3, delay: float = 1.0
701
+ ) -> Optional[Dict[str, Any]]:
702
+ """
703
+ Fetch trace data from the API with retry logic.
704
+
705
+ Args:
706
+ trace_id: The trace ID to fetch
707
+ max_retries: Maximum number of retry attempts
708
+ delay: Delay between retries in seconds
709
+
710
+ Returns:
711
+ Trace data dictionary or None if not found
712
+ """
713
+ for attempt in range(max_retries):
714
+ try:
715
+ response = authed_api()(
716
+ method="GET", endpoint=f"/preview/tracing/traces/{trace_id}"
717
+ )
718
+ response.raise_for_status()
719
+ trace_data = response.json()
720
+
721
+ # print(trace_data)
722
+
723
+ # Get the traces dictionary
724
+ traces = trace_data.get("traces", {})
725
+ if traces:
726
+ # Get the first (and usually only) trace
727
+ for trace_key, trace_content in traces.items():
728
+ if (
729
+ trace_content
730
+ and "spans" in trace_content
731
+ and trace_content["spans"]
732
+ ):
733
+ return trace_content
734
+
735
+ # If no data yet, retry on next iteration
736
+ if attempt < max_retries - 1:
737
+ await asyncio.sleep(delay)
738
+
739
+ except Exception as e:
740
+ if attempt < max_retries - 1:
741
+ await asyncio.sleep(delay)
742
+ continue
743
+ else:
744
+ print(f"Error fetching trace data: {e}")
745
+ return None
746
+
747
+ print("Failed to fetch trace data after retries")
748
+ return None
749
+
750
+
751
+ def extract_trace_step_data(
752
+ trace_data: Dict[str, Any], step_key: str
753
+ ) -> Optional[Dict[str, Any]]:
754
+ """
755
+ Extract step data from trace information.
756
+
757
+ Args:
758
+ trace_data: The complete trace data
759
+ step_key: The step key to extract data for
760
+
761
+ Returns:
762
+ Step data dictionary or None if not found
763
+ """
764
+ if not trace_data:
765
+ return None
766
+
767
+ spans = trace_data.get("spans", {})
768
+ if not spans or step_key not in spans:
769
+ return None
770
+
771
+ span_info = spans[step_key]
772
+ # Extract the actual evaluation data using the correct data structure
773
+ ag_data = span_info.get("attributes", {}).get("ag", {}).get("data", {})
774
+
775
+ if not ag_data:
776
+ return None
777
+
778
+ # Try to extract evaluator/application name from span
779
+ # The span_name field contains the workflow/evaluator name
780
+ evaluator_name = span_info.get("span_name") or span_info.get("name")
781
+
782
+ return {
783
+ "inputs": ag_data.get("inputs", {}),
784
+ "outputs": ag_data.get("outputs", {}),
785
+ "trace_type": span_info.get("trace_type"),
786
+ "evaluator_name": evaluator_name,
787
+ "span_info": span_info,
788
+ }
789
+
790
+
791
+ def process_evaluator_outputs(
792
+ case_data: Dict[str, Any],
793
+ outputs: Dict[str, Any],
794
+ evaluator_name: Optional[str] = None,
795
+ ) -> None:
796
+ """
797
+ Process evaluator outputs and handle multiple evaluators with same key names.
798
+
799
+ Args:
800
+ case_data: The case data to update
801
+ outputs: The evaluator outputs to process
802
+ evaluator_name: Optional evaluator identifier for labeling
803
+ """
804
+ # Handle multiple evaluators with same key names (like 'success', 'score')
805
+ for key, value in outputs.items():
806
+ # Label numeric scores by evaluator to distinguish between multiple evaluators
807
+ display_key = key
808
+
809
+ # If we have an evaluator name and this is a numeric value, prefix it
810
+ if (
811
+ evaluator_name
812
+ and isinstance(value, (int, float))
813
+ and not isinstance(value, bool)
814
+ ):
815
+ display_key = f"{evaluator_name}.{key}"
816
+
817
+ # Store the value - if the key already exists, convert to list to preserve all values
818
+ if display_key in case_data["evaluator_outputs"]:
819
+ # Create lists for duplicate keys to preserve all values
820
+ existing = case_data["evaluator_outputs"][display_key]
821
+ if not isinstance(existing, list):
822
+ case_data["evaluator_outputs"][display_key] = [existing]
823
+ case_data["evaluator_outputs"][display_key].append(value)
824
+ else:
825
+ case_data["evaluator_outputs"][display_key] = value
826
+
827
+
828
+ def clean_inputs_for_display(**kwargs) -> Dict[str, Any]:
829
+ """
830
+ Clean inputs by removing internal IDs and trace data for cleaner display.
831
+
832
+ Args:
833
+ inputs: Raw inputs dictionary
834
+
835
+ Returns:
836
+ Cleaned inputs dictionary with only user-facing testcase fields
837
+ """
838
+ inputs = kwargs.get("inputs")
839
+ if inputs:
840
+ # List of keys to exclude from display
841
+ # - Internal IDs (ending with _id)
842
+ # - Testcase internal fields (starting with testcase_)
843
+ # - Trace data (the 'trace' key which contains the full trace structure)
844
+ excluded_keys = {
845
+ "revision",
846
+ "parameters",
847
+ "testcase",
848
+ # "inputs",
849
+ "trace",
850
+ "outputs",
851
+ }
852
+
853
+ clean_inputs = {
854
+ k: v
855
+ for k, v in inputs.items()
856
+ if not k.endswith("_id")
857
+ and not k.startswith("testcase_")
858
+ and k not in excluded_keys
859
+ }
860
+ return clean_inputs or inputs
861
+ return inputs
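
For orientation, here is a minimal usage sketch of the reporting helpers introduced in agenta/sdk/evaluations/preview/utils.py above. The dictionary keys (case_id, inputs, application_outputs, evaluator_outputs) and the format_evaluation_report(report_data, console_width) signature are taken from the code in this diff; the sample values, and the assumption that the module is importable from an installed agenta 0.63.2, are illustrative only and not documented API.

# Illustrative sketch only — the sample data below is invented; the expected
# keys and the function signature follow the new utils.py shown above.
from agenta.sdk.evaluations.preview.utils import format_evaluation_report

report_data = [
    {
        "case_id": "a1b2c3d4...",
        "inputs": {"question": "What is Agenta?"},
        "application_outputs": {"answer": "An open-source LLMOps platform."},
        # Numeric values are rendered in the Scores column; booleans become
        # assertion symbols (✔/✗) and feed the percentage in the summary row.
        "evaluator_outputs": {"exact_match.score": 1.0, "contains_answer": True},
    },
]

# Renders a Rich table when the optional rich dependency is installed,
# otherwise falls back to the plain Unicode table built by the module.
print(format_evaluation_report(report_data, console_width=120))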