agenta 0.52.6__py3-none-any.whl → 0.63.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (271) hide show
  1. agenta/__init__.py +12 -3
  2. agenta/client/__init__.py +4 -4
  3. agenta/client/backend/__init__.py +4 -4
  4. agenta/client/backend/api_keys/client.py +2 -2
  5. agenta/client/backend/billing/client.py +2 -2
  6. agenta/client/backend/billing/raw_client.py +2 -2
  7. agenta/client/backend/client.py +56 -48
  8. agenta/client/backend/core/client_wrapper.py +2 -2
  9. agenta/client/backend/core/file.py +3 -1
  10. agenta/client/backend/core/http_client.py +3 -3
  11. agenta/client/backend/core/pydantic_utilities.py +13 -3
  12. agenta/client/backend/human_evaluations/client.py +2 -2
  13. agenta/client/backend/human_evaluations/raw_client.py +2 -2
  14. agenta/client/backend/organization/client.py +46 -34
  15. agenta/client/backend/organization/raw_client.py +32 -26
  16. agenta/client/backend/raw_client.py +26 -26
  17. agenta/client/backend/testsets/client.py +18 -18
  18. agenta/client/backend/testsets/raw_client.py +30 -30
  19. agenta/client/backend/types/__init__.py +4 -4
  20. agenta/client/backend/types/account_request.py +3 -1
  21. agenta/client/backend/types/account_response.py +3 -1
  22. agenta/client/backend/types/agenta_node_dto.py +3 -1
  23. agenta/client/backend/types/agenta_nodes_response.py +3 -1
  24. agenta/client/backend/types/agenta_root_dto.py +3 -1
  25. agenta/client/backend/types/agenta_roots_response.py +3 -1
  26. agenta/client/backend/types/agenta_tree_dto.py +3 -1
  27. agenta/client/backend/types/agenta_trees_response.py +3 -1
  28. agenta/client/backend/types/aggregated_result.py +3 -1
  29. agenta/client/backend/types/analytics_response.py +3 -1
  30. agenta/client/backend/types/annotation.py +6 -4
  31. agenta/client/backend/types/annotation_create.py +3 -1
  32. agenta/client/backend/types/annotation_edit.py +3 -1
  33. agenta/client/backend/types/annotation_link.py +3 -1
  34. agenta/client/backend/types/annotation_link_response.py +3 -1
  35. agenta/client/backend/types/annotation_query.py +3 -1
  36. agenta/client/backend/types/annotation_query_request.py +3 -1
  37. agenta/client/backend/types/annotation_reference.py +3 -1
  38. agenta/client/backend/types/annotation_references.py +3 -1
  39. agenta/client/backend/types/annotation_response.py +3 -1
  40. agenta/client/backend/types/annotations_response.py +3 -1
  41. agenta/client/backend/types/app.py +3 -1
  42. agenta/client/backend/types/app_variant_response.py +3 -1
  43. agenta/client/backend/types/app_variant_revision.py +3 -1
  44. agenta/client/backend/types/artifact.py +6 -4
  45. agenta/client/backend/types/base_output.py +3 -1
  46. agenta/client/backend/types/body_fetch_workflow_revision.py +3 -1
  47. agenta/client/backend/types/body_import_testset.py +3 -1
  48. agenta/client/backend/types/bucket_dto.py +3 -1
  49. agenta/client/backend/types/collect_status_response.py +3 -1
  50. agenta/client/backend/types/config_db.py +3 -1
  51. agenta/client/backend/types/config_dto.py +3 -1
  52. agenta/client/backend/types/config_response_model.py +3 -1
  53. agenta/client/backend/types/correct_answer.py +3 -1
  54. agenta/client/backend/types/create_app_output.py +3 -1
  55. agenta/client/backend/types/custom_model_settings_dto.py +3 -1
  56. agenta/client/backend/types/custom_provider_dto.py +3 -1
  57. agenta/client/backend/types/custom_provider_kind.py +1 -1
  58. agenta/client/backend/types/custom_provider_settings_dto.py +3 -1
  59. agenta/client/backend/types/delete_evaluation.py +3 -1
  60. agenta/client/backend/types/environment_output.py +3 -1
  61. agenta/client/backend/types/environment_output_extended.py +3 -1
  62. agenta/client/backend/types/environment_revision.py +3 -1
  63. agenta/client/backend/types/error.py +3 -1
  64. agenta/client/backend/types/evaluation.py +3 -1
  65. agenta/client/backend/types/evaluation_scenario.py +3 -1
  66. agenta/client/backend/types/evaluation_scenario_input.py +3 -1
  67. agenta/client/backend/types/evaluation_scenario_output.py +3 -1
  68. agenta/client/backend/types/evaluation_scenario_result.py +3 -1
  69. agenta/client/backend/types/evaluator.py +6 -4
  70. agenta/client/backend/types/evaluator_config.py +6 -4
  71. agenta/client/backend/types/evaluator_flags.py +3 -1
  72. agenta/client/backend/types/evaluator_mapping_output_interface.py +3 -1
  73. agenta/client/backend/types/evaluator_output_interface.py +3 -1
  74. agenta/client/backend/types/evaluator_query.py +3 -1
  75. agenta/client/backend/types/evaluator_query_request.py +3 -1
  76. agenta/client/backend/types/evaluator_request.py +3 -1
  77. agenta/client/backend/types/evaluator_response.py +3 -1
  78. agenta/client/backend/types/evaluators_response.py +3 -1
  79. agenta/client/backend/types/exception_dto.py +3 -1
  80. agenta/client/backend/types/extended_o_tel_tracing_response.py +3 -1
  81. agenta/client/backend/types/get_config_response.py +3 -1
  82. agenta/client/backend/types/header.py +3 -1
  83. agenta/client/backend/types/http_validation_error.py +3 -1
  84. agenta/client/backend/types/human_evaluation.py +3 -1
  85. agenta/client/backend/types/human_evaluation_scenario.py +3 -1
  86. agenta/client/backend/types/human_evaluation_scenario_input.py +3 -1
  87. agenta/client/backend/types/human_evaluation_scenario_output.py +3 -1
  88. agenta/client/backend/types/invite_request.py +3 -1
  89. agenta/client/backend/types/legacy_analytics_response.py +3 -1
  90. agenta/client/backend/types/legacy_data_point.py +3 -1
  91. agenta/client/backend/types/legacy_evaluator.py +3 -1
  92. agenta/client/backend/types/legacy_scope_request.py +3 -1
  93. agenta/client/backend/types/legacy_scopes_response.py +3 -1
  94. agenta/client/backend/types/legacy_subscription_request.py +3 -1
  95. agenta/client/backend/types/legacy_user_request.py +3 -1
  96. agenta/client/backend/types/legacy_user_response.py +3 -1
  97. agenta/client/backend/types/lifecycle_dto.py +3 -1
  98. agenta/client/backend/types/link_dto.py +3 -1
  99. agenta/client/backend/types/list_api_keys_response.py +3 -1
  100. agenta/client/backend/types/llm_run_rate_limit.py +3 -1
  101. agenta/client/backend/types/meta_request.py +3 -1
  102. agenta/client/backend/types/metrics_dto.py +3 -1
  103. agenta/client/backend/types/new_testset.py +3 -1
  104. agenta/client/backend/types/node_dto.py +3 -1
  105. agenta/client/backend/types/o_tel_context_dto.py +3 -1
  106. agenta/client/backend/types/o_tel_event.py +6 -4
  107. agenta/client/backend/types/o_tel_event_dto.py +3 -1
  108. agenta/client/backend/types/o_tel_extra_dto.py +3 -1
  109. agenta/client/backend/types/o_tel_flat_span.py +6 -4
  110. agenta/client/backend/types/o_tel_link.py +6 -4
  111. agenta/client/backend/types/o_tel_link_dto.py +3 -1
  112. agenta/client/backend/types/o_tel_links_response.py +3 -1
  113. agenta/client/backend/types/o_tel_span.py +1 -1
  114. agenta/client/backend/types/o_tel_span_dto.py +3 -1
  115. agenta/client/backend/types/o_tel_spans_tree.py +3 -1
  116. agenta/client/backend/types/o_tel_tracing_data_response.py +3 -1
  117. agenta/client/backend/types/o_tel_tracing_request.py +3 -1
  118. agenta/client/backend/types/o_tel_tracing_response.py +3 -1
  119. agenta/client/backend/types/organization.py +3 -1
  120. agenta/client/backend/types/organization_details.py +3 -1
  121. agenta/client/backend/types/organization_membership_request.py +3 -1
  122. agenta/client/backend/types/organization_output.py +3 -1
  123. agenta/client/backend/types/organization_request.py +3 -1
  124. agenta/client/backend/types/parent_dto.py +3 -1
  125. agenta/client/backend/types/project_membership_request.py +3 -1
  126. agenta/client/backend/types/project_request.py +3 -1
  127. agenta/client/backend/types/project_scope.py +3 -1
  128. agenta/client/backend/types/projects_response.py +3 -1
  129. agenta/client/backend/types/reference.py +6 -4
  130. agenta/client/backend/types/reference_dto.py +3 -1
  131. agenta/client/backend/types/reference_request_model.py +3 -1
  132. agenta/client/backend/types/result.py +3 -1
  133. agenta/client/backend/types/root_dto.py +3 -1
  134. agenta/client/backend/types/scopes_response_model.py +3 -1
  135. agenta/client/backend/types/secret_dto.py +3 -1
  136. agenta/client/backend/types/secret_response_dto.py +3 -1
  137. agenta/client/backend/types/simple_evaluation_output.py +3 -1
  138. agenta/client/backend/types/span_dto.py +6 -4
  139. agenta/client/backend/types/standard_provider_dto.py +3 -1
  140. agenta/client/backend/types/standard_provider_settings_dto.py +3 -1
  141. agenta/client/backend/types/status_dto.py +3 -1
  142. agenta/client/backend/types/tags_request.py +3 -1
  143. agenta/client/backend/types/testcase_response.py +6 -4
  144. agenta/client/backend/types/testset.py +6 -4
  145. agenta/client/backend/types/{test_set_output_response.py → testset_output_response.py} +4 -2
  146. agenta/client/backend/types/testset_request.py +3 -1
  147. agenta/client/backend/types/testset_response.py +3 -1
  148. agenta/client/backend/types/{test_set_simple_response.py → testset_simple_response.py} +4 -2
  149. agenta/client/backend/types/testsets_response.py +3 -1
  150. agenta/client/backend/types/time_dto.py +3 -1
  151. agenta/client/backend/types/tree_dto.py +3 -1
  152. agenta/client/backend/types/update_app_output.py +3 -1
  153. agenta/client/backend/types/user_request.py +3 -1
  154. agenta/client/backend/types/validation_error.py +3 -1
  155. agenta/client/backend/types/workflow_artifact.py +6 -4
  156. agenta/client/backend/types/workflow_data.py +3 -1
  157. agenta/client/backend/types/workflow_flags.py +3 -1
  158. agenta/client/backend/types/workflow_request.py +3 -1
  159. agenta/client/backend/types/workflow_response.py +3 -1
  160. agenta/client/backend/types/workflow_revision.py +6 -4
  161. agenta/client/backend/types/workflow_revision_request.py +3 -1
  162. agenta/client/backend/types/workflow_revision_response.py +3 -1
  163. agenta/client/backend/types/workflow_revisions_response.py +3 -1
  164. agenta/client/backend/types/workflow_variant.py +6 -4
  165. agenta/client/backend/types/workflow_variant_request.py +3 -1
  166. agenta/client/backend/types/workflow_variant_response.py +3 -1
  167. agenta/client/backend/types/workflow_variants_response.py +3 -1
  168. agenta/client/backend/types/workflows_response.py +3 -1
  169. agenta/client/backend/types/workspace.py +3 -1
  170. agenta/client/backend/types/workspace_member_response.py +3 -1
  171. agenta/client/backend/types/workspace_membership_request.py +3 -1
  172. agenta/client/backend/types/workspace_permission.py +3 -1
  173. agenta/client/backend/types/workspace_request.py +3 -1
  174. agenta/client/backend/types/workspace_response.py +3 -1
  175. agenta/client/backend/vault/raw_client.py +4 -4
  176. agenta/client/backend/workspace/client.py +2 -2
  177. agenta/client/client.py +102 -88
  178. agenta/sdk/__init__.py +52 -3
  179. agenta/sdk/agenta_init.py +43 -16
  180. agenta/sdk/assets.py +23 -15
  181. agenta/sdk/context/serving.py +20 -8
  182. agenta/sdk/context/tracing.py +40 -22
  183. agenta/sdk/contexts/__init__.py +0 -0
  184. agenta/sdk/contexts/routing.py +38 -0
  185. agenta/sdk/contexts/running.py +57 -0
  186. agenta/sdk/contexts/tracing.py +86 -0
  187. agenta/sdk/decorators/__init__.py +1 -0
  188. agenta/sdk/decorators/routing.py +284 -0
  189. agenta/sdk/decorators/running.py +692 -98
  190. agenta/sdk/decorators/serving.py +20 -21
  191. agenta/sdk/decorators/tracing.py +176 -131
  192. agenta/sdk/engines/__init__.py +0 -0
  193. agenta/sdk/engines/running/__init__.py +0 -0
  194. agenta/sdk/engines/running/utils.py +17 -0
  195. agenta/sdk/engines/tracing/__init__.py +1 -0
  196. agenta/sdk/engines/tracing/attributes.py +185 -0
  197. agenta/sdk/engines/tracing/conventions.py +49 -0
  198. agenta/sdk/engines/tracing/exporters.py +130 -0
  199. agenta/sdk/engines/tracing/inline.py +1154 -0
  200. agenta/sdk/engines/tracing/processors.py +190 -0
  201. agenta/sdk/engines/tracing/propagation.py +102 -0
  202. agenta/sdk/engines/tracing/spans.py +136 -0
  203. agenta/sdk/engines/tracing/tracing.py +324 -0
  204. agenta/sdk/evaluations/__init__.py +2 -0
  205. agenta/sdk/evaluations/metrics.py +37 -0
  206. agenta/sdk/evaluations/preview/__init__.py +0 -0
  207. agenta/sdk/evaluations/preview/evaluate.py +765 -0
  208. agenta/sdk/evaluations/preview/utils.py +861 -0
  209. agenta/sdk/evaluations/results.py +66 -0
  210. agenta/sdk/evaluations/runs.py +153 -0
  211. agenta/sdk/evaluations/scenarios.py +48 -0
  212. agenta/sdk/litellm/litellm.py +12 -0
  213. agenta/sdk/litellm/mockllm.py +6 -8
  214. agenta/sdk/litellm/mocks/__init__.py +5 -5
  215. agenta/sdk/managers/applications.py +304 -0
  216. agenta/sdk/managers/config.py +2 -2
  217. agenta/sdk/managers/evaluations.py +0 -0
  218. agenta/sdk/managers/evaluators.py +303 -0
  219. agenta/sdk/managers/secrets.py +161 -24
  220. agenta/sdk/managers/shared.py +3 -1
  221. agenta/sdk/managers/testsets.py +441 -0
  222. agenta/sdk/managers/vault.py +3 -3
  223. agenta/sdk/middleware/auth.py +0 -176
  224. agenta/sdk/middleware/config.py +27 -9
  225. agenta/sdk/middleware/vault.py +204 -9
  226. agenta/sdk/middlewares/__init__.py +0 -0
  227. agenta/sdk/middlewares/routing/__init__.py +0 -0
  228. agenta/sdk/middlewares/routing/auth.py +263 -0
  229. agenta/sdk/middlewares/routing/cors.py +30 -0
  230. agenta/sdk/middlewares/routing/otel.py +29 -0
  231. agenta/sdk/middlewares/running/__init__.py +0 -0
  232. agenta/sdk/middlewares/running/normalizer.py +321 -0
  233. agenta/sdk/middlewares/running/resolver.py +161 -0
  234. agenta/sdk/middlewares/running/vault.py +140 -0
  235. agenta/sdk/models/__init__.py +0 -0
  236. agenta/sdk/models/blobs.py +33 -0
  237. agenta/sdk/models/evaluations.py +119 -0
  238. agenta/sdk/models/git.py +126 -0
  239. agenta/sdk/models/shared.py +167 -0
  240. agenta/sdk/models/testsets.py +163 -0
  241. agenta/sdk/models/tracing.py +202 -0
  242. agenta/sdk/models/workflows.py +753 -0
  243. agenta/sdk/tracing/attributes.py +4 -4
  244. agenta/sdk/tracing/exporters.py +67 -17
  245. agenta/sdk/tracing/inline.py +37 -45
  246. agenta/sdk/tracing/processors.py +97 -0
  247. agenta/sdk/tracing/propagation.py +3 -1
  248. agenta/sdk/tracing/spans.py +4 -0
  249. agenta/sdk/tracing/tracing.py +13 -15
  250. agenta/sdk/types.py +222 -22
  251. agenta/sdk/utils/cache.py +1 -1
  252. agenta/sdk/utils/client.py +38 -0
  253. agenta/sdk/utils/helpers.py +13 -12
  254. agenta/sdk/utils/logging.py +18 -78
  255. agenta/sdk/utils/references.py +23 -0
  256. agenta/sdk/workflows/builtin.py +600 -0
  257. agenta/sdk/workflows/configurations.py +22 -0
  258. agenta/sdk/workflows/errors.py +292 -0
  259. agenta/sdk/workflows/handlers.py +1791 -0
  260. agenta/sdk/workflows/interfaces.py +948 -0
  261. agenta/sdk/workflows/sandbox.py +118 -0
  262. agenta/sdk/workflows/utils.py +303 -6
  263. {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/METADATA +37 -33
  264. agenta-0.63.2.dist-info/RECORD +421 -0
  265. {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/WHEEL +1 -1
  266. agenta/sdk/middleware/adapt.py +0 -253
  267. agenta/sdk/middleware/base.py +0 -40
  268. agenta/sdk/middleware/flags.py +0 -40
  269. agenta/sdk/workflows/types.py +0 -472
  270. agenta-0.52.6.dist-info/RECORD +0 -371
  271. /agenta/sdk/{workflows → engines/running}/registry.py +0 -0
@@ -0,0 +1,765 @@
1
+ from typing import Dict, List, Any, Union, Optional, Tuple
2
+ from uuid import UUID
3
+ from copy import deepcopy
4
+ from datetime import datetime
5
+
6
+ from pydantic import BaseModel
7
+
8
+ from agenta.sdk.models.evaluations import (
9
+ Origin,
10
+ Target,
11
+ Link,
12
+ Reference,
13
+ SimpleEvaluationData,
14
+ )
15
+ from agenta.sdk.models.workflows import (
16
+ ApplicationRevision,
17
+ EvaluatorRevision,
18
+ WorkflowServiceRequestData,
19
+ ApplicationServiceRequest,
20
+ EvaluatorServiceRequest,
21
+ )
22
+ from agenta.sdk.models.testsets import TestsetRevision
23
+
24
+ from agenta.sdk.utils.references import get_slug_from_name_and_id
25
+ from agenta.sdk.evaluations.preview.utils import fetch_trace_data
26
+
27
+ from agenta.sdk.managers.testsets import (
28
+ acreate as acreate_testset,
29
+ aretrieve as aretrieve_testset,
30
+ )
31
+ from agenta.sdk.managers.applications import (
32
+ aupsert as aupsert_application,
33
+ aretrieve as aretrieve_application,
34
+ )
35
+ from agenta.sdk.managers.evaluators import (
36
+ aupsert as aupsert_evaluator,
37
+ aretrieve as aretrieve_evaluator,
38
+ )
39
+ from agenta.sdk.evaluations.runs import (
40
+ acreate as acreate_run,
41
+ aclose as aclose_run,
42
+ aurl as aget_url,
43
+ )
44
+ from agenta.sdk.evaluations.scenarios import (
45
+ acreate as aadd_scenario,
46
+ )
47
+ from agenta.sdk.evaluations.results import (
48
+ acreate as alog_result,
49
+ )
50
+ from agenta.sdk.evaluations.metrics import (
51
+ arefresh as acompute_metrics,
52
+ )
53
+
54
+
55
+ from agenta.sdk.models.workflows import (
56
+ WorkflowServiceInterface,
57
+ WorkflowServiceConfiguration,
58
+ )
59
+ from agenta.sdk.decorators.running import (
60
+ invoke_application,
61
+ invoke_evaluator,
62
+ )
63
+
64
+
65
class EvaluateSpecs(BaseModel):
    """Optional bundle of evaluation settings accepted through ``specs``.

    Each field mirrors the keyword argument of the same name; every field
    defaults to ``None``.
    """

    # Targets for the three evaluation stages.
    testsets: Optional[Target] = None
    applications: Optional[Target] = None
    evaluators: Optional[Target] = None

    # Forwarded as ``repeats`` when the evaluation data is assembled.
    repeats: Optional[int] = None
71
+
72
+
73
async def _parse_evaluate_kwargs(
    *,
    testsets: Optional[Target] = None,
    applications: Optional[Target] = None,
    evaluators: Optional[Target] = None,
    #
    repeats: Optional[int] = None,
    #
    specs: Optional[Union[EvaluateSpecs, Dict[str, Any]]] = None,
) -> SimpleEvaluationData:
    """Combine explicit keyword arguments with an optional ``specs`` bundle.

    A truthy keyword argument wins over the matching ``specs`` field; the
    ``specs`` value is only consulted when the keyword argument is falsy.

    Args:
        testsets: Testset targets.
        applications: Application targets.
        evaluators: Evaluator targets.
        repeats: Repeat count.
        specs: Either an ``EvaluateSpecs`` or a dict of its fields. Any other
            truthy value is ignored.

    Returns:
        The assembled ``SimpleEvaluationData``.

    Raises:
        ValueError: If testsets, applications, or evaluators end up empty.
    """
    # Operate on a copy so the caller's object is left untouched.
    bundle = deepcopy(specs)
    if isinstance(bundle, dict):
        bundle = EvaluateSpecs(**bundle)
    if bundle and not isinstance(bundle, EvaluateSpecs):
        bundle = None

    def _from_bundle(field: str) -> Any:
        # Only reads the bundle when one is present and truthy.
        return getattr(bundle, field) if bundle else None

    data = SimpleEvaluationData(
        testset_steps=testsets or _from_bundle("testsets"),
        application_steps=applications or _from_bundle("applications"),
        evaluator_steps=evaluators or _from_bundle("evaluators"),
        #
        repeats=repeats or _from_bundle("repeats"),
    )

    # Checked in this order so the first missing stage is the one reported.
    required = (
        ("testset_steps", "testsets"),
        ("application_steps", "applications"),
        ("evaluator_steps", "evaluators"),
    )
    for attribute, label in required:
        if not getattr(data, attribute):
            raise ValueError(f"Invalid 'evaluate()' specs: missing {label}")

    return data
105
+
106
+
107
# Sentinel an upserter returns to signal "do not register this item".
_SKIP_STEP = object()


async def _create_testset_step(testcases_data: Any) -> Any:
    """Create a testset from inline testcases and return its identifier.

    A payload whose entries are not all dicts is skipped (not an error).
    """
    if not all(isinstance(testcase, dict) for testcase in testcases_data):
        return _SKIP_STEP
    return await acreate_testset(data=testcases_data)


async def _upsert_application_step(handler: Any) -> Any:
    """Upsert an application from a callable handler and return its identifier."""
    return await aupsert_application(handler=handler)


async def _upsert_evaluator_step(handler: Any) -> Any:
    """Upsert an evaluator from a callable handler and return its identifier."""
    return await aupsert_evaluator(handler=handler)


async def _normalize_steps(
    simple_evaluation_data: Any,
    *,
    field: str,
    label: str,
    is_inline: Any,
    upsert: Any,
) -> None:
    """Turn one ``*_steps`` attribute from a list into an ``{id: origin}`` dict.

    Args:
        simple_evaluation_data: Object holding the steps attribute; mutated
            in place.
        field: Attribute name, e.g. ``"testset_steps"``.
        label: Stage name used in the error message.
        is_inline: Predicate telling whether a list item is an inline
            definition that must be upserted.
        upsert: Coroutine function mapping an inline definition to an
            identifier, or to ``_SKIP_STEP``.

    Raises:
        ValueError: If the attribute is truthy but does not end up as a
            non-empty dict.
    """
    steps = getattr(simple_evaluation_data, field)
    if not steps:
        # Nothing provided for this stage: leave it untouched.
        return

    if isinstance(steps, list):
        resolved: Dict[str, Origin] = {}

        if all(isinstance(item, UUID) for item in steps):
            # Already identifiers: register them as-is.
            for item in steps:
                resolved[str(item)] = "custom"
        elif all(is_inline(item) for item in steps):
            for item in steps:
                revision_id = await upsert(item)
                if revision_id is _SKIP_STEP:
                    continue
                resolved[str(revision_id)] = "custom"

        # A mixed or unsupported list leaves ``resolved`` empty, which the
        # validation below rejects.
        setattr(simple_evaluation_data, field, resolved)

    # Validate the value as stored on the object.
    steps = getattr(simple_evaluation_data, field)
    if not steps or not isinstance(steps, dict):
        raise ValueError(
            f"Invalid 'evaluate()' specs: missing or invalid {label} steps",
        )


async def _upsert_entities(
    simple_evaluation_data: SimpleEvaluationData,
) -> SimpleEvaluationData:
    """Resolve list-style steps into ``{identifier: "custom"}`` dicts.

    For each of testsets, applications and evaluators (in that order):
    a list of ``UUID`` values is registered directly; a list of inline
    definitions (lists of testcase dicts for testsets, callables for
    applications and evaluators) is upserted first and the returned
    identifiers are registered. Steps that are already dicts are kept.

    Args:
        simple_evaluation_data: Evaluation data to normalise; mutated in place.

    Returns:
        The same ``simple_evaluation_data`` object.

    Raises:
        ValueError: If a provided stage cannot be resolved to a non-empty dict.
    """
    await _normalize_steps(
        simple_evaluation_data,
        field="testset_steps",
        label="testset",
        is_inline=lambda item: isinstance(item, list),
        upsert=_create_testset_step,
    )
    await _normalize_steps(
        simple_evaluation_data,
        field="application_steps",
        label="application",
        is_inline=callable,
        upsert=_upsert_application_step,
    )
    await _normalize_steps(
        simple_evaluation_data,
        field="evaluator_steps",
        label="evaluator",
        is_inline=callable,
        upsert=_upsert_evaluator_step,
    )

    return simple_evaluation_data
208
+
209
+
210
async def _retrieve_entities(
    simple_evaluation_data: SimpleEvaluationData,
) -> Tuple[
    Dict[UUID, TestsetRevision],
    Dict[UUID, ApplicationRevision],
    Dict[UUID, EvaluatorRevision],
]:
    """Fetch the revisions referenced by the testset, application and evaluator steps.

    Entries whose retrieval yields nothing are left out. Testset revisions
    are keyed by the retrieved revision's ``id``; application and evaluator
    revisions are keyed by the step key they were requested with.

    Args:
        simple_evaluation_data: Evaluation data whose ``*_steps`` are mappings.

    Returns:
        ``(testset_revisions, application_revisions, evaluator_revisions)``.
    """
    testset_revisions: Dict[UUID, TestsetRevision] = {}
    # NOTE(review): the step key is passed as ``testset_id`` here, while the
    # other two stages pass theirs as ``*_revision_id`` -- confirm intended.
    for testset_key, _origin in simple_evaluation_data.testset_steps.items():
        fetched_testset = await aretrieve_testset(
            testset_id=testset_key,
        )
        if fetched_testset and fetched_testset.id:
            testset_revisions[fetched_testset.id] = fetched_testset

    application_revisions: Dict[UUID, ApplicationRevision] = {}
    # NOTE(review): keys here are the step-dict keys, not necessarily UUID
    # instances as the annotation suggests -- verify against callers.
    for application_key, _origin in simple_evaluation_data.application_steps.items():
        fetched_application = await aretrieve_application(
            application_revision_id=application_key,
        )
        if fetched_application:
            application_revisions[application_key] = fetched_application

    evaluator_revisions: Dict[UUID, EvaluatorRevision] = {}
    for evaluator_key, _origin in simple_evaluation_data.evaluator_steps.items():
        fetched_evaluator = await aretrieve_evaluator(
            evaluator_revision_id=evaluator_key,
        )
        if fetched_evaluator:
            evaluator_revisions[evaluator_key] = fetched_evaluator

    return testset_revisions, application_revisions, evaluator_revisions
258
+
259
+
260
+ def _timestamp_suffix():
261
+ suffix = datetime.now().strftime("%y-%m-%d · %H:%M")
262
+ return f" [{suffix}]"
263
+
264
+
265
# Text fragments used to draw the progress tree printed while an evaluation runs.
UNICODE = dict(
    here="• ",
    root="┌─ ",
    next="├─ ",
    last="└─ ",
    pipe="│ ",
    skip=" ",
    this="── ",
)
274
+
275
+
276
+ # @debug
277
+ async def aevaluate(
278
+ *,
279
+ name: Optional[str] = None,
280
+ description: Optional[str] = None,
281
+ #
282
+ testsets: Optional[Target] = None,
283
+ applications: Optional[Target] = None,
284
+ evaluators: Optional[Target] = None,
285
+ #
286
+ repeats: Optional[int] = None,
287
+ #
288
+ specs: Optional[Union[EvaluateSpecs, Dict[str, Any]]] = None,
289
+ ):
290
+ simple_evaluation_data = await _parse_evaluate_kwargs(
291
+ testsets=testsets,
292
+ applications=applications,
293
+ evaluators=evaluators,
294
+ repeats=repeats,
295
+ specs=specs,
296
+ )
297
+
298
+ simple_evaluation_data = await _upsert_entities(
299
+ simple_evaluation_data=simple_evaluation_data,
300
+ )
301
+
302
+ print()
303
+ print(
304
+ "────────────────────────────────────────────────────────────────────────────"
305
+ )
306
+ print(f"Evaluation running...")
307
+ print(
308
+ "────────────────────────────────────────────────────────────────────────────"
309
+ )
310
+
311
+ suffix = _timestamp_suffix()
312
+ name = f"{name}{suffix}"
313
+
314
+ run = await acreate_run(
315
+ name=name,
316
+ description=description,
317
+ #
318
+ testset_steps=simple_evaluation_data.testset_steps,
319
+ application_steps=simple_evaluation_data.application_steps,
320
+ evaluator_steps=simple_evaluation_data.evaluator_steps,
321
+ #
322
+ repeats=simple_evaluation_data.repeats,
323
+ )
324
+
325
+ print(
326
+ f"{UNICODE['here']}"
327
+ f"{UNICODE['skip']}"
328
+ f"{UNICODE['skip']}"
329
+ f"{UNICODE['skip']}"
330
+ f"{UNICODE['skip']}"
331
+ f" run_id={str(run.id)}",
332
+ )
333
+
334
+ if not run.id:
335
+ print("[failure] could not create evaluation")
336
+ return None
337
+
338
+ (
339
+ testset_revisions,
340
+ application_revisions,
341
+ evaluator_revisions,
342
+ ) = await _retrieve_entities(
343
+ simple_evaluation_data=simple_evaluation_data,
344
+ )
345
+
346
+ scenarios = list()
347
+
348
+ metrics = dict()
349
+
350
+ for testset_revision in testset_revisions.values():
351
+ if not testset_revision.data or not testset_revision.data.testcases:
352
+ continue
353
+
354
+ testcases = testset_revision.data.testcases
355
+
356
+ print(
357
+ f"{UNICODE['next']}"
358
+ f"{UNICODE['here']}"
359
+ f"{UNICODE['skip']}"
360
+ f"{UNICODE['skip']}"
361
+ f"{UNICODE['skip']}"
362
+ f" testset_id={str(testset_revision.testset_id)}",
363
+ )
364
+
365
+ for testcase_idx, testcase in enumerate(testcases):
366
+ print(
367
+ f"{UNICODE['pipe']}"
368
+ f"{UNICODE['pipe']}"
369
+ f"{UNICODE['skip']}"
370
+ f"{UNICODE['skip']}"
371
+ f"{UNICODE['skip']}"
372
+ "-----------------------"
373
+ "--------------------------------------"
374
+ )
375
+
376
+ print(
377
+ f"{UNICODE['pipe']}"
378
+ f"{UNICODE['next' if testcase_idx < len(testcases) - 1 else 'last']}"
379
+ f"{UNICODE['here']}"
380
+ f"{UNICODE['skip']}"
381
+ f"{UNICODE['skip']}"
382
+ f"testcase_id={str(testcase.id)}",
383
+ )
384
+
385
+ scenario = await aadd_scenario(
386
+ run_id=run.id,
387
+ )
388
+
389
+ print(
390
+ f"{UNICODE['pipe']}"
391
+ f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
392
+ f"{UNICODE['next']}"
393
+ f"{UNICODE['here']}"
394
+ f"{UNICODE['skip']}"
395
+ f"scenario_id={str(scenario.id)}",
396
+ )
397
+
398
+ results = dict()
399
+
400
+ result = await alog_result(
401
+ run_id=run.id,
402
+ scenario_id=scenario.id,
403
+ step_key="testset-" + testset_revision.slug, # type: ignore
404
+ testcase_id=testcase.id,
405
+ )
406
+
407
+ print(
408
+ f"{UNICODE['pipe']}"
409
+ f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
410
+ f"{UNICODE['pipe']}"
411
+ f"{UNICODE['next']}"
412
+ f"{UNICODE['here']}"
413
+ f" result_id={str(result.id)} (testcase)",
414
+ )
415
+
416
+ results[testset_revision.slug] = result
417
+
418
+ _testcase = testcase.model_dump(
419
+ mode="json",
420
+ exclude_none=True,
421
+ ) # type: ignore
422
+ inputs = testcase.data
423
+ if isinstance(inputs, dict):
424
+ if "testcase_dedup_id" in inputs:
425
+ del inputs["testcase_dedup_id"]
426
+
427
+ for application_revision in application_revisions.values():
428
+ if not application_revision or not application_revision.data:
429
+ print("Missing or invalid application revision")
430
+ if application_revision:
431
+ print(application_revision.model_dump(exclude_none=True))
432
+ continue
433
+
434
+ # print(f" Application {application_revision.model_dump(exclude_none=True)}") # type: ignore
435
+
436
+ references = dict(
437
+ testset=Reference(
438
+ id=testset_revision.testset_id,
439
+ ),
440
+ testset_variant=Reference(
441
+ id=testset_revision.testset_variant_id,
442
+ ),
443
+ testset_revision=Reference(
444
+ id=testset_revision.id,
445
+ slug=testset_revision.slug,
446
+ version=testset_revision.version,
447
+ ),
448
+ application=Reference(
449
+ id=application_revision.application_id,
450
+ ),
451
+ application_variant=Reference(
452
+ id=application_revision.application_variant_id,
453
+ ),
454
+ application_revision=Reference(
455
+ id=application_revision.id,
456
+ slug=application_revision.slug,
457
+ version=application_revision.version,
458
+ ),
459
+ )
460
+ links = None
461
+
462
+ _revision = application_revision.model_dump(
463
+ mode="json",
464
+ exclude_none=True,
465
+ )
466
+ interface = WorkflowServiceInterface(
467
+ **(
468
+ application_revision.data.model_dump()
469
+ if application_revision.data
470
+ else {}
471
+ )
472
+ )
473
+ configuration = WorkflowServiceConfiguration(
474
+ **(
475
+ application_revision.data.model_dump()
476
+ if application_revision.data
477
+ else {}
478
+ )
479
+ )
480
+ parameters = application_revision.data.parameters
481
+
482
+ _trace = None
483
+ outputs = None
484
+
485
+ workflow_service_request_data = WorkflowServiceRequestData(
486
+ revision=_revision,
487
+ parameters=parameters,
488
+ #
489
+ testcase=_testcase,
490
+ inputs=inputs,
491
+ #
492
+ trace=_trace,
493
+ outputs=outputs,
494
+ )
495
+
496
+ application_request = ApplicationServiceRequest(
497
+ interface=interface,
498
+ configuration=configuration,
499
+ #
500
+ data=workflow_service_request_data,
501
+ #
502
+ references=references, # type: ignore
503
+ links=links, # type: ignore
504
+ )
505
+
506
+ application_response = await invoke_application(
507
+ request=application_request,
508
+ )
509
+
510
+ if (
511
+ not application_response
512
+ or not application_response.data
513
+ or not application_response.trace_id
514
+ ):
515
+ print("Missing or invalid application response")
516
+ if application_response:
517
+ print(application_response.model_dump(exclude_none=True))
518
+ continue
519
+
520
+ trace_id = application_response.trace_id
521
+
522
+ if not application_revision.id or not application_revision.name:
523
+ print("Missing application revision ID or name")
524
+ continue
525
+
526
+ application_slug = get_slug_from_name_and_id(
527
+ name=application_revision.name,
528
+ id=application_revision.id,
529
+ )
530
+
531
+ trace = fetch_trace_data(trace_id, max_retries=30, delay=1.0)
532
+
533
+ result = await alog_result(
534
+ run_id=run.id,
535
+ scenario_id=scenario.id,
536
+ step_key="application-" + application_slug, # type: ignore
537
+ trace_id=trace_id,
538
+ )
539
+
540
+ print(
541
+ f"{UNICODE['pipe']}"
542
+ f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
543
+ f"{UNICODE['pipe']}"
544
+ f"{UNICODE['next']}"
545
+ f"{UNICODE['here']}"
546
+ f" result_id={str(result.id)} (invocation)",
547
+ )
548
+
549
+ results[application_slug] = result
550
+
551
+ trace = await trace
552
+
553
+ if not trace:
554
+ print("Failed to fetch trace data for application")
555
+ continue
556
+
557
+ root_span = list(trace.get("spans", {}).values())[0]
558
+ trace_attributes: dict = root_span.get("attributes", {})
559
+ trace_attributes_ag: dict = trace_attributes.get("ag", {})
560
+ trace_attributes_ag_data: dict = trace_attributes_ag.get("data", {})
561
+ outputs = trace_attributes_ag_data.get("outputs")
562
+ inputs = inputs or trace_attributes_ag_data.get("inputs")
563
+
564
+ for i, evaluator_revision in enumerate(evaluator_revisions.values()):
565
+ if not evaluator_revision or not evaluator_revision.data:
566
+ print("Missing or invalid evaluator revision")
567
+ if evaluator_revision:
568
+ print(evaluator_revision.model_dump(exclude_none=True))
569
+ continue
570
+
571
+ references = dict(
572
+ testset=Reference(
573
+ id=testset_revision.testset_id,
574
+ ),
575
+ testset_variant=Reference(
576
+ id=testset_revision.testset_variant_id,
577
+ ),
578
+ testset_revision=Reference(
579
+ id=testset_revision.id,
580
+ slug=testset_revision.slug,
581
+ version=testset_revision.version,
582
+ ),
583
+ evaluator=Reference(
584
+ id=evaluator_revision.evaluator_id,
585
+ ),
586
+ evaluator_variant=Reference(
587
+ id=evaluator_revision.evaluator_variant_id,
588
+ ),
589
+ evaluator_revision=Reference(
590
+ id=evaluator_revision.id,
591
+ slug=evaluator_revision.slug,
592
+ version=evaluator_revision.version,
593
+ ),
594
+ )
595
+ links = (
596
+ dict(
597
+ invocation=Link(
598
+ trace_id=application_response.trace_id,
599
+ span_id=application_response.span_id,
600
+ )
601
+ )
602
+ if application_response.trace_id
603
+ and application_response.span_id
604
+ else None
605
+ )
606
+
607
+ _revision = evaluator_revision.model_dump(
608
+ mode="json",
609
+ exclude_none=True,
610
+ )
611
+ interface = WorkflowServiceInterface(
612
+ **(
613
+ evaluator_revision.data.model_dump()
614
+ if evaluator_revision.data
615
+ else {}
616
+ )
617
+ )
618
+ configuration = WorkflowServiceConfiguration(
619
+ **(
620
+ evaluator_revision.data.model_dump()
621
+ if evaluator_revision.data
622
+ else {}
623
+ )
624
+ )
625
+ parameters = evaluator_revision.data.parameters
626
+
627
+ workflow_service_request_data = WorkflowServiceRequestData(
628
+ revision=_revision,
629
+ parameters=parameters,
630
+ #
631
+ testcase=_testcase,
632
+ inputs=inputs,
633
+ #
634
+ trace=trace,
635
+ outputs=outputs,
636
+ )
637
+
638
+ evaluator_request = EvaluatorServiceRequest(
639
+ version="2025.07.14",
640
+ #
641
+ interface=interface,
642
+ configuration=configuration,
643
+ #
644
+ data=workflow_service_request_data,
645
+ #
646
+ references=references, # type: ignore
647
+ links=links, # type: ignore
648
+ )
649
+
650
+ evaluator_response = await invoke_evaluator(
651
+ request=evaluator_request,
652
+ #
653
+ annotate=True,
654
+ )
655
+
656
+ if (
657
+ not evaluator_response
658
+ or not evaluator_response.data
659
+ or not evaluator_response.trace_id
660
+ ):
661
+ print("Missing or invalid evaluator response")
662
+ if evaluator_response:
663
+ print(evaluator_response.model_dump(exclude_none=True))
664
+ continue
665
+
666
+ trace_id = evaluator_response.trace_id
667
+
668
+ trace = fetch_trace_data(trace_id, max_retries=20, delay=1.0)
669
+
670
+ result = await alog_result(
671
+ run_id=run.id,
672
+ scenario_id=scenario.id,
673
+ step_key="evaluator-" + evaluator_revision.slug, # type: ignore
674
+ trace_id=trace_id,
675
+ )
676
+
677
+ print(
678
+ f"{UNICODE['pipe']}"
679
+ f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
680
+ f"{UNICODE['pipe']}"
681
+ f"{UNICODE['last' if (i == len(evaluator_revisions) - 1) else 'next']}"
682
+ f"{UNICODE['here']}"
683
+ f" result_id={str(result.id)} (annotation)",
684
+ )
685
+
686
+ results[evaluator_revision.slug] = result
687
+
688
+ trace = await trace
689
+
690
+ if not trace:
691
+ print("Failed to fetch trace data for evaluator")
692
+ continue
693
+
694
+ metrics = await acompute_metrics(
695
+ run_id=run.id,
696
+ scenario_id=scenario.id,
697
+ )
698
+
699
+ print(
700
+ f"{UNICODE['pipe']}"
701
+ f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
702
+ f"{UNICODE['last']}"
703
+ f"{UNICODE['here']}"
704
+ f"{UNICODE['skip']}"
705
+ f" metrics_id={str(metrics.id)}",
706
+ )
707
+
708
+ scenarios.append(
709
+ {
710
+ "scenario": scenario,
711
+ "results": results,
712
+ "metrics": metrics,
713
+ },
714
+ )
715
+
716
+ print(
717
+ f"{UNICODE['pipe']}"
718
+ f"{UNICODE['skip']}"
719
+ f"{UNICODE['skip']}"
720
+ f"{UNICODE['skip']}"
721
+ f"{UNICODE['skip']}"
722
+ "-----------------------"
723
+ "--------------------------------------"
724
+ )
725
+
726
+ metrics = dict()
727
+
728
+ if len(scenarios) > 0:
729
+ metrics = await acompute_metrics(
730
+ run_id=run.id,
731
+ )
732
+
733
+ print(
734
+ f"{UNICODE['last']}"
735
+ f"{UNICODE['here']}"
736
+ f"{UNICODE['skip']}"
737
+ f"{UNICODE['skip']}"
738
+ f"{UNICODE['skip']}"
739
+ f" metrics_id={str(metrics.id)}",
740
+ )
741
+
742
+ run = await aclose_run(
743
+ run_id=run.id,
744
+ )
745
+
746
+ run_url = await aget_url(run_id=run.id)
747
+
748
+ print(
749
+ "────────────────────────────────────────────────────────────────────────────"
750
+ )
751
+ print(f"Evaluation finished.")
752
+ print(
753
+ "----------------------------------------------------------------------------"
754
+ )
755
+ print(f"Evaluation URL: {run_url or '[unavailable]'}")
756
+ print(
757
+ "────────────────────────────────────────────────────────────────────────────"
758
+ )
759
+ print()
760
+
761
+ return dict(
762
+ run=run,
763
+ scenarios=scenarios,
764
+ metrics=metrics,
765
+ )