agenta 0.57.0__py3-none-any.whl → 0.63.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267)
  1. agenta/__init__.py +12 -3
  2. agenta/client/__init__.py +4 -4
  3. agenta/client/backend/__init__.py +4 -4
  4. agenta/client/backend/api_keys/client.py +2 -2
  5. agenta/client/backend/billing/client.py +2 -2
  6. agenta/client/backend/billing/raw_client.py +2 -2
  7. agenta/client/backend/client.py +56 -48
  8. agenta/client/backend/core/client_wrapper.py +2 -2
  9. agenta/client/backend/core/file.py +3 -1
  10. agenta/client/backend/core/http_client.py +3 -3
  11. agenta/client/backend/core/pydantic_utilities.py +13 -3
  12. agenta/client/backend/human_evaluations/client.py +2 -2
  13. agenta/client/backend/human_evaluations/raw_client.py +2 -2
  14. agenta/client/backend/organization/client.py +46 -34
  15. agenta/client/backend/organization/raw_client.py +32 -26
  16. agenta/client/backend/raw_client.py +26 -26
  17. agenta/client/backend/testsets/client.py +18 -18
  18. agenta/client/backend/testsets/raw_client.py +30 -30
  19. agenta/client/backend/types/__init__.py +4 -4
  20. agenta/client/backend/types/account_request.py +3 -1
  21. agenta/client/backend/types/account_response.py +3 -1
  22. agenta/client/backend/types/agenta_node_dto.py +3 -1
  23. agenta/client/backend/types/agenta_nodes_response.py +3 -1
  24. agenta/client/backend/types/agenta_root_dto.py +3 -1
  25. agenta/client/backend/types/agenta_roots_response.py +3 -1
  26. agenta/client/backend/types/agenta_tree_dto.py +3 -1
  27. agenta/client/backend/types/agenta_trees_response.py +3 -1
  28. agenta/client/backend/types/aggregated_result.py +3 -1
  29. agenta/client/backend/types/analytics_response.py +3 -1
  30. agenta/client/backend/types/annotation.py +6 -4
  31. agenta/client/backend/types/annotation_create.py +3 -1
  32. agenta/client/backend/types/annotation_edit.py +3 -1
  33. agenta/client/backend/types/annotation_link.py +3 -1
  34. agenta/client/backend/types/annotation_link_response.py +3 -1
  35. agenta/client/backend/types/annotation_query.py +3 -1
  36. agenta/client/backend/types/annotation_query_request.py +3 -1
  37. agenta/client/backend/types/annotation_reference.py +3 -1
  38. agenta/client/backend/types/annotation_references.py +3 -1
  39. agenta/client/backend/types/annotation_response.py +3 -1
  40. agenta/client/backend/types/annotations_response.py +3 -1
  41. agenta/client/backend/types/app.py +3 -1
  42. agenta/client/backend/types/app_variant_response.py +3 -1
  43. agenta/client/backend/types/app_variant_revision.py +3 -1
  44. agenta/client/backend/types/artifact.py +6 -4
  45. agenta/client/backend/types/base_output.py +3 -1
  46. agenta/client/backend/types/body_fetch_workflow_revision.py +3 -1
  47. agenta/client/backend/types/body_import_testset.py +3 -1
  48. agenta/client/backend/types/bucket_dto.py +3 -1
  49. agenta/client/backend/types/collect_status_response.py +3 -1
  50. agenta/client/backend/types/config_db.py +3 -1
  51. agenta/client/backend/types/config_dto.py +3 -1
  52. agenta/client/backend/types/config_response_model.py +3 -1
  53. agenta/client/backend/types/correct_answer.py +3 -1
  54. agenta/client/backend/types/create_app_output.py +3 -1
  55. agenta/client/backend/types/custom_model_settings_dto.py +3 -1
  56. agenta/client/backend/types/custom_provider_dto.py +3 -1
  57. agenta/client/backend/types/custom_provider_kind.py +1 -1
  58. agenta/client/backend/types/custom_provider_settings_dto.py +3 -1
  59. agenta/client/backend/types/delete_evaluation.py +3 -1
  60. agenta/client/backend/types/environment_output.py +3 -1
  61. agenta/client/backend/types/environment_output_extended.py +3 -1
  62. agenta/client/backend/types/environment_revision.py +3 -1
  63. agenta/client/backend/types/error.py +3 -1
  64. agenta/client/backend/types/evaluation.py +3 -1
  65. agenta/client/backend/types/evaluation_scenario.py +3 -1
  66. agenta/client/backend/types/evaluation_scenario_input.py +3 -1
  67. agenta/client/backend/types/evaluation_scenario_output.py +3 -1
  68. agenta/client/backend/types/evaluation_scenario_result.py +3 -1
  69. agenta/client/backend/types/evaluator.py +6 -4
  70. agenta/client/backend/types/evaluator_config.py +6 -4
  71. agenta/client/backend/types/evaluator_flags.py +3 -1
  72. agenta/client/backend/types/evaluator_mapping_output_interface.py +3 -1
  73. agenta/client/backend/types/evaluator_output_interface.py +3 -1
  74. agenta/client/backend/types/evaluator_query.py +3 -1
  75. agenta/client/backend/types/evaluator_query_request.py +3 -1
  76. agenta/client/backend/types/evaluator_request.py +3 -1
  77. agenta/client/backend/types/evaluator_response.py +3 -1
  78. agenta/client/backend/types/evaluators_response.py +3 -1
  79. agenta/client/backend/types/exception_dto.py +3 -1
  80. agenta/client/backend/types/extended_o_tel_tracing_response.py +3 -1
  81. agenta/client/backend/types/get_config_response.py +3 -1
  82. agenta/client/backend/types/header.py +3 -1
  83. agenta/client/backend/types/http_validation_error.py +3 -1
  84. agenta/client/backend/types/human_evaluation.py +3 -1
  85. agenta/client/backend/types/human_evaluation_scenario.py +3 -1
  86. agenta/client/backend/types/human_evaluation_scenario_input.py +3 -1
  87. agenta/client/backend/types/human_evaluation_scenario_output.py +3 -1
  88. agenta/client/backend/types/invite_request.py +3 -1
  89. agenta/client/backend/types/legacy_analytics_response.py +3 -1
  90. agenta/client/backend/types/legacy_data_point.py +3 -1
  91. agenta/client/backend/types/legacy_evaluator.py +3 -1
  92. agenta/client/backend/types/legacy_scope_request.py +3 -1
  93. agenta/client/backend/types/legacy_scopes_response.py +3 -1
  94. agenta/client/backend/types/legacy_subscription_request.py +3 -1
  95. agenta/client/backend/types/legacy_user_request.py +3 -1
  96. agenta/client/backend/types/legacy_user_response.py +3 -1
  97. agenta/client/backend/types/lifecycle_dto.py +3 -1
  98. agenta/client/backend/types/link_dto.py +3 -1
  99. agenta/client/backend/types/list_api_keys_response.py +3 -1
  100. agenta/client/backend/types/llm_run_rate_limit.py +3 -1
  101. agenta/client/backend/types/meta_request.py +3 -1
  102. agenta/client/backend/types/metrics_dto.py +3 -1
  103. agenta/client/backend/types/new_testset.py +3 -1
  104. agenta/client/backend/types/node_dto.py +3 -1
  105. agenta/client/backend/types/o_tel_context_dto.py +3 -1
  106. agenta/client/backend/types/o_tel_event.py +6 -4
  107. agenta/client/backend/types/o_tel_event_dto.py +3 -1
  108. agenta/client/backend/types/o_tel_extra_dto.py +3 -1
  109. agenta/client/backend/types/o_tel_flat_span.py +6 -4
  110. agenta/client/backend/types/o_tel_link.py +6 -4
  111. agenta/client/backend/types/o_tel_link_dto.py +3 -1
  112. agenta/client/backend/types/o_tel_links_response.py +3 -1
  113. agenta/client/backend/types/o_tel_span.py +1 -1
  114. agenta/client/backend/types/o_tel_span_dto.py +3 -1
  115. agenta/client/backend/types/o_tel_spans_tree.py +3 -1
  116. agenta/client/backend/types/o_tel_tracing_data_response.py +3 -1
  117. agenta/client/backend/types/o_tel_tracing_request.py +3 -1
  118. agenta/client/backend/types/o_tel_tracing_response.py +3 -1
  119. agenta/client/backend/types/organization.py +3 -1
  120. agenta/client/backend/types/organization_details.py +3 -1
  121. agenta/client/backend/types/organization_membership_request.py +3 -1
  122. agenta/client/backend/types/organization_output.py +3 -1
  123. agenta/client/backend/types/organization_request.py +3 -1
  124. agenta/client/backend/types/parent_dto.py +3 -1
  125. agenta/client/backend/types/project_membership_request.py +3 -1
  126. agenta/client/backend/types/project_request.py +3 -1
  127. agenta/client/backend/types/project_scope.py +3 -1
  128. agenta/client/backend/types/projects_response.py +3 -1
  129. agenta/client/backend/types/reference.py +6 -4
  130. agenta/client/backend/types/reference_dto.py +3 -1
  131. agenta/client/backend/types/reference_request_model.py +3 -1
  132. agenta/client/backend/types/result.py +3 -1
  133. agenta/client/backend/types/root_dto.py +3 -1
  134. agenta/client/backend/types/scopes_response_model.py +3 -1
  135. agenta/client/backend/types/secret_dto.py +3 -1
  136. agenta/client/backend/types/secret_response_dto.py +3 -1
  137. agenta/client/backend/types/simple_evaluation_output.py +3 -1
  138. agenta/client/backend/types/span_dto.py +6 -4
  139. agenta/client/backend/types/standard_provider_dto.py +3 -1
  140. agenta/client/backend/types/standard_provider_settings_dto.py +3 -1
  141. agenta/client/backend/types/status_dto.py +3 -1
  142. agenta/client/backend/types/tags_request.py +3 -1
  143. agenta/client/backend/types/testcase_response.py +6 -4
  144. agenta/client/backend/types/testset.py +6 -4
  145. agenta/client/backend/types/{test_set_output_response.py → testset_output_response.py} +4 -2
  146. agenta/client/backend/types/testset_request.py +3 -1
  147. agenta/client/backend/types/testset_response.py +3 -1
  148. agenta/client/backend/types/{test_set_simple_response.py → testset_simple_response.py} +4 -2
  149. agenta/client/backend/types/testsets_response.py +3 -1
  150. agenta/client/backend/types/time_dto.py +3 -1
  151. agenta/client/backend/types/tree_dto.py +3 -1
  152. agenta/client/backend/types/update_app_output.py +3 -1
  153. agenta/client/backend/types/user_request.py +3 -1
  154. agenta/client/backend/types/validation_error.py +3 -1
  155. agenta/client/backend/types/workflow_artifact.py +6 -4
  156. agenta/client/backend/types/workflow_data.py +3 -1
  157. agenta/client/backend/types/workflow_flags.py +3 -1
  158. agenta/client/backend/types/workflow_request.py +3 -1
  159. agenta/client/backend/types/workflow_response.py +3 -1
  160. agenta/client/backend/types/workflow_revision.py +6 -4
  161. agenta/client/backend/types/workflow_revision_request.py +3 -1
  162. agenta/client/backend/types/workflow_revision_response.py +3 -1
  163. agenta/client/backend/types/workflow_revisions_response.py +3 -1
  164. agenta/client/backend/types/workflow_variant.py +6 -4
  165. agenta/client/backend/types/workflow_variant_request.py +3 -1
  166. agenta/client/backend/types/workflow_variant_response.py +3 -1
  167. agenta/client/backend/types/workflow_variants_response.py +3 -1
  168. agenta/client/backend/types/workflows_response.py +3 -1
  169. agenta/client/backend/types/workspace.py +3 -1
  170. agenta/client/backend/types/workspace_member_response.py +3 -1
  171. agenta/client/backend/types/workspace_membership_request.py +3 -1
  172. agenta/client/backend/types/workspace_permission.py +3 -1
  173. agenta/client/backend/types/workspace_request.py +3 -1
  174. agenta/client/backend/types/workspace_response.py +3 -1
  175. agenta/client/backend/workspace/client.py +2 -2
  176. agenta/client/client.py +102 -88
  177. agenta/sdk/__init__.py +52 -3
  178. agenta/sdk/agenta_init.py +43 -16
  179. agenta/sdk/assets.py +22 -15
  180. agenta/sdk/context/serving.py +20 -8
  181. agenta/sdk/context/tracing.py +40 -22
  182. agenta/sdk/contexts/__init__.py +0 -0
  183. agenta/sdk/contexts/routing.py +38 -0
  184. agenta/sdk/contexts/running.py +57 -0
  185. agenta/sdk/contexts/tracing.py +86 -0
  186. agenta/sdk/decorators/__init__.py +1 -0
  187. agenta/sdk/decorators/routing.py +284 -0
  188. agenta/sdk/decorators/running.py +692 -98
  189. agenta/sdk/decorators/serving.py +20 -21
  190. agenta/sdk/decorators/tracing.py +176 -131
  191. agenta/sdk/engines/__init__.py +0 -0
  192. agenta/sdk/engines/running/__init__.py +0 -0
  193. agenta/sdk/engines/running/utils.py +17 -0
  194. agenta/sdk/engines/tracing/__init__.py +1 -0
  195. agenta/sdk/engines/tracing/attributes.py +185 -0
  196. agenta/sdk/engines/tracing/conventions.py +49 -0
  197. agenta/sdk/engines/tracing/exporters.py +130 -0
  198. agenta/sdk/engines/tracing/inline.py +1154 -0
  199. agenta/sdk/engines/tracing/processors.py +190 -0
  200. agenta/sdk/engines/tracing/propagation.py +102 -0
  201. agenta/sdk/engines/tracing/spans.py +136 -0
  202. agenta/sdk/engines/tracing/tracing.py +324 -0
  203. agenta/sdk/evaluations/__init__.py +2 -0
  204. agenta/sdk/evaluations/metrics.py +37 -0
  205. agenta/sdk/evaluations/preview/__init__.py +0 -0
  206. agenta/sdk/evaluations/preview/evaluate.py +765 -0
  207. agenta/sdk/evaluations/preview/utils.py +861 -0
  208. agenta/sdk/evaluations/results.py +66 -0
  209. agenta/sdk/evaluations/runs.py +153 -0
  210. agenta/sdk/evaluations/scenarios.py +48 -0
  211. agenta/sdk/litellm/litellm.py +12 -0
  212. agenta/sdk/litellm/mockllm.py +6 -8
  213. agenta/sdk/litellm/mocks/__init__.py +5 -5
  214. agenta/sdk/managers/applications.py +304 -0
  215. agenta/sdk/managers/config.py +2 -2
  216. agenta/sdk/managers/evaluations.py +0 -0
  217. agenta/sdk/managers/evaluators.py +303 -0
  218. agenta/sdk/managers/secrets.py +161 -24
  219. agenta/sdk/managers/shared.py +3 -1
  220. agenta/sdk/managers/testsets.py +441 -0
  221. agenta/sdk/managers/vault.py +3 -3
  222. agenta/sdk/middleware/auth.py +0 -176
  223. agenta/sdk/middleware/vault.py +203 -8
  224. agenta/sdk/middlewares/__init__.py +0 -0
  225. agenta/sdk/middlewares/routing/__init__.py +0 -0
  226. agenta/sdk/middlewares/routing/auth.py +263 -0
  227. agenta/sdk/middlewares/routing/cors.py +30 -0
  228. agenta/sdk/middlewares/routing/otel.py +29 -0
  229. agenta/sdk/middlewares/running/__init__.py +0 -0
  230. agenta/sdk/middlewares/running/normalizer.py +321 -0
  231. agenta/sdk/middlewares/running/resolver.py +161 -0
  232. agenta/sdk/middlewares/running/vault.py +140 -0
  233. agenta/sdk/models/__init__.py +0 -0
  234. agenta/sdk/models/blobs.py +33 -0
  235. agenta/sdk/models/evaluations.py +119 -0
  236. agenta/sdk/models/git.py +126 -0
  237. agenta/sdk/models/shared.py +167 -0
  238. agenta/sdk/models/testsets.py +163 -0
  239. agenta/sdk/models/tracing.py +202 -0
  240. agenta/sdk/models/workflows.py +753 -0
  241. agenta/sdk/tracing/exporters.py +67 -17
  242. agenta/sdk/tracing/processors.py +97 -0
  243. agenta/sdk/tracing/propagation.py +3 -1
  244. agenta/sdk/tracing/spans.py +4 -0
  245. agenta/sdk/tracing/tracing.py +13 -13
  246. agenta/sdk/types.py +211 -17
  247. agenta/sdk/utils/cache.py +1 -1
  248. agenta/sdk/utils/client.py +38 -0
  249. agenta/sdk/utils/helpers.py +13 -12
  250. agenta/sdk/utils/logging.py +18 -78
  251. agenta/sdk/utils/references.py +23 -0
  252. agenta/sdk/workflows/builtin.py +600 -0
  253. agenta/sdk/workflows/configurations.py +22 -0
  254. agenta/sdk/workflows/errors.py +292 -0
  255. agenta/sdk/workflows/handlers.py +1791 -0
  256. agenta/sdk/workflows/interfaces.py +948 -0
  257. agenta/sdk/workflows/sandbox.py +118 -0
  258. agenta/sdk/workflows/utils.py +303 -6
  259. {agenta-0.57.0.dist-info → agenta-0.63.2.dist-info}/METADATA +33 -30
  260. agenta-0.63.2.dist-info/RECORD +421 -0
  261. agenta/sdk/middleware/adapt.py +0 -253
  262. agenta/sdk/middleware/base.py +0 -40
  263. agenta/sdk/middleware/flags.py +0 -40
  264. agenta/sdk/workflows/types.py +0 -472
  265. agenta-0.57.0.dist-info/RECORD +0 -371
  266. /agenta/sdk/{workflows → engines/running}/registry.py +0 -0
  267. {agenta-0.57.0.dist-info → agenta-0.63.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,1791 @@
1
+ from typing import List, Any, Optional, Any, Dict, Union
2
+ from json import dumps, loads
3
+ import traceback
4
+ import json
5
+ import re
6
+ import math
7
+
8
+ import httpx
9
+
10
+ import litellm
11
+
12
+ from pydantic import BaseModel, Field
13
+ from openai import AsyncOpenAI, OpenAIError
14
+ from difflib import SequenceMatcher
15
+
16
+ from agenta.sdk.utils.logging import get_module_logger
17
+
18
+ from agenta.sdk.litellm import mockllm
19
+ from agenta.sdk.types import PromptTemplate, Message
20
+ from agenta.sdk.managers.secrets import SecretsManager
21
+
22
+ from agenta.sdk.decorators.tracing import instrument
23
+
24
+ from agenta.sdk.models.shared import Data
25
+ from agenta.sdk.models.tracing import Trace
26
+ from agenta.sdk.workflows.sandbox import execute_code_safely
27
+ from agenta.sdk.workflows.errors import (
28
+ InvalidConfigurationParametersV0Error,
29
+ MissingConfigurationParameterV0Error,
30
+ InvalidConfigurationParameterV0Error,
31
+ InvalidInputsV0Error,
32
+ MissingInputV0Error,
33
+ InvalidInputV0Error,
34
+ InvalidOutputsV0Error,
35
+ MissingOutputV0Error,
36
+ InvalidSecretsV0Error,
37
+ JSONDiffV0Error,
38
+ LevenshteinDistanceV0Error,
39
+ SyntacticSimilarityV0Error,
40
+ SemanticSimilarityV0Error,
41
+ WebhookServerV0Error,
42
+ WebhookClientV0Error,
43
+ CustomCodeServerV0Error,
44
+ RegexPatternV0Error,
45
+ PromptFormattingV0Error,
46
+ PromptCompletionV0Error,
47
+ )
48
+
49
+ from agenta.sdk.litellm import mockllm
50
+ from agenta.sdk.litellm.litellm import litellm_handler
51
+
52
# Quiet litellm's own logging; module-level logging is handled by the SDK
# logger obtained below.
litellm.logging = False
litellm.set_verbose = False
# Drop provider-unsupported params instead of raising.
litellm.drop_params = True
# litellm.turn_off_message_logging = True

# Route the mock LLM layer through the real litellm module.
mockllm.litellm = litellm

# Register the SDK tracing callback with litellm.
litellm.callbacks = [litellm_handler()]

log = get_module_logger(__name__)
61
+
62
+
63
+ async def _compute_embedding(openai: Any, model: str, input: str) -> List[float]:
64
+ response = await openai.embeddings.create(model=model, input=input)
65
+ # embeddings API already returns a list of floats
66
+ return response.data[0].embedding
67
+
68
+
69
+ def _compute_similarity(embedding_1: List[float], embedding_2: List[float]) -> float:
70
+ # Cosine similarity
71
+ dot = sum(a * b for a, b in zip(embedding_1, embedding_2))
72
+ norm1 = math.sqrt(sum(a * a for a in embedding_1))
73
+ norm2 = math.sqrt(sum(b * b for b in embedding_2))
74
+ if norm1 == 0 or norm2 == 0:
75
+ return 0.0
76
+ return dot / (norm1 * norm2)
77
+
78
+
79
+ import json
80
+ import re
81
+ from typing import Any, Dict, Iterable, Tuple, Optional
82
+
83
+ try:
84
+ import jsonpath # ✅ use module API
85
+ from jsonpath import JSONPointer # pointer class is fine to use
86
+ except Exception:
87
+ jsonpath = None
88
+ JSONPointer = None
89
+
90
+ # ========= Scheme detection =========
91
+
92
+
93
def detect_scheme(expr: str) -> str:
    """Classify a placeholder expression by its leading character.

    '$' means 'json-path', '/' means 'json-pointer'; everything else is
    treated as 'dot-notation'.
    """
    first = expr[:1]
    if first == "$":
        return "json-path"
    if first == "/":
        return "json-pointer"
    return "dot-notation"
100
+
101
+
102
+ # ========= Resolvers =========
103
+
104
+
105
def resolve_dot_notation(expr: str, data: dict) -> object:
    """Resolve *expr* against *data* using dot-notation.

    A key containing literal dots wins over nested traversal: when *expr* is
    present verbatim as a key of *data*, that value is returned directly.
    Bracket syntax is rejected with KeyError.
    """
    if "[" in expr or "]" in expr:
        raise KeyError(f"Bracket syntax is not supported in dot-notation: {expr!r}")

    # Verbatim keys (e.g. "topic.story" stored as one key) take precedence,
    # letting users put dots in variable names without implying nesting.
    if expr in data:
        return data[expr]

    node = data
    tokens = [part for part in expr.split(".") if part]
    for token in tokens:
        if isinstance(node, list) and token.isdigit():
            node = node[int(token)]
            continue
        if not isinstance(node, dict):
            raise KeyError(
                f"Cannot access key {token!r} on non-dict while resolving {expr!r}"
            )
        if token not in node:
            raise KeyError(f"Missing key {token!r} while resolving {expr!r}")
        node = node[token]
    return node
128
+
129
+
130
def resolve_json_path(expr: str, data: dict) -> object:
    """Resolve a JSON Path expression against *data*.

    A single match is unwrapped to its value; zero or multiple matches are
    returned as a list. Requires the python-jsonpath package.
    """
    if jsonpath is None:
        raise ImportError("python-jsonpath is required for json-path ($...)")

    well_formed = expr == "$" or expr.startswith(("$.", "$["))
    if not well_formed:
        raise ValueError(
            f"Invalid json-path expression {expr!r}. "
            "Must start with '$', '$.' or '$[' (no implicit normalization)."
        )

    # Package-level API: findall() always returns a list.
    matches = jsonpath.findall(expr, data)
    if len(matches) == 1:
        return matches[0]
    return matches
143
+
144
+
145
def resolve_json_pointer(expr: str, data: Dict[str, Any]) -> Any:
    """Resolve a JSON Pointer expression to a single value.

    Requires the python-jsonpath package for the JSONPointer class.
    """
    if JSONPointer is None:
        raise ImportError("python-jsonpath is required for json-pointer (/...)")
    pointer = JSONPointer(expr)
    return pointer.resolve(data)
150
+
151
+
152
def resolve_any(expr: str, data: Dict[str, Any]) -> Any:
    """Route *expr* to the resolver that matches its detected scheme."""
    resolvers = {
        "json-path": resolve_json_path,
        "json-pointer": resolve_json_pointer,
        "dot-notation": resolve_dot_notation,
    }
    return resolvers[detect_scheme(expr)](expr, data)
160
+
161
+
162
+ # ========= Placeholder & coercion helpers =========
163
+
164
# Matches {{ ... }} placeholders; the inner expression is captured lazily,
# with surrounding whitespace excluded from the capture group.
_PLACEHOLDER_RE = re.compile(r"\{\{\s*(.*?)\s*\}\}")
165
+
166
+
167
def extract_placeholders(template: str) -> Iterable[str]:
    """Yield the trimmed inner expression of every {{ ... }} in *template*."""
    return (match.group(1).strip() for match in _PLACEHOLDER_RE.finditer(template))
171
+
172
+
173
def coerce_to_str(value: Any) -> str:
    """Render *value* for template interpolation.

    Containers (dict/list) become JSON with non-ASCII preserved; everything
    else goes through str().
    """
    is_container = isinstance(value, (dict, list))
    return json.dumps(value, ensure_ascii=False) if is_container else str(value)
178
+
179
+
180
def build_replacements(
    placeholders: Iterable[str], data: Dict[str, Any]
) -> Tuple[Dict[str, str], set]:
    """
    Resolve all placeholders against data.

    Args:
        placeholders: Placeholder expressions (deduplicated before resolving).
        data: Mapping the expressions are resolved against.

    Returns:
        (replacements, unresolved_placeholders) — *replacements* maps each
        resolvable expression to its stringified value; *unresolved* holds the
        expressions whose resolution raised.
    """
    replacements: Dict[str, str] = {}
    unresolved: set = set()
    for expr in set(placeholders):
        try:
            value = resolve_any(expr, data)
        except Exception:
            unresolved.add(expr)
            continue
        # BUGFIX: no backslash escaping here. The previous implementation
        # doubled backslashes "to avoid regex replacement surprises", but
        # apply_replacements() substitutes via a callback, and re.sub inserts
        # a callable's return value literally (no escape processing) — so the
        # doubling corrupted values that legitimately contain backslashes
        # (e.g. Windows paths, LaTeX snippets).
        replacements[expr] = coerce_to_str(value)
    return replacements, unresolved
197
+
198
+
199
def apply_replacements(template: str, replacements: Dict[str, str]) -> str:
    """Substitute {{ expr }} placeholders in *template*.

    Substitution goes through a callback (so replacement text is inserted
    literally, avoiding regex-injection issues); unknown expressions are left
    untouched.
    """

    def substitute(match: re.Match) -> str:
        key = match.group(1).strip()
        if key in replacements:
            return replacements[key]
        return match.group(0)

    return _PLACEHOLDER_RE.sub(substitute, template)
207
+
208
+
209
def compute_truly_unreplaced(original: set, rendered: str) -> set:
    """Return the placeholders from the original template that survived rendering."""
    remaining = set(extract_placeholders(rendered))
    return original.intersection(remaining)
213
+
214
+
215
def missing_lib_hints(unreplaced: set) -> Optional[str]:
    """Suggest installing python-jsonpath when unresolved placeholders use
    json-path ($...) or json-pointer (/...) syntax and the library is absent."""
    library_missing = jsonpath is None or JSONPointer is None
    uses_path_syntax = any(expr[:1] in ("$", "/") for expr in unreplaced)
    if uses_path_syntax and library_missing:
        return (
            "Install python-jsonpath to enable json-path ($...) and json-pointer (/...)"
        )
    return None
224
+
225
+
226
def _format_with_template(
    content: str,
    format: str,
    kwargs: Dict[str, Any],
) -> str:
    """Render *content* according to the given template format.

    Supported formats are "fstring", "jinja2" and "curly"; any other value
    returns *content* unchanged. The "curly" format raises ValueError when
    placeholders from the template remain unresolved after substitution.
    """
    if format == "fstring":
        return content.format(**kwargs)

    if format == "jinja2":
        from jinja2 import Template, TemplateError

        try:
            rendered = Template(content).render(**kwargs)
        except TemplateError:
            # Fall back to the raw template when rendering fails.
            return content
        return rendered

    if format == "curly":
        wanted = set(extract_placeholders(content))
        replacements, _ = build_replacements(wanted, kwargs)
        rendered = apply_replacements(content, replacements)

        leftover = compute_truly_unreplaced(wanted, rendered)
        if not leftover:
            return rendered

        hint = missing_lib_hints(leftover)
        suffix = f" Hint: {hint}" if hint else ""
        raise ValueError(
            f"Template variables not found or unresolved: "
            f"{', '.join(sorted(leftover))}.{suffix}"
        )

    return content
263
+
264
+
265
+ def _flatten_json(json_obj: Union[list, dict]) -> Dict[str, Any]:
266
+ """
267
+ This function takes a (nested) JSON object and flattens it into a single-level dictionary where each key represents the path to the value in the original JSON structure. This is done recursively, ensuring that the full hierarchical context is preserved in the keys.
268
+
269
+ Args:
270
+ json_obj (Union[list, dict]): The (nested) JSON object to flatten. It can be either a dictionary or a list.
271
+
272
+ Returns:
273
+ Dict[str, Any]: The flattened JSON object as a dictionary, with keys representing the paths to the values in the original structure.
274
+ """
275
+
276
+ output = {}
277
+
278
+ def flatten(obj: Union[list, dict], path: str = "") -> None:
279
+ if isinstance(obj, dict):
280
+ for key, value in obj.items():
281
+ new_key = f"{path}.{key}" if path else key
282
+ if isinstance(value, (dict, list)):
283
+ flatten(value, new_key)
284
+ else:
285
+ output[new_key] = value
286
+
287
+ elif isinstance(obj, list):
288
+ for index, value in enumerate(obj):
289
+ new_key = f"{path}.{index}" if path else str(index)
290
+ if isinstance(value, (dict, list)):
291
+ flatten(value, new_key)
292
+ else:
293
+ output[new_key] = value
294
+
295
+ flatten(json_obj)
296
+ return output
297
+
298
+
299
def _compare_jsons(
    ground_truth: Union[list, dict],
    app_output: Union[list, dict],
    settings_values: dict,
):
    """
    Compare two JSON objects field by field and return the average match score.

    Both objects are flattened with `_flatten_json`; each key found in the
    ground truth (plus, with ``predict_keys``, any extra key in the output)
    contributes 1.0 when it matches and 0.0 otherwise.

    Args:
        ground_truth (list | dict): The ground truth
        app_output (list | dict): The application output
        settings_values (dict): The advanced configuration of the evaluator.
            Recognized boolean flags (all default False):
            - ``case_insensitive_keys``: lowercase keys before comparing
            - ``compare_schema_only``: compare value types instead of values
            - ``predict_keys``: also score keys present only in the output

    Returns:
        float: the average score between both JSON objects (0.0 when there
        are no keys to compare)
    """

    def normalize_keys(d: Dict[str, Any], case_insensitive: bool) -> Dict[str, Any]:
        if not case_insensitive:
            return d
        return {k.lower(): v for k, v in d.items()}

    def score_pair(gt_value: Any, ao_value: Any, compare_schema_only: bool) -> float:
        if compare_schema_only:
            # `is` compares the type objects themselves — the idiomatic form
            # of an exact-type equality check.
            return 1.0 if type(gt_value) is type(ao_value) else 0.0
        return 1.0 if gt_value == ao_value else 0.0

    case_insensitive_keys = settings_values.get("case_insensitive_keys", False)
    compare_schema_only = settings_values.get("compare_schema_only", False)

    # BUGFIX: normalize keys *before* building the key set. Previously the
    # key set was built from the un-normalized ground truth, so with
    # case_insensitive_keys=True every mixed-case key failed to find its
    # lowercased counterpart in the normalized dicts and scored 0.
    flattened_ground_truth = normalize_keys(
        _flatten_json(ground_truth), case_insensitive_keys
    )
    flattened_app_output = normalize_keys(
        _flatten_json(app_output), case_insensitive_keys
    )

    keys = set(flattened_ground_truth)
    if settings_values.get("predict_keys", False):
        keys |= set(flattened_app_output)

    # Guard instead of catching ZeroDivisionError: exceptions are not needed
    # for an expected, cheaply-testable condition.
    if not keys:
        return 0.0

    cumulated_score = 0.0
    for key in keys:
        ground_truth_value = flattened_ground_truth.get(key)
        app_output_value = flattened_app_output.get(key)

        # Keys missing on either side (value None) score 0 for that key.
        if ground_truth_value is not None and app_output_value is not None:
            cumulated_score += score_pair(
                ground_truth_value, app_output_value, compare_schema_only
            )

    return cumulated_score / len(keys)
366
+
367
+
368
@instrument()
def echo_v0(aloha: Any):
    """Echo the received payload back to the caller."""
    payload = {"got": aloha}
    return payload
371
+
372
+
373
@instrument(annotate=True)
def auto_exact_match_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Exact match evaluator for comparing outputs against reference outputs.

    Args:
        parameters: Configuration for the evaluator; must contain
            "correct_answer_key" naming the reference field in `inputs`.
        inputs: Testcase data, which may contain reference outputs.
        outputs: Output from the workflow execution (str or dict).

    Returns:
        Evaluation result with success flag (True for match, False for mismatch)

    Raises:
        InvalidConfigurationParametersV0Error: `parameters` is not a dict.
        MissingConfigurationParameterV0Error: "correct_answer_key" is absent.
        InvalidInputsV0Error: `inputs` is not a dict.
        MissingInputV0Error: the reference key is absent from `inputs`.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    # Idiom fix: "x not in d" instead of "not x in d" (PEP 8 / E713).
    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    # --------------------------------------------------------------------------
    success = False
    if isinstance(outputs, str) and isinstance(correct_answer, str):
        success = outputs == correct_answer
    elif isinstance(outputs, dict) and isinstance(correct_answer, dict):
        # Canonicalize both sides so dict key order cannot affect the result.
        # (No parameter rebinding: compare serialized copies directly.)
        success = dumps(outputs, sort_keys=True) == dumps(correct_answer, sort_keys=True)
    # Mismatched or unsupported type combinations fall through as failure.
    # --------------------------------------------------------------------------

    return {"success": success}
416
+
417
+
418
@instrument(annotate=True)
def auto_regex_test_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Regex test evaluator for checking if output matches a regex pattern.

    Args:
        parameters: Configuration with "regex_pattern" (required, str) and the
            optional flags "case_sensitive" (default True) and
            "regex_should_match" (default True).
        outputs: Output from the workflow execution (str, or dict serialized
            to JSON before matching).

    Returns:
        Evaluation result with success flag.

    Raises:
        InvalidConfigurationParametersV0Error: `parameters` is not a dict.
        MissingConfigurationParameterV0Error: "regex_pattern" is absent.
        InvalidConfigurationParameterV0Error: "regex_pattern" is not a str.
        InvalidOutputsV0Error: `outputs` is neither str nor dict.
        RegexPatternV0Error: the pattern fails to compile.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    # Idiom fix: "x not in d" instead of "not x in d" (PEP 8 / E713).
    if "regex_pattern" not in parameters:
        raise MissingConfigurationParameterV0Error(path="regex_pattern")

    regex_pattern = parameters["regex_pattern"]

    if not isinstance(regex_pattern, str):
        raise InvalidConfigurationParameterV0Error(
            path="regex_pattern",
            expected="str",
            got=regex_pattern,
        )

    # Strict identity with True so truthy-but-non-boolean config values do not
    # silently enable the flags.
    case_sensitive = parameters.get("case_sensitive", True) is True
    regex_should_match = parameters.get("regex_should_match", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    try:
        pattern = re.compile(
            regex_pattern,
            flags=0 if case_sensitive else re.IGNORECASE,
        )
    except Exception as e:
        raise RegexPatternV0Error(pattern=regex_pattern) from e

    found = pattern.search(outputs_str) is not None

    # Success means the observed outcome matches the expectation flag.
    success = found == regex_should_match
    # --------------------------------------------------------------------------

    return {"success": success}
473
+
474
+
475
@instrument(annotate=True)
def field_match_test_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Field match test evaluator for extracting and comparing a specific field from JSON output.

    Args:
        parameters: Configuration with "json_field" to extract and
            "correct_answer_key" naming the ground-truth input.
        inputs: Testcase data holding the ground truth.
        outputs: Output from the workflow execution (JSON string or dict).

    Returns:
        Evaluation result with success flag. Malformed or non-JSON outputs
        soft-fail to {"success": False} instead of raising.

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If a required parameter is missing.
        InvalidInputsV0Error: If inputs is not a dict.
        MissingInputV0Error: If the ground-truth key is absent from inputs.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "json_field" not in parameters:
        raise MissingConfigurationParameterV0Error(path="json_field")

    json_field = str(parameters["json_field"])

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    # Deliberate soft-fail policy from here on: a malformed output is a
    # negative evaluation result, not an evaluator error.
    if not isinstance(outputs, (str, dict)):
        return {"success": False}

    outputs_dict = outputs
    if isinstance(outputs, str):
        try:
            outputs_dict = loads(outputs)
        except json.JSONDecodeError:
            return {"success": False}

    if not isinstance(outputs_dict, dict):
        return {"success": False}

    if json_field not in outputs_dict:
        return {"success": False}

    # --------------------------------------------------------------------------
    success = outputs_dict[json_field] == correct_answer
    # --------------------------------------------------------------------------

    return {"success": success}
538
+
539
+
540
@instrument(annotate=True)
async def auto_webhook_test_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Webhook test evaluator for sending output to an external service for evaluation.

    Args:
        parameters: Configuration with "webhook_url", "correct_answer_key",
            and an optional "threshold" in (0.0, 1.0].
        inputs: Testcase data with ground truth.
        outputs: Output from the workflow execution.

    Returns:
        Evaluation result with score and/or success from the webhook, or the
        webhook's dict/str payload verbatim.

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If a required parameter is missing.
        InvalidConfigurationParameterV0Error: If "threshold" is invalid.
        InvalidInputsV0Error / MissingInputV0Error: On bad testcase data.
        InvalidOutputsV0Error: On unusable workflow or webhook output.
        WebhookClientV0Error / WebhookServerV0Error: On transport/HTTP failure.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "webhook_url" not in parameters:
        raise MissingConfigurationParameterV0Error(path="webhook_url")

    webhook_url = str(parameters["webhook_url"])

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # NOTE: a configured threshold of 0.0 falls back to the 0.5 default
    # (0.0 would be rejected by the range check below anyway).
    threshold = parameters.get("threshold") or 0.5

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if not 0.0 < threshold <= 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    json_payload = {
        "inputs": inputs,
        "output": outputs_str,
        "correct_answer": correct_answer,
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                url=webhook_url,
                json=json_payload,
            )
        except Exception as e:
            raise WebhookClientV0Error(
                message=str(e),
            ) from e

        if response.status_code != 200:
            raise WebhookServerV0Error(
                code=response.status_code,
                message=response.json(),
            )

        try:
            _outputs = response.json()
        except Exception as e:
            raise WebhookClientV0Error(
                message=str(e),
            ) from e
    # --------------------------------------------------------------------------

    # BUGFIX: bool is a subclass of int, so the bool branch must be checked
    # before the numeric one — previously it was unreachable and a boolean
    # webhook response was misreported as a score.
    if isinstance(_outputs, bool):
        return {"success": _outputs}

    if isinstance(_outputs, (int, float)):
        return {"score": _outputs, "success": _outputs >= threshold}

    if isinstance(_outputs, (dict, str)):
        return _outputs

    raise InvalidOutputsV0Error(expected=["dict", "str"], got=_outputs)
643
+
644
+
645
@instrument(annotate=True)
async def auto_custom_code_run_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Custom code execution evaluator for running arbitrary code to evaluate outputs.

    Args:
        parameters: Configuration with "code" to execute, "correct_answer_key",
            and an optional "threshold" in (0.0, 1.0].
        inputs: Testcase data with ground truth.
        outputs: Output from the workflow execution.

    Returns:
        Evaluation result with score/success from the custom code, or its
        dict/str result verbatim.

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If a required parameter is missing.
        InvalidConfigurationParameterV0Error: If "threshold" is invalid.
        InvalidInputsV0Error / MissingInputV0Error: On bad testcase data.
        InvalidOutputsV0Error: On unusable workflow or code output.
        CustomCodeServerV0Error: If the sandboxed execution fails.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "code" not in parameters:
        raise MissingConfigurationParameterV0Error(path="code")

    code = str(parameters["code"])

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    threshold = parameters.get("threshold") or 0.5

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if not 0.0 < threshold <= 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    try:
        _outputs = execute_code_safely(
            app_params={},
            inputs=inputs,
            output=outputs,
            correct_answer=correct_answer,
            code=code,
        )
    except Exception as e:
        raise CustomCodeServerV0Error(
            message=str(e),
            stacktrace=traceback.format_exc(),
        ) from e
    # --------------------------------------------------------------------------

    # BUGFIX: bool is a subclass of int, so the bool branch must be checked
    # before the numeric one — previously it was unreachable and a boolean
    # result was misreported as a score.
    if isinstance(_outputs, bool):
        return {"success": _outputs}

    if isinstance(_outputs, (int, float)):
        return {"score": _outputs, "success": _outputs >= threshold}

    if isinstance(_outputs, (dict, str)):
        return _outputs

    raise InvalidOutputsV0Error(expected=["dict", "str"], got=_outputs)
730
+
731
+
732
@instrument(annotate=True)
async def auto_ai_critique_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    AI critique evaluator for using an LLM to evaluate outputs.

    Args:
        parameters: Configuration with "prompt_template" (list of role/content
            messages), optional "model", "version", "template_format",
            "response_type", "json_schema", "correct_answer_key", "threshold".
        inputs: Testcase data, optionally holding the ground truth.
        outputs: Output from the workflow execution.

    Returns:
        Evaluation result with score/success from the AI, or the model's
        dict/str response verbatim.

    Raises:
        InvalidConfigurationParametersV0Error / MissingConfigurationParameterV0Error /
        InvalidConfigurationParameterV0Error: On bad configuration.
        InvalidInputsV0Error: If inputs is truthy but not a dict.
        InvalidSecretsV0Error: If the vault does not return a list of secrets.
        PromptFormattingV0Error / PromptCompletionV0Error: On template or LLM failure.
        InvalidOutputsV0Error: If the model response cannot be interpreted.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    correct_answer_key = parameters.get("correct_answer_key")

    if "prompt_template" not in parameters:
        raise MissingConfigurationParameterV0Error(path="prompt_template")

    prompt_template = parameters.get("prompt_template")

    if not isinstance(prompt_template, list):
        raise InvalidConfigurationParameterV0Error(
            path="prompt_template",
            expected="list",
            got=prompt_template,
        )

    template_version = parameters.get("version") or "3"

    # v2 templates used Python f-string placeholders; later versions use {curly}.
    default_format = "fstring" if template_version == "2" else "curly"

    template_format = str(parameters.get("template_format") or default_format)

    model = parameters.get("model") or "gpt-3.5-turbo"

    if not isinstance(model, str):
        raise InvalidConfigurationParameterV0Error(
            path="model",
            expected="str",
            got=model,
        )

    # v4 templates default to structured (json_schema) responses.
    response_type = parameters.get("response_type") or (
        "json_schema" if template_version == "4" else "text"
    )

    if response_type not in ["text", "json_object", "json_schema"]:
        raise InvalidConfigurationParameterV0Error(
            path="response_type",
            expected=["text", "json_object", "json_schema"],
            got=response_type,
        )

    # A schema is only meaningful (and only required) for json_schema responses.
    json_schema = parameters.get("json_schema") or None

    json_schema = json_schema if response_type == "json_schema" else None

    if response_type == "json_schema" and not isinstance(json_schema, dict):
        raise InvalidConfigurationParameterV0Error(
            path="json_schema",
            expected="dict",
            got=json_schema,
        )

    response_format: dict = dict(type=response_type)

    if response_type == "json_schema":
        response_format["json_schema"] = json_schema

    correct_answer = None

    if inputs:
        if not isinstance(inputs, dict):
            raise InvalidInputsV0Error(expected="dict", got=inputs)

        if correct_answer_key:
            if correct_answer_key in inputs:
                correct_answer = inputs[correct_answer_key]

    secrets = await SecretsManager.retrieve_secrets()

    if secrets is None or not isinstance(secrets, list):
        raise InvalidSecretsV0Error(expected="list", got=secrets)

    # Collect provider API keys from the vault secrets. A later secret only
    # overrides an earlier one when it actually carries a truthy key
    # (same "key or previous" semantics as before, table-driven).
    provider_keys: Dict[str, Any] = {
        "openai": None,
        "anthropic": None,
        "openrouter": None,
        "cohere": None,
        "azure": None,
        "groq": None,
    }

    for secret in secrets:
        if secret.get("kind") != "provider_key":
            continue
        secret_data = secret.get("data", {})
        provider_kind = secret_data.get("kind")
        if provider_kind in provider_keys:
            provider_data = secret_data.get("provider", {})
            provider_keys[provider_kind] = (
                provider_data.get("key") or provider_keys[provider_kind]
            )

    threshold = parameters.get("threshold") or 0.5

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    litellm.openai_key = provider_keys["openai"]
    litellm.anthropic_key = provider_keys["anthropic"]
    litellm.openrouter_key = provider_keys["openrouter"]
    litellm.cohere_key = provider_keys["cohere"]
    litellm.azure_key = provider_keys["azure"]
    litellm.groq_key = provider_keys["groq"]

    # Template context: parameters, ground truth (under several aliases),
    # workflow output, and the raw testcase inputs (both flattened and nested).
    context: Dict[str, Any] = dict()

    if parameters:
        context.update(
            **{
                "parameters": parameters,
            }
        )

    if correct_answer:
        context.update(
            **{
                "ground_truth": correct_answer,
                "correct_answer": correct_answer,
                "reference": correct_answer,
            }
        )

    if outputs:
        context.update(
            **{
                "prediction": outputs,
                "outputs": outputs,
            }
        )

    if inputs:
        context.update(**inputs)
        context.update(
            **{
                "inputs": inputs,
            }
        )

    try:
        formatted_prompt_template = [
            {
                "role": message["role"],
                "content": _format_with_template(
                    content=message["content"],
                    format=template_format,
                    kwargs=context,
                ),
            }
            for message in prompt_template
        ]
    except Exception as e:
        raise PromptFormattingV0Error(
            message=str(e),
            stacktrace=traceback.format_exc(),
        ) from e

    try:
        response = await litellm.acompletion(
            model=model,
            messages=formatted_prompt_template,
            temperature=0.01,
            response_format=response_format,
        )

        _outputs = response.choices[0].message.content.strip()  # type: ignore

    except litellm.AuthenticationError as e:  # type: ignore
        # Strip litellm's redundant prefix so the user sees a clean message.
        e.message = e.message.replace(
            "litellm.AuthenticationError: AuthenticationError: ", ""
        )
        raise e

    except Exception as e:
        raise PromptCompletionV0Error(
            message=str(e),
            stacktrace=traceback.format_exc(),
        ) from e
    # --------------------------------------------------------------------------

    # Best-effort decode: numeric or JSON responses become typed values,
    # anything else stays a raw string. (Narrowed from a bare `except:`,
    # which also swallowed KeyboardInterrupt/SystemExit.)
    try:
        _outputs = json.loads(_outputs)
    except (json.JSONDecodeError, TypeError):
        pass

    # BUGFIX: bool is a subclass of int, so the bool branch must be checked
    # before the numeric one — previously it was unreachable and a boolean
    # response was misreported as a score.
    if isinstance(_outputs, bool):
        return {
            "success": _outputs,
        }

    if isinstance(_outputs, (int, float)):
        return {
            "score": _outputs,
            "success": _outputs >= threshold,
        }

    if isinstance(_outputs, (dict, str)):
        # BUGFIX: plain-text responses are now returned verbatim, matching the
        # sibling evaluators and the "str" listed in the error's expected types
        # (previously a non-JSON text response always raised).
        return _outputs

    raise InvalidOutputsV0Error(expected=["dict", "str", "int", "float"], got=_outputs)
968
+
969
+
970
@instrument(annotate=True)
def auto_starts_with_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Starts-with evaluator for checking if output starts with a specific prefix.

    Args:
        parameters: Configuration with "prefix" (str) and optional
            "case_sensitive" flag (defaults to True).
        outputs: Output from the workflow execution (str, or dict serialized
            to a JSON string before the check).

    Returns:
        Evaluation result with success flag.

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If "prefix" is missing.
        InvalidConfigurationParameterV0Error: If "prefix" is not a str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "prefix" not in parameters:
        raise MissingConfigurationParameterV0Error(path="prefix")

    prefix = parameters["prefix"]

    if not isinstance(prefix, str):
        raise InvalidConfigurationParameterV0Error(
            path="prefix",
            expected="str",
            got=prefix,
        )

    # Case-sensitive unless explicitly disabled (any non-True value disables).
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        prefix = prefix.lower()

    success = outputs_str.startswith(prefix)
    # --------------------------------------------------------------------------

    return {"success": success}
1017
+
1018
+
1019
@instrument(annotate=True)
def auto_ends_with_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Ends-with evaluator for checking if output ends with a specific suffix.

    Args:
        parameters: Configuration with "suffix" (str) and optional
            "case_sensitive" flag (defaults to True).
        outputs: Output from the workflow execution (str, or dict serialized
            to a JSON string before the check).

    Returns:
        Evaluation result with success flag.

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If "suffix" is missing.
        InvalidConfigurationParameterV0Error: If "suffix" is not a str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "suffix" not in parameters:
        raise MissingConfigurationParameterV0Error(path="suffix")

    suffix = parameters["suffix"]

    if not isinstance(suffix, str):
        raise InvalidConfigurationParameterV0Error(
            path="suffix",
            expected="str",
            got=suffix,
        )

    # Case-sensitive unless explicitly disabled (any non-True value disables).
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        suffix = suffix.lower()

    success = outputs_str.endswith(suffix)
    # --------------------------------------------------------------------------

    return {"success": success}
1066
+
1067
+
1068
@instrument(annotate=True)
def auto_contains_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Contains evaluator for checking if output contains a specific substring.

    Args:
        parameters: Configuration with "substring" (str) and optional
            "case_sensitive" flag (defaults to True).
        outputs: Output from the workflow execution (str, or dict serialized
            to a JSON string before the check).

    Returns:
        Evaluation result with success flag.

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If "substring" is missing.
        InvalidConfigurationParameterV0Error: If "substring" is not a str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "substring" not in parameters:
        raise MissingConfigurationParameterV0Error(path="substring")

    substring = parameters["substring"]

    if not isinstance(substring, str):
        raise InvalidConfigurationParameterV0Error(
            path="substring",
            expected="str",
            got=substring,
        )

    # Case-sensitive unless explicitly disabled (any non-True value disables).
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        substring = substring.lower()

    success = substring in outputs_str
    # --------------------------------------------------------------------------

    return {"success": success}
1115
+
1116
+
1117
@instrument(annotate=True)
def auto_contains_any_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Contains-any evaluator for checking if output contains any of the specified substrings.

    Args:
        parameters: Configuration with "substrings" (list[str], each stripped
            of surrounding whitespace) and optional "case_sensitive" flag.
        outputs: Output from the workflow execution (str, or dict serialized
            to a JSON string before the check).

    Returns:
        Evaluation result with success flag.

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If "substrings" is missing.
        InvalidConfigurationParameterV0Error: If "substrings" is not a list of str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "substrings" not in parameters:
        raise MissingConfigurationParameterV0Error(path="substrings")

    substrings = parameters["substrings"]

    if not isinstance(substrings, list):
        raise InvalidConfigurationParameterV0Error(
            path="substrings",
            expected="list",
            got=substrings,
        )

    if not all(isinstance(s, str) for s in substrings):
        raise InvalidConfigurationParameterV0Error(
            path="substrings",
            expected="list[str]",
            got=substrings,
        )

    # BUGFIX: strip only after validating element types — previously a non-str
    # element raised AttributeError from s.strip() instead of the typed
    # configuration error above.
    substrings = [s.strip() for s in substrings]

    # Case-sensitive unless explicitly disabled (any non-True value disables).
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        substrings = [s.lower() for s in substrings]

    success = any(substring in outputs_str for substring in substrings)
    # --------------------------------------------------------------------------

    return {"success": success}
1173
+
1174
+
1175
@instrument(annotate=True)
def auto_contains_all_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Contains-all evaluator for checking if output contains all of the specified substrings.

    Args:
        parameters: Configuration with "substrings" (list[str], each stripped
            of surrounding whitespace) and optional "case_sensitive" flag.
        outputs: Output from the workflow execution (str, or dict serialized
            to a JSON string before the check).

    Returns:
        Evaluation result with success flag.

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If "substrings" is missing.
        InvalidConfigurationParameterV0Error: If "substrings" is not a list of str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "substrings" not in parameters:
        raise MissingConfigurationParameterV0Error(path="substrings")

    substrings = parameters["substrings"]

    if not isinstance(substrings, list):
        raise InvalidConfigurationParameterV0Error(
            path="substrings",
            expected="list",
            got=substrings,
        )

    if not all(isinstance(s, str) for s in substrings):
        raise InvalidConfigurationParameterV0Error(
            path="substrings",
            expected="list[str]",
            got=substrings,
        )

    # BUGFIX: strip only after validating element types — previously a non-str
    # element raised AttributeError from s.strip() instead of the typed
    # configuration error above.
    substrings = [s.strip() for s in substrings]

    # Case-sensitive unless explicitly disabled (any non-True value disables).
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        substrings = [s.lower() for s in substrings]

    success = all(substring in outputs_str for substring in substrings)
    # --------------------------------------------------------------------------

    return {"success": success}
1231
+
1232
+
1233
@instrument(annotate=True)
def auto_contains_json_v0(
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Contains-JSON evaluator: succeeds when the output embeds a parseable JSON object.

    The candidate object is the span from the first "{" to the last "}" of the
    (serialized) output; success requires that span to exist and to parse.

    Args:
        outputs: Output from the workflow execution (str, or dict serialized
            to a JSON string before the check).

    Returns:
        Evaluation result with success flag.

    Raises:
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    text = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    # Any failure — no braces found, or the candidate span is not valid JSON —
    # counts as "no JSON present" rather than an evaluator error.
    try:
        candidate = text[text.index("{") : text.rindex("}") + 1]
        json.loads(candidate)
        found_json = True
    except Exception:  # pylint: disable=broad-exception-caught
        found_json = False
    # --------------------------------------------------------------------------

    return {"success": found_json}
1272
+
1273
+
1274
@instrument(annotate=True)
def auto_json_diff_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    JSON diff evaluator for scoring similarity between JSON structures.

    Args:
        parameters: Configuration with "correct_answer_key" and an optional
            "threshold" in (0.0, 1.0]; also passed through to the comparator
            as its settings.
        inputs: Testcase data with the reference JSON (dict or JSON string).
        outputs: Output from the workflow execution (dict or JSON string).

    Returns:
        Evaluation result with score and success flag (no diff explanation).

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If "correct_answer_key" is missing.
        InvalidConfigurationParameterV0Error: If "threshold" is invalid.
        InvalidInputsV0Error / MissingInputV0Error / InvalidInputV0Error: On
            bad testcase data.
        InvalidOutputsV0Error: If outputs is unusable or not valid JSON.
        JSONDiffV0Error: If the comparison fails or yields a non-numeric score.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    if not isinstance(correct_answer, (str, dict)):
        raise InvalidInputV0Error(
            path=correct_answer_key, expected=["dict", "str"], got=correct_answer
        )

    correct_answer_dict = (
        correct_answer if isinstance(correct_answer, dict) else loads(correct_answer)
    )

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_dict = outputs
    if isinstance(outputs, str):
        try:
            outputs_dict = loads(outputs)
        except json.JSONDecodeError as e:
            raise InvalidOutputsV0Error(expected="dict", got=outputs) from e

    threshold = parameters.get("threshold") or 0.5

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if not 0.0 < threshold <= 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    try:
        _outputs = _compare_jsons(
            ground_truth=correct_answer_dict,
            app_output=outputs_dict,  # type: ignore
            settings_values=parameters,
        )

    except Exception as e:
        raise JSONDiffV0Error(message=str(e), stacktrace=traceback.format_exc()) from e
    # --------------------------------------------------------------------------

    if isinstance(_outputs, (int, float)):
        return {"score": _outputs, "success": _outputs >= threshold}

    raise JSONDiffV0Error(
        message=f"json-diff error: got ({type(_outputs)}) {_outputs}, expected (int, float)."
    )
1362
+
1363
+
1364
@instrument(annotate=True)
def auto_levenshtein_distance_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Levenshtein distance evaluator using a pure Python implementation.
    Measures edit distance and returns a normalized similarity score.

    Args:
        parameters: Configuration with "correct_answer_key", optional
            "case_sensitive" flag, and optional "threshold" in (0.0, 1.0].
        inputs: Testcase data with the reference string (dict values are
            serialized to JSON before comparison).
        outputs: Output from the workflow execution.

    Returns:
        Evaluation result with normalized similarity score in [0, 1] and a
        success flag (score >= threshold).

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If "correct_answer_key" is missing.
        InvalidConfigurationParameterV0Error: If "threshold" is invalid.
        InvalidInputsV0Error / MissingInputV0Error / InvalidInputV0Error: On
            bad testcase data.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
        LevenshteinDistanceV0Error: If the computation fails unexpectedly.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    # Case-sensitive unless explicitly disabled (any non-True value disables).
    case_sensitive = parameters.get("case_sensitive", True) is True

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    if not isinstance(correct_answer, (str, dict)):
        raise InvalidInputV0Error(
            path=correct_answer_key, expected=["dict", "str"], got=correct_answer
        )

    correct_answer_str = (
        correct_answer if isinstance(correct_answer, str) else dumps(correct_answer)
    )

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    threshold = parameters.get("threshold") or 0.5

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if not 0.0 < threshold <= 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        correct_answer_str = correct_answer_str.lower()

    try:
        # Compute Levenshtein distance with the classic two-row DP:
        # previous_row[j] holds the edit distance between the first i chars of
        # outputs_str and the first j chars of correct_answer_str.
        if len(correct_answer_str) == 0:
            distance = len(outputs_str)
        else:
            previous_row = list(range(len(correct_answer_str) + 1))
            for i, c1 in enumerate(outputs_str):
                current_row = [i + 1]
                for j, c2 in enumerate(correct_answer_str):
                    insert = previous_row[j + 1] + 1
                    delete = current_row[j] + 1
                    substitute = previous_row[j] + (c1 != c2)
                    current_row.append(min(insert, delete, substitute))
                previous_row = current_row
            distance = previous_row[-1]

        # Normalize the distance into a similarity score in [0, 1];
        # two empty strings are a perfect match.
        max_length = max(len(outputs_str), len(correct_answer_str))
        _outputs = 1.0 if max_length == 0 else 1.0 - (distance / max_length)
    except Exception as e:
        raise LevenshteinDistanceV0Error(
            message=str(e), stacktrace=traceback.format_exc()
        ) from e
    # --------------------------------------------------------------------------

    if isinstance(_outputs, (int, float)):
        return {"score": _outputs, "success": _outputs >= threshold}

    raise LevenshteinDistanceV0Error(
        message=f"levenshtein-distance error: got ({type(_outputs)}) {_outputs}, expected (int, float)."
    )
1469
+
1470
+
1471
@instrument(annotate=True)
def auto_similarity_match_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Similarity match evaluator measuring string similarity between output and reference.

    Uses difflib.SequenceMatcher.ratio() to compute a similarity score in [0, 1].

    Args:
        parameters: Evaluator configuration. Must contain "correct_answer_key";
            optional "case_sensitive" (default True) and "threshold" /
            "similarity_threshold" (float in (0.0, 1.0], default 0.5).
        inputs: Testcase data containing the reference value under
            parameters["correct_answer_key"] (str or dict).
        outputs: Output from the workflow execution (str or dict).

    Returns:
        Dict with "score" (similarity ratio) and "success" (score >= threshold).

    Raises:
        InvalidConfigurationParametersV0Error: parameters is not a dict, or
            threshold is malformed (via InvalidConfigurationParameterV0Error).
        MissingConfigurationParameterV0Error: "correct_answer_key" missing.
        InvalidInputsV0Error / MissingInputV0Error / InvalidInputV0Error:
            inputs malformed or reference missing / of wrong type.
        InvalidOutputsV0Error: outputs is neither str nor dict.
        SyntacticSimilarityV0Error: similarity computation failed.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    # Only an explicit boolean True enables case sensitivity.
    case_sensitive = parameters.get("case_sensitive", True) is True

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    if not isinstance(correct_answer, (str, dict)):
        raise InvalidInputV0Error(
            path=correct_answer_key, expected=["dict", "str"], got=correct_answer
        )

    # Dict references are compared via their JSON serialization.
    correct_answer_str = (
        correct_answer if isinstance(correct_answer, str) else dumps(correct_answer)
    )

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # "similarity_threshold" is accepted as an alias for "threshold".
    threshold = (
        parameters.get("threshold") or parameters.get("similarity_threshold") or 0.5
    )

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if not 0.0 < threshold <= 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        correct_answer_str = correct_answer_str.lower()

    try:
        matcher = SequenceMatcher(None, outputs_str, correct_answer_str)

        _outputs = matcher.ratio()
    except Exception as e:
        raise SyntacticSimilarityV0Error(
            message=str(e), stacktrace=traceback.format_exc()
        ) from e
    # --------------------------------------------------------------------------

    if isinstance(_outputs, (int, float)):
        return {"score": _outputs, "success": _outputs >= threshold}

    raise SyntacticSimilarityV0Error(
        message=f"syntactic-similarity-match error: got ({type(_outputs)}) {_outputs}, expected (int, float)."
    )
1561
+
1562
+
1563
@instrument(annotate=True)
async def auto_semantic_similarity_v0(
    *,
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Semantic similarity evaluator comparing output and reference using embeddings.

    Embeds both strings with an OpenAI embedding model and scores their
    cosine similarity.

    Args:
        parameters: Evaluator configuration. Must contain "correct_answer_key";
            optional "embedding_model" (default "text-embedding-3-small") and
            "threshold" (float in (0.0, 1.0], default 0.5).
        inputs: Testcase data containing the reference value under
            parameters["correct_answer_key"] (str or dict).
        outputs: Output from the workflow execution (str or dict).

    Returns:
        Dict with "score" (cosine similarity) and "success" (score >= threshold).

    Raises:
        InvalidConfigurationParametersV0Error / MissingConfigurationParameterV0Error /
        InvalidConfigurationParameterV0Error: malformed configuration.
        InvalidInputsV0Error / MissingInputV0Error / InvalidInputV0Error:
            malformed inputs or missing/mis-typed reference.
        InvalidOutputsV0Error: outputs is neither str nor dict.
        InvalidSecretsV0Error: secrets could not be retrieved as a list.
        OpenAIError: the OpenAI client could not be constructed.
        SemanticSimilarityV0Error: the similarity score is not numeric.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    embedding_model = parameters.get("embedding_model", "text-embedding-3-small")

    if not isinstance(embedding_model, str):
        raise InvalidConfigurationParametersV0Error(expected="str", got=embedding_model)

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    if not isinstance(correct_answer, (str, dict)):
        raise InvalidInputV0Error(
            path=correct_answer_key, expected=["dict", "str"], got=correct_answer
        )

    # Dict references are compared via their JSON serialization.
    correct_answer_str = (
        correct_answer if isinstance(correct_answer, str) else dumps(correct_answer)
    )

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    secrets = await SecretsManager.retrieve_secrets()

    if secrets is None or not isinstance(secrets, list):
        raise InvalidSecretsV0Error(expected="list", got=secrets)

    # Scan provider-key secrets for an OpenAI API key; the last non-empty
    # key found wins.
    openai_api_key = None

    for secret in secrets:
        if secret.get("kind") == "provider_key":
            secret_data = secret.get("data", {})
            if secret_data.get("kind") == "openai":
                provider_data = secret_data.get("provider", {})
                openai_api_key = provider_data.get("key") or openai_api_key

    threshold = parameters.get("threshold") or 0.5

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if not 0.0 < threshold <= 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    try:
        openai = AsyncOpenAI(api_key=openai_api_key)
    except OpenAIError as e:
        # Chain the original exception so the root cause is preserved.
        raise OpenAIError("OpenAIException - " + e.args[0]) from e

    output_embedding = await _compute_embedding(
        openai,
        embedding_model,
        outputs_str,
    )

    reference_embedding = await _compute_embedding(
        openai,
        embedding_model,
        correct_answer_str,
    )

    _outputs = float(
        _compute_similarity(
            output_embedding,
            reference_embedding,
        )
    )
    # --------------------------------------------------------------------------

    if isinstance(_outputs, (int, float)):
        return {"score": _outputs, "success": _outputs >= threshold}

    raise SemanticSimilarityV0Error(
        message=f"semantic-similarity error: got ({type(_outputs)}) {_outputs}, expected (int, float)."
    )
1680
+
1681
+
1682
class SinglePromptConfig(BaseModel):
    """Configuration model holding the single prompt template for an LLM workflow."""

    # Default is a placeholder geography Q&A prompt; callers normally override
    # it by passing a "prompt" entry in the workflow parameters.
    prompt: PromptTemplate = Field(
        default=PromptTemplate(
            system_prompt="You are an expert in geography",
            user_prompt="What is the capital of {{country}}?",
        )
    )
1689
+
1690
+
1691
@instrument()
async def completion_v0(
    parameters: Data,
    inputs: Dict[str, str],
) -> Any:
    """
    Run a single-prompt LLM completion workflow.

    Args:
        parameters: Configuration dict; must contain a "prompt" entry that
            parses into a SinglePromptConfig.
        inputs: Template variables for the prompt. If the prompt declares
            input_keys, the provided keys must match them exactly.

    Returns:
        The first choice's content, refusal, parsed payload, or list of tool
        call dicts — whichever is present first; implicitly None otherwise.

    Raises:
        InvalidConfigurationParametersV0Error: parameters is not a dict.
        MissingConfigurationParameterV0Error: "prompt" entry is absent.
        InvalidInputsV0Error: provided input keys do not match the prompt's
            declared input_keys.
        InvalidSecretsV0Error: no provider settings resolved for the model.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "prompt" not in parameters:
        raise MissingConfigurationParameterV0Error(path="prompt")

    params: Dict[str, Any] = {**(parameters or {})}

    config = SinglePromptConfig(**params)

    # When the prompt declares its input keys, require an exact match so extra
    # or missing template variables fail fast instead of silently.
    if config.prompt.input_keys is not None:
        required_keys = set(config.prompt.input_keys)
        provided_keys = set(inputs.keys())

        if required_keys != provided_keys:
            raise InvalidInputsV0Error(
                expected=sorted(required_keys),
                got=sorted(provided_keys),
            )

    await SecretsManager.ensure_secrets_in_workflow()

    provider_settings = SecretsManager.get_provider_settings_from_workflow(
        config.prompt.llm_config.model
    )

    if not provider_settings:
        raise InvalidSecretsV0Error(expected="dict", got=provider_settings)

    with mockllm.user_aws_credentials_from(provider_settings):
        # Drop "model" from the prompt kwargs; provider_settings supplies it.
        response = await mockllm.acompletion(
            **{
                k: v
                for k, v in config.prompt.format(**inputs).to_openai_kwargs().items()
                if k != "model"
            },
            **provider_settings,
        )

    message = response.choices[0].message  # type: ignore

    if message.content is not None:
        return message.content
    if hasattr(message, "refusal") and message.refusal is not None:  # type: ignore
        return message.refusal  # type: ignore
    if hasattr(message, "parsed") and message.parsed is not None:  # type: ignore
        return message.parsed  # type: ignore
    if hasattr(message, "tool_calls") and message.tool_calls is not None:
        return [tool_call.dict() for tool_call in message.tool_calls]
1744
+
1745
+
1746
@instrument()
async def chat_v0(
    parameters: Data,
    inputs: Optional[Dict[str, str]] = None,
    messages: Optional[List[Message]] = None,
):
    """
    Run a chat-style LLM workflow: format the configured prompt with the given
    inputs, append any extra messages, and return the first response message.
    """
    merged_params: Dict[str, Any] = {**(parameters or {})}
    config = SinglePromptConfig(**merged_params)

    # Enforce an exact match between declared and provided input keys.
    declared = config.prompt.input_keys
    if declared is not None:
        expected_keys = set(declared)
        actual_keys = set() if inputs is None else set(inputs.keys())
        if expected_keys != actual_keys:
            raise InvalidInputsV0Error(
                expected=sorted(expected_keys),
                got=sorted(actual_keys),
            )

    prompt = config.prompt if inputs is None else config.prompt.format(**inputs)
    openai_kwargs = prompt.to_openai_kwargs()

    if messages is not None:
        openai_kwargs["messages"].extend(messages)

    await SecretsManager.ensure_secrets_in_workflow()

    provider_settings = SecretsManager.get_provider_settings_from_workflow(
        config.prompt.llm_config.model
    )
    if not provider_settings:
        raise InvalidSecretsV0Error(expected="dict", got=provider_settings)

    # The "model" key from the prompt config is dropped; provider_settings
    # determines which model is actually used.
    request_kwargs = {k: v for k, v in openai_kwargs.items() if k != "model"}

    with mockllm.user_aws_credentials_from(provider_settings):
        response = await mockllm.acompletion(
            **request_kwargs,
            **provider_settings,
        )

    return response.choices[0].message.model_dump(exclude_none=True)  # type: ignore