agenta 0.52.6__py3-none-any.whl → 0.63.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenta/__init__.py +12 -3
- agenta/client/__init__.py +4 -4
- agenta/client/backend/__init__.py +4 -4
- agenta/client/backend/api_keys/client.py +2 -2
- agenta/client/backend/billing/client.py +2 -2
- agenta/client/backend/billing/raw_client.py +2 -2
- agenta/client/backend/client.py +56 -48
- agenta/client/backend/core/client_wrapper.py +2 -2
- agenta/client/backend/core/file.py +3 -1
- agenta/client/backend/core/http_client.py +3 -3
- agenta/client/backend/core/pydantic_utilities.py +13 -3
- agenta/client/backend/human_evaluations/client.py +2 -2
- agenta/client/backend/human_evaluations/raw_client.py +2 -2
- agenta/client/backend/organization/client.py +46 -34
- agenta/client/backend/organization/raw_client.py +32 -26
- agenta/client/backend/raw_client.py +26 -26
- agenta/client/backend/testsets/client.py +18 -18
- agenta/client/backend/testsets/raw_client.py +30 -30
- agenta/client/backend/types/__init__.py +4 -4
- agenta/client/backend/types/account_request.py +3 -1
- agenta/client/backend/types/account_response.py +3 -1
- agenta/client/backend/types/agenta_node_dto.py +3 -1
- agenta/client/backend/types/agenta_nodes_response.py +3 -1
- agenta/client/backend/types/agenta_root_dto.py +3 -1
- agenta/client/backend/types/agenta_roots_response.py +3 -1
- agenta/client/backend/types/agenta_tree_dto.py +3 -1
- agenta/client/backend/types/agenta_trees_response.py +3 -1
- agenta/client/backend/types/aggregated_result.py +3 -1
- agenta/client/backend/types/analytics_response.py +3 -1
- agenta/client/backend/types/annotation.py +6 -4
- agenta/client/backend/types/annotation_create.py +3 -1
- agenta/client/backend/types/annotation_edit.py +3 -1
- agenta/client/backend/types/annotation_link.py +3 -1
- agenta/client/backend/types/annotation_link_response.py +3 -1
- agenta/client/backend/types/annotation_query.py +3 -1
- agenta/client/backend/types/annotation_query_request.py +3 -1
- agenta/client/backend/types/annotation_reference.py +3 -1
- agenta/client/backend/types/annotation_references.py +3 -1
- agenta/client/backend/types/annotation_response.py +3 -1
- agenta/client/backend/types/annotations_response.py +3 -1
- agenta/client/backend/types/app.py +3 -1
- agenta/client/backend/types/app_variant_response.py +3 -1
- agenta/client/backend/types/app_variant_revision.py +3 -1
- agenta/client/backend/types/artifact.py +6 -4
- agenta/client/backend/types/base_output.py +3 -1
- agenta/client/backend/types/body_fetch_workflow_revision.py +3 -1
- agenta/client/backend/types/body_import_testset.py +3 -1
- agenta/client/backend/types/bucket_dto.py +3 -1
- agenta/client/backend/types/collect_status_response.py +3 -1
- agenta/client/backend/types/config_db.py +3 -1
- agenta/client/backend/types/config_dto.py +3 -1
- agenta/client/backend/types/config_response_model.py +3 -1
- agenta/client/backend/types/correct_answer.py +3 -1
- agenta/client/backend/types/create_app_output.py +3 -1
- agenta/client/backend/types/custom_model_settings_dto.py +3 -1
- agenta/client/backend/types/custom_provider_dto.py +3 -1
- agenta/client/backend/types/custom_provider_kind.py +1 -1
- agenta/client/backend/types/custom_provider_settings_dto.py +3 -1
- agenta/client/backend/types/delete_evaluation.py +3 -1
- agenta/client/backend/types/environment_output.py +3 -1
- agenta/client/backend/types/environment_output_extended.py +3 -1
- agenta/client/backend/types/environment_revision.py +3 -1
- agenta/client/backend/types/error.py +3 -1
- agenta/client/backend/types/evaluation.py +3 -1
- agenta/client/backend/types/evaluation_scenario.py +3 -1
- agenta/client/backend/types/evaluation_scenario_input.py +3 -1
- agenta/client/backend/types/evaluation_scenario_output.py +3 -1
- agenta/client/backend/types/evaluation_scenario_result.py +3 -1
- agenta/client/backend/types/evaluator.py +6 -4
- agenta/client/backend/types/evaluator_config.py +6 -4
- agenta/client/backend/types/evaluator_flags.py +3 -1
- agenta/client/backend/types/evaluator_mapping_output_interface.py +3 -1
- agenta/client/backend/types/evaluator_output_interface.py +3 -1
- agenta/client/backend/types/evaluator_query.py +3 -1
- agenta/client/backend/types/evaluator_query_request.py +3 -1
- agenta/client/backend/types/evaluator_request.py +3 -1
- agenta/client/backend/types/evaluator_response.py +3 -1
- agenta/client/backend/types/evaluators_response.py +3 -1
- agenta/client/backend/types/exception_dto.py +3 -1
- agenta/client/backend/types/extended_o_tel_tracing_response.py +3 -1
- agenta/client/backend/types/get_config_response.py +3 -1
- agenta/client/backend/types/header.py +3 -1
- agenta/client/backend/types/http_validation_error.py +3 -1
- agenta/client/backend/types/human_evaluation.py +3 -1
- agenta/client/backend/types/human_evaluation_scenario.py +3 -1
- agenta/client/backend/types/human_evaluation_scenario_input.py +3 -1
- agenta/client/backend/types/human_evaluation_scenario_output.py +3 -1
- agenta/client/backend/types/invite_request.py +3 -1
- agenta/client/backend/types/legacy_analytics_response.py +3 -1
- agenta/client/backend/types/legacy_data_point.py +3 -1
- agenta/client/backend/types/legacy_evaluator.py +3 -1
- agenta/client/backend/types/legacy_scope_request.py +3 -1
- agenta/client/backend/types/legacy_scopes_response.py +3 -1
- agenta/client/backend/types/legacy_subscription_request.py +3 -1
- agenta/client/backend/types/legacy_user_request.py +3 -1
- agenta/client/backend/types/legacy_user_response.py +3 -1
- agenta/client/backend/types/lifecycle_dto.py +3 -1
- agenta/client/backend/types/link_dto.py +3 -1
- agenta/client/backend/types/list_api_keys_response.py +3 -1
- agenta/client/backend/types/llm_run_rate_limit.py +3 -1
- agenta/client/backend/types/meta_request.py +3 -1
- agenta/client/backend/types/metrics_dto.py +3 -1
- agenta/client/backend/types/new_testset.py +3 -1
- agenta/client/backend/types/node_dto.py +3 -1
- agenta/client/backend/types/o_tel_context_dto.py +3 -1
- agenta/client/backend/types/o_tel_event.py +6 -4
- agenta/client/backend/types/o_tel_event_dto.py +3 -1
- agenta/client/backend/types/o_tel_extra_dto.py +3 -1
- agenta/client/backend/types/o_tel_flat_span.py +6 -4
- agenta/client/backend/types/o_tel_link.py +6 -4
- agenta/client/backend/types/o_tel_link_dto.py +3 -1
- agenta/client/backend/types/o_tel_links_response.py +3 -1
- agenta/client/backend/types/o_tel_span.py +1 -1
- agenta/client/backend/types/o_tel_span_dto.py +3 -1
- agenta/client/backend/types/o_tel_spans_tree.py +3 -1
- agenta/client/backend/types/o_tel_tracing_data_response.py +3 -1
- agenta/client/backend/types/o_tel_tracing_request.py +3 -1
- agenta/client/backend/types/o_tel_tracing_response.py +3 -1
- agenta/client/backend/types/organization.py +3 -1
- agenta/client/backend/types/organization_details.py +3 -1
- agenta/client/backend/types/organization_membership_request.py +3 -1
- agenta/client/backend/types/organization_output.py +3 -1
- agenta/client/backend/types/organization_request.py +3 -1
- agenta/client/backend/types/parent_dto.py +3 -1
- agenta/client/backend/types/project_membership_request.py +3 -1
- agenta/client/backend/types/project_request.py +3 -1
- agenta/client/backend/types/project_scope.py +3 -1
- agenta/client/backend/types/projects_response.py +3 -1
- agenta/client/backend/types/reference.py +6 -4
- agenta/client/backend/types/reference_dto.py +3 -1
- agenta/client/backend/types/reference_request_model.py +3 -1
- agenta/client/backend/types/result.py +3 -1
- agenta/client/backend/types/root_dto.py +3 -1
- agenta/client/backend/types/scopes_response_model.py +3 -1
- agenta/client/backend/types/secret_dto.py +3 -1
- agenta/client/backend/types/secret_response_dto.py +3 -1
- agenta/client/backend/types/simple_evaluation_output.py +3 -1
- agenta/client/backend/types/span_dto.py +6 -4
- agenta/client/backend/types/standard_provider_dto.py +3 -1
- agenta/client/backend/types/standard_provider_settings_dto.py +3 -1
- agenta/client/backend/types/status_dto.py +3 -1
- agenta/client/backend/types/tags_request.py +3 -1
- agenta/client/backend/types/testcase_response.py +6 -4
- agenta/client/backend/types/testset.py +6 -4
- agenta/client/backend/types/{test_set_output_response.py → testset_output_response.py} +4 -2
- agenta/client/backend/types/testset_request.py +3 -1
- agenta/client/backend/types/testset_response.py +3 -1
- agenta/client/backend/types/{test_set_simple_response.py → testset_simple_response.py} +4 -2
- agenta/client/backend/types/testsets_response.py +3 -1
- agenta/client/backend/types/time_dto.py +3 -1
- agenta/client/backend/types/tree_dto.py +3 -1
- agenta/client/backend/types/update_app_output.py +3 -1
- agenta/client/backend/types/user_request.py +3 -1
- agenta/client/backend/types/validation_error.py +3 -1
- agenta/client/backend/types/workflow_artifact.py +6 -4
- agenta/client/backend/types/workflow_data.py +3 -1
- agenta/client/backend/types/workflow_flags.py +3 -1
- agenta/client/backend/types/workflow_request.py +3 -1
- agenta/client/backend/types/workflow_response.py +3 -1
- agenta/client/backend/types/workflow_revision.py +6 -4
- agenta/client/backend/types/workflow_revision_request.py +3 -1
- agenta/client/backend/types/workflow_revision_response.py +3 -1
- agenta/client/backend/types/workflow_revisions_response.py +3 -1
- agenta/client/backend/types/workflow_variant.py +6 -4
- agenta/client/backend/types/workflow_variant_request.py +3 -1
- agenta/client/backend/types/workflow_variant_response.py +3 -1
- agenta/client/backend/types/workflow_variants_response.py +3 -1
- agenta/client/backend/types/workflows_response.py +3 -1
- agenta/client/backend/types/workspace.py +3 -1
- agenta/client/backend/types/workspace_member_response.py +3 -1
- agenta/client/backend/types/workspace_membership_request.py +3 -1
- agenta/client/backend/types/workspace_permission.py +3 -1
- agenta/client/backend/types/workspace_request.py +3 -1
- agenta/client/backend/types/workspace_response.py +3 -1
- agenta/client/backend/vault/raw_client.py +4 -4
- agenta/client/backend/workspace/client.py +2 -2
- agenta/client/client.py +102 -88
- agenta/sdk/__init__.py +52 -3
- agenta/sdk/agenta_init.py +43 -16
- agenta/sdk/assets.py +23 -15
- agenta/sdk/context/serving.py +20 -8
- agenta/sdk/context/tracing.py +40 -22
- agenta/sdk/contexts/__init__.py +0 -0
- agenta/sdk/contexts/routing.py +38 -0
- agenta/sdk/contexts/running.py +57 -0
- agenta/sdk/contexts/tracing.py +86 -0
- agenta/sdk/decorators/__init__.py +1 -0
- agenta/sdk/decorators/routing.py +284 -0
- agenta/sdk/decorators/running.py +692 -98
- agenta/sdk/decorators/serving.py +20 -21
- agenta/sdk/decorators/tracing.py +176 -131
- agenta/sdk/engines/__init__.py +0 -0
- agenta/sdk/engines/running/__init__.py +0 -0
- agenta/sdk/engines/running/utils.py +17 -0
- agenta/sdk/engines/tracing/__init__.py +1 -0
- agenta/sdk/engines/tracing/attributes.py +185 -0
- agenta/sdk/engines/tracing/conventions.py +49 -0
- agenta/sdk/engines/tracing/exporters.py +130 -0
- agenta/sdk/engines/tracing/inline.py +1154 -0
- agenta/sdk/engines/tracing/processors.py +190 -0
- agenta/sdk/engines/tracing/propagation.py +102 -0
- agenta/sdk/engines/tracing/spans.py +136 -0
- agenta/sdk/engines/tracing/tracing.py +324 -0
- agenta/sdk/evaluations/__init__.py +2 -0
- agenta/sdk/evaluations/metrics.py +37 -0
- agenta/sdk/evaluations/preview/__init__.py +0 -0
- agenta/sdk/evaluations/preview/evaluate.py +765 -0
- agenta/sdk/evaluations/preview/utils.py +861 -0
- agenta/sdk/evaluations/results.py +66 -0
- agenta/sdk/evaluations/runs.py +153 -0
- agenta/sdk/evaluations/scenarios.py +48 -0
- agenta/sdk/litellm/litellm.py +12 -0
- agenta/sdk/litellm/mockllm.py +6 -8
- agenta/sdk/litellm/mocks/__init__.py +5 -5
- agenta/sdk/managers/applications.py +304 -0
- agenta/sdk/managers/config.py +2 -2
- agenta/sdk/managers/evaluations.py +0 -0
- agenta/sdk/managers/evaluators.py +303 -0
- agenta/sdk/managers/secrets.py +161 -24
- agenta/sdk/managers/shared.py +3 -1
- agenta/sdk/managers/testsets.py +441 -0
- agenta/sdk/managers/vault.py +3 -3
- agenta/sdk/middleware/auth.py +0 -176
- agenta/sdk/middleware/config.py +27 -9
- agenta/sdk/middleware/vault.py +204 -9
- agenta/sdk/middlewares/__init__.py +0 -0
- agenta/sdk/middlewares/routing/__init__.py +0 -0
- agenta/sdk/middlewares/routing/auth.py +263 -0
- agenta/sdk/middlewares/routing/cors.py +30 -0
- agenta/sdk/middlewares/routing/otel.py +29 -0
- agenta/sdk/middlewares/running/__init__.py +0 -0
- agenta/sdk/middlewares/running/normalizer.py +321 -0
- agenta/sdk/middlewares/running/resolver.py +161 -0
- agenta/sdk/middlewares/running/vault.py +140 -0
- agenta/sdk/models/__init__.py +0 -0
- agenta/sdk/models/blobs.py +33 -0
- agenta/sdk/models/evaluations.py +119 -0
- agenta/sdk/models/git.py +126 -0
- agenta/sdk/models/shared.py +167 -0
- agenta/sdk/models/testsets.py +163 -0
- agenta/sdk/models/tracing.py +202 -0
- agenta/sdk/models/workflows.py +753 -0
- agenta/sdk/tracing/attributes.py +4 -4
- agenta/sdk/tracing/exporters.py +67 -17
- agenta/sdk/tracing/inline.py +37 -45
- agenta/sdk/tracing/processors.py +97 -0
- agenta/sdk/tracing/propagation.py +3 -1
- agenta/sdk/tracing/spans.py +4 -0
- agenta/sdk/tracing/tracing.py +13 -15
- agenta/sdk/types.py +222 -22
- agenta/sdk/utils/cache.py +1 -1
- agenta/sdk/utils/client.py +38 -0
- agenta/sdk/utils/helpers.py +13 -12
- agenta/sdk/utils/logging.py +18 -78
- agenta/sdk/utils/references.py +23 -0
- agenta/sdk/workflows/builtin.py +600 -0
- agenta/sdk/workflows/configurations.py +22 -0
- agenta/sdk/workflows/errors.py +292 -0
- agenta/sdk/workflows/handlers.py +1791 -0
- agenta/sdk/workflows/interfaces.py +948 -0
- agenta/sdk/workflows/sandbox.py +118 -0
- agenta/sdk/workflows/utils.py +303 -6
- {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/METADATA +37 -33
- agenta-0.63.2.dist-info/RECORD +421 -0
- {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/WHEEL +1 -1
- agenta/sdk/middleware/adapt.py +0 -253
- agenta/sdk/middleware/base.py +0 -40
- agenta/sdk/middleware/flags.py +0 -40
- agenta/sdk/workflows/types.py +0 -472
- agenta-0.52.6.dist-info/RECORD +0 -371
- /agenta/sdk/{workflows → engines/running}/registry.py +0 -0
|
@@ -0,0 +1,1791 @@
|
|
|
1
|
+
from typing import List, Any, Optional, Any, Dict, Union
|
|
2
|
+
from json import dumps, loads
|
|
3
|
+
import traceback
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
import math
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
|
|
10
|
+
import litellm
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, Field
|
|
13
|
+
from openai import AsyncOpenAI, OpenAIError
|
|
14
|
+
from difflib import SequenceMatcher
|
|
15
|
+
|
|
16
|
+
from agenta.sdk.utils.logging import get_module_logger
|
|
17
|
+
|
|
18
|
+
from agenta.sdk.litellm import mockllm
|
|
19
|
+
from agenta.sdk.types import PromptTemplate, Message
|
|
20
|
+
from agenta.sdk.managers.secrets import SecretsManager
|
|
21
|
+
|
|
22
|
+
from agenta.sdk.decorators.tracing import instrument
|
|
23
|
+
|
|
24
|
+
from agenta.sdk.models.shared import Data
|
|
25
|
+
from agenta.sdk.models.tracing import Trace
|
|
26
|
+
from agenta.sdk.workflows.sandbox import execute_code_safely
|
|
27
|
+
from agenta.sdk.workflows.errors import (
|
|
28
|
+
InvalidConfigurationParametersV0Error,
|
|
29
|
+
MissingConfigurationParameterV0Error,
|
|
30
|
+
InvalidConfigurationParameterV0Error,
|
|
31
|
+
InvalidInputsV0Error,
|
|
32
|
+
MissingInputV0Error,
|
|
33
|
+
InvalidInputV0Error,
|
|
34
|
+
InvalidOutputsV0Error,
|
|
35
|
+
MissingOutputV0Error,
|
|
36
|
+
InvalidSecretsV0Error,
|
|
37
|
+
JSONDiffV0Error,
|
|
38
|
+
LevenshteinDistanceV0Error,
|
|
39
|
+
SyntacticSimilarityV0Error,
|
|
40
|
+
SemanticSimilarityV0Error,
|
|
41
|
+
WebhookServerV0Error,
|
|
42
|
+
WebhookClientV0Error,
|
|
43
|
+
CustomCodeServerV0Error,
|
|
44
|
+
RegexPatternV0Error,
|
|
45
|
+
PromptFormattingV0Error,
|
|
46
|
+
PromptCompletionV0Error,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
from agenta.sdk.litellm import mockllm
|
|
50
|
+
from agenta.sdk.litellm.litellm import litellm_handler
|
|
51
|
+
|
|
52
|
+
# --- Module-level litellm configuration (applied once at import time) ---
litellm.logging = False
litellm.set_verbose = False
# NOTE(review): presumably makes litellm drop request params unsupported by
# the target provider instead of erroring — confirm against litellm docs.
litellm.drop_params = True
# litellm.turn_off_message_logging = True
# Inject the real litellm module into the mock shim so mockllm can delegate.
mockllm.litellm = litellm

# Register the tracing handler as a litellm callback.
litellm.callbacks = [litellm_handler()]

log = get_module_logger(__name__)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
async def _compute_embedding(openai: Any, model: str, input: str) -> List[float]:
|
|
64
|
+
response = await openai.embeddings.create(model=model, input=input)
|
|
65
|
+
# embeddings API already returns a list of floats
|
|
66
|
+
return response.data[0].embedding
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _compute_similarity(embedding_1: List[float], embedding_2: List[float]) -> float:
|
|
70
|
+
# Cosine similarity
|
|
71
|
+
dot = sum(a * b for a, b in zip(embedding_1, embedding_2))
|
|
72
|
+
norm1 = math.sqrt(sum(a * a for a in embedding_1))
|
|
73
|
+
norm2 = math.sqrt(sum(b * b for b in embedding_2))
|
|
74
|
+
if norm1 == 0 or norm2 == 0:
|
|
75
|
+
return 0.0
|
|
76
|
+
return dot / (norm1 * norm2)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
import json
import re
from typing import Any, Dict, Iterable, Tuple, Optional

# Optional dependency: python-jsonpath enables json-path ($...) and
# json-pointer (/...) placeholder resolution. Both names stay None when the
# package is absent; the resolvers below then raise ImportError with a hint.
try:
    import jsonpath  # ✅ use module API
    from jsonpath import JSONPointer  # pointer class is fine to use
except Exception:
    jsonpath = None
    JSONPointer = None
|
|
89
|
+
|
|
90
|
+
# ========= Scheme detection =========
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def detect_scheme(expr: str) -> str:
    """Return 'json-path', 'json-pointer', or 'dot-notation' based on the placeholder prefix."""
    first = expr[:1]
    if first == "$":
        return "json-path"
    if first == "/":
        return "json-pointer"
    return "dot-notation"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ========= Resolvers =========
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def resolve_dot_notation(expr: str, data: dict) -> object:
    """Resolve a dot-notation path against *data*.

    A literal key containing dots takes precedence over nested traversal,
    so variable names may themselves contain dots. Bracket syntax is
    rejected. Raises KeyError when the path cannot be resolved.
    """
    if "[" in expr or "]" in expr:
        raise KeyError(f"Bracket syntax is not supported in dot-notation: {expr!r}")

    # Literal-key shortcut: the whole expression may be a single key.
    if expr in data:
        return data[expr]

    node = data
    for token in (t for t in expr.split(".") if t):
        if isinstance(node, list) and token.isdigit():
            node = node[int(token)]
            continue
        if not isinstance(node, dict):
            raise KeyError(
                f"Cannot access key {token!r} on non-dict while resolving {expr!r}"
            )
        if token not in node:
            raise KeyError(f"Missing key {token!r} while resolving {expr!r}")
        node = node[token]
    return node
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def resolve_json_path(expr: str, data: dict) -> object:
    """Resolve a JSON Path expression against *data*.

    A single match is unwrapped; zero or multiple matches are returned
    as a list. Requires the optional python-jsonpath package.
    """
    if jsonpath is None:
        raise ImportError("python-jsonpath is required for json-path ($...)")

    is_valid = expr == "$" or expr.startswith(("$.", "$["))
    if not is_valid:
        raise ValueError(
            f"Invalid json-path expression {expr!r}. "
            "Must start with '$', '$.' or '$[' (no implicit normalization)."
        )

    # Use the package-level API; findall always returns a list.
    matches = jsonpath.findall(expr, data)
    return matches[0] if len(matches) == 1 else matches
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def resolve_json_pointer(expr: str, data: Dict[str, Any]) -> Any:
    """Resolve a JSON Pointer; returns a single value."""
    # JSONPointer is None when the optional python-jsonpath package is absent.
    if JSONPointer is None:
        raise ImportError("python-jsonpath is required for json-pointer (/...)")
    return JSONPointer(expr).resolve(data)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def resolve_any(expr: str, data: Dict[str, Any]) -> Any:
    """Dispatch *expr* to the resolver matching its detected scheme."""
    dispatch = {
        "json-path": resolve_json_path,
        "json-pointer": resolve_json_pointer,
    }
    handler = dispatch.get(detect_scheme(expr), resolve_dot_notation)
    return handler(expr, data)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# ========= Placeholder & coercion helpers =========
|
|
163
|
+
|
|
164
|
+
# Matches {{ ... }} placeholders, tolerating whitespace around the inner text.
_PLACEHOLDER_RE = re.compile(r"\{\{\s*(.*?)\s*\}\}")


def extract_placeholders(template: str) -> Iterable[str]:
    """Yield the inner text of all {{ ... }} occurrences (trimmed)."""
    return (match.group(1).strip() for match in _PLACEHOLDER_RE.finditer(template))
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def coerce_to_str(value: Any) -> str:
    """Pretty stringify values for embedding into templates."""
    if not isinstance(value, (dict, list)):
        return str(value)
    # Containers are serialized as JSON, keeping non-ASCII characters as-is.
    return json.dumps(value, ensure_ascii=False)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def build_replacements(
    placeholders: Iterable[str], data: Dict[str, Any]
) -> Tuple[Dict[str, str], set]:
    """
    Resolve all placeholders against data.
    Returns (replacements, unresolved_placeholders).
    """
    resolved: Dict[str, str] = {}
    failed: set = set()
    for expr in set(placeholders):
        try:
            value = resolve_any(expr, data)
            # Escape backslashes to avoid regex replacement surprises
            resolved[expr] = coerce_to_str(value).replace("\\", "\\\\")
        except Exception:
            failed.add(expr)
    return resolved, failed
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def apply_replacements(template: str, replacements: Dict[str, str]) -> str:
    """Replace {{ expr }} using a callback to avoid regex-injection issues."""

    def substitute(match: re.Match) -> str:
        key = match.group(1).strip()
        if key in replacements:
            return replacements[key]
        # Unknown placeholders are left untouched in the output.
        return match.group(0)

    return _PLACEHOLDER_RE.sub(substitute, template)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def compute_truly_unreplaced(original: set, rendered: str) -> set:
    """Only count placeholders that were in the original template and remain."""
    remaining = set(extract_placeholders(rendered))
    return {expr for expr in original if expr in remaining}
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def missing_lib_hints(unreplaced: set) -> Optional[str]:
    """Suggest installing python-jsonpath if placeholders indicate json-path or json-pointer usage."""
    library_missing = jsonpath is None or JSONPointer is None
    needs_library = any(expr.startswith(("$", "/")) for expr in unreplaced)
    if needs_library and library_missing:
        return (
            "Install python-jsonpath to enable json-path ($...) and json-pointer (/...)"
        )
    return None
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _format_with_template(
|
|
227
|
+
content: str,
|
|
228
|
+
format: str,
|
|
229
|
+
kwargs: Dict[str, Any],
|
|
230
|
+
) -> str:
|
|
231
|
+
"""Internal method to format content based on template_format"""
|
|
232
|
+
if format == "fstring":
|
|
233
|
+
return content.format(**kwargs)
|
|
234
|
+
|
|
235
|
+
elif format == "jinja2":
|
|
236
|
+
from jinja2 import Template, TemplateError
|
|
237
|
+
|
|
238
|
+
try:
|
|
239
|
+
return Template(content).render(**kwargs)
|
|
240
|
+
except TemplateError:
|
|
241
|
+
return content
|
|
242
|
+
|
|
243
|
+
elif format == "curly":
|
|
244
|
+
original_placeholders = set(extract_placeholders(content))
|
|
245
|
+
|
|
246
|
+
replacements, _unresolved = build_replacements(original_placeholders, kwargs)
|
|
247
|
+
|
|
248
|
+
result = apply_replacements(content, replacements)
|
|
249
|
+
|
|
250
|
+
truly_unreplaced = compute_truly_unreplaced(original_placeholders, result)
|
|
251
|
+
|
|
252
|
+
if truly_unreplaced:
|
|
253
|
+
hint = missing_lib_hints(truly_unreplaced)
|
|
254
|
+
suffix = f" Hint: {hint}" if hint else ""
|
|
255
|
+
raise ValueError(
|
|
256
|
+
f"Template variables not found or unresolved: "
|
|
257
|
+
f"{', '.join(sorted(truly_unreplaced))}.{suffix}"
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
return result
|
|
261
|
+
|
|
262
|
+
return content
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _flatten_json(json_obj: Union[list, dict]) -> Dict[str, Any]:
|
|
266
|
+
"""
|
|
267
|
+
This function takes a (nested) JSON object and flattens it into a single-level dictionary where each key represents the path to the value in the original JSON structure. This is done recursively, ensuring that the full hierarchical context is preserved in the keys.
|
|
268
|
+
|
|
269
|
+
Args:
|
|
270
|
+
json_obj (Union[list, dict]): The (nested) JSON object to flatten. It can be either a dictionary or a list.
|
|
271
|
+
|
|
272
|
+
Returns:
|
|
273
|
+
Dict[str, Any]: The flattened JSON object as a dictionary, with keys representing the paths to the values in the original structure.
|
|
274
|
+
"""
|
|
275
|
+
|
|
276
|
+
output = {}
|
|
277
|
+
|
|
278
|
+
def flatten(obj: Union[list, dict], path: str = "") -> None:
|
|
279
|
+
if isinstance(obj, dict):
|
|
280
|
+
for key, value in obj.items():
|
|
281
|
+
new_key = f"{path}.{key}" if path else key
|
|
282
|
+
if isinstance(value, (dict, list)):
|
|
283
|
+
flatten(value, new_key)
|
|
284
|
+
else:
|
|
285
|
+
output[new_key] = value
|
|
286
|
+
|
|
287
|
+
elif isinstance(obj, list):
|
|
288
|
+
for index, value in enumerate(obj):
|
|
289
|
+
new_key = f"{path}.{index}" if path else str(index)
|
|
290
|
+
if isinstance(value, (dict, list)):
|
|
291
|
+
flatten(value, new_key)
|
|
292
|
+
else:
|
|
293
|
+
output[new_key] = value
|
|
294
|
+
|
|
295
|
+
flatten(json_obj)
|
|
296
|
+
return output
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _compare_jsons(
    ground_truth: Union[list, dict],
    app_output: Union[list, dict],
    settings_values: dict,
):
    """
    This function takes two JSON objects (ground truth and application output), flattens them using the `_flatten_json` function, and then compares the fields.

    Args:
        ground_truth (list | dict): The ground truth
        app_output (list | dict): The application output
        settings_values: dict: The advanced configuration of the evaluator

    Returns:
        the average score between both JSON objects
    """

    def normalize_keys(d: Dict[str, Any], case_insensitive: bool) -> Dict[str, Any]:
        # Fold keys to lower case when case-insensitive comparison is enabled.
        if not case_insensitive:
            return d
        return {k.lower(): v for k, v in d.items()}

    def diff(ground_truth: Any, app_output: Any, compare_schema_only: bool) -> float:
        # Each argument is a single-entry {key: value} dict; score 1.0 on match.
        gt_key, gt_value = next(iter(ground_truth.items()))
        ao_key, ao_value = next(iter(app_output.items()))

        if compare_schema_only:
            # Schema mode compares value types only, not the values themselves.
            return (
                1.0 if (gt_key == ao_key and type(gt_value) == type(ao_value)) else 0.0
            )
        return 1.0 if (gt_key == ao_key and gt_value == ao_value) else 0.0

    case_insensitive_keys = settings_values.get("case_insensitive_keys", False)
    compare_schema_only = settings_values.get("compare_schema_only", False)

    # BUGFIX: normalize BEFORE building the key set. Previously the key set was
    # built from the un-normalized flattened ground truth while lookups went
    # against lower-cased dicts, so with case_insensitive_keys enabled any key
    # containing upper-case letters could never be found and always scored 0.0.
    flattened_ground_truth = normalize_keys(
        _flatten_json(ground_truth), case_insensitive_keys
    )
    flattened_app_output = normalize_keys(
        _flatten_json(app_output), case_insensitive_keys
    )

    keys = set(flattened_ground_truth.keys())
    if settings_values.get("predict_keys", False):
        # Optionally also score keys that appear only in the application output.
        keys = keys.union(set(flattened_app_output.keys()))

    cumulated_score = 0.0
    no_of_keys = len(keys)

    for key in keys:
        ground_truth_value = flattened_ground_truth.get(key, None)
        llm_app_output_value = flattened_app_output.get(key, None)

        key_score = 0.0
        # A key missing on either side (or holding None) scores 0.0.
        if ground_truth_value is not None and llm_app_output_value is not None:
            key_score = diff(
                {key: ground_truth_value},
                {key: llm_app_output_value},
                compare_schema_only,
            )

        cumulated_score += key_score

    if no_of_keys == 0:
        # Both structures flattened to nothing; treat as no match.
        return 0.0
    return cumulated_score / no_of_keys
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
@instrument()
def echo_v0(aloha: Any):
    """Echo the received payload back, wrapped under the "got" key."""
    return {"got": aloha}
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
@instrument(annotate=True)
def auto_exact_match_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Exact match evaluator for comparing outputs against reference outputs.

    inputs: Testcase data, which may contain reference outputs
    outputs: Output from the workflow execution
    parameters: Configuration for the evaluator

    Returns:
        Evaluation result with success flag (True for match, False for mismatch)
    """
    # Validate configuration: a dict with a "correct_answer_key" entry.
    if not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    answer_key = str(parameters["correct_answer_key"])

    # Validate inputs: a dict containing the reference answer under answer_key.
    if not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if answer_key not in inputs:
        raise MissingInputV0Error(path=answer_key)

    reference = inputs[answer_key]

    # --------------------------------------------------------------------------
    matched = False
    if isinstance(outputs, str) and isinstance(reference, str):
        matched = outputs == reference
    elif isinstance(outputs, dict) and isinstance(reference, dict):
        # Canonicalize both dicts so key order does not affect the comparison.
        matched = dumps(outputs, sort_keys=True) == dumps(reference, sort_keys=True)
    # --------------------------------------------------------------------------

    return {"success": matched}
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
@instrument(annotate=True)
def auto_regex_test_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Regex test evaluator for checking if output matches a regex pattern.

    Args:
        outputs: Output from the workflow execution (str, or dict serialized to JSON)
        parameters: Configuration for the evaluator with regex pattern and matching flag

    Returns:
        Evaluation result with success flag

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If regex_pattern is missing.
        InvalidConfigurationParameterV0Error: If regex_pattern is not a str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
        RegexPatternV0Error: If regex_pattern does not compile.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "regex_pattern" not in parameters:
        raise MissingConfigurationParameterV0Error(path="regex_pattern")

    regex_pattern = parameters["regex_pattern"]

    if not isinstance(regex_pattern, str):
        raise InvalidConfigurationParameterV0Error(
            path="regex_pattern",
            expected="str",
            got=regex_pattern,
        )

    # Only an explicit boolean True enables these behaviors; truthy non-bool
    # values (e.g. the string "true") fall back to the safe default.
    case_sensitive = parameters.get("case_sensitive", True) is True

    regex_should_match = parameters.get("regex_should_match", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    try:
        pattern = re.compile(
            regex_pattern,
            flags=0 if case_sensitive else re.IGNORECASE,
        )
    except re.error as e:  # re.compile signals an invalid pattern via re.error
        raise RegexPatternV0Error(pattern=regex_pattern) from e

    result = pattern.search(outputs_str)

    # Success when presence/absence of a match agrees with the expectation flag.
    success = bool(result) == regex_should_match
    # --------------------------------------------------------------------------

    return {"success": success}
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
@instrument(annotate=True)
def field_match_test_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Field match test evaluator for extracting and comparing a specific field from JSON output.

    Args:
        inputs: Testcase data with ground truth
        outputs: Output from the workflow execution (expected to be JSON string or dict)
        parameters: Configuration for the evaluator with json_field to extract

    Returns:
        Evaluation result with success flag

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If json_field or correct_answer_key is missing.
        InvalidInputsV0Error: If inputs is not a dict.
        MissingInputV0Error: If the correct answer key is absent from inputs.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "json_field" not in parameters:
        raise MissingConfigurationParameterV0Error(path="json_field")

    json_field = str(parameters["json_field"])

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    # Malformed outputs are deliberately treated as a failed evaluation rather
    # than an error, so a single bad testcase does not abort a whole run.
    if not isinstance(outputs, (str, dict)):
        return {"success": False}

    outputs_dict = outputs
    if isinstance(outputs, str):
        try:
            outputs_dict = loads(outputs)
        except json.JSONDecodeError:
            return {"success": False}

    if not isinstance(outputs_dict, dict):
        return {"success": False}

    if json_field not in outputs_dict:
        return {"success": False}

    # --------------------------------------------------------------------------
    success = outputs_dict[json_field] == correct_answer
    # --------------------------------------------------------------------------

    return {"success": success}
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
@instrument(annotate=True)
async def auto_webhook_test_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Webhook test evaluator for sending output to an external service for evaluation.

    Args:
        inputs: Testcase data with ground truth
        outputs: Output from the workflow execution
        parameters: Configuration for the evaluator with webhook_url

    Returns:
        Evaluation result with score from the webhook

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If webhook_url or correct_answer_key is missing.
        InvalidConfigurationParameterV0Error: If threshold is not a number in (0.0, 1.0].
        InvalidInputsV0Error: If inputs is not a dict.
        MissingInputV0Error: If the correct answer key is absent from inputs.
        InvalidOutputsV0Error: If outputs or the webhook response has an unsupported type.
        WebhookClientV0Error: If the request fails or the response is not JSON.
        WebhookServerV0Error: If the webhook responds with a non-200 status.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "webhook_url" not in parameters:
        raise MissingConfigurationParameterV0Error(path="webhook_url")

    webhook_url = str(parameters["webhook_url"])

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # `or` (not `.get(..., 0.5)`) so falsy values such as 0 or "" also fall
    # back to the default threshold.
    threshold = parameters.get("threshold") or 0.5

    # Accept int as well as float (an integer threshold of 1 is valid); bool is
    # excluded explicitly because it is a subclass of int.
    if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if not 0.0 < threshold <= 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    json_payload = {
        "inputs": inputs,
        "output": outputs_str,
        "correct_answer": correct_answer,
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                url=webhook_url,
                json=json_payload,
            )
        except Exception as e:
            raise WebhookClientV0Error(
                message=str(e),
            ) from e

        if response.status_code != 200:
            raise WebhookServerV0Error(
                code=response.status_code,
                message=response.json(),
            )

        try:
            _outputs = response.json()
        except Exception as e:
            raise WebhookClientV0Error(
                message=str(e),
            ) from e
    # --------------------------------------------------------------------------

    # bool MUST be checked before int/float: bool is a subclass of int, so the
    # previous ordering routed booleans into the numeric branch and made this
    # branch unreachable.
    if isinstance(_outputs, bool):
        return {"success": _outputs}

    if isinstance(_outputs, (int, float)):
        return {"score": _outputs, "success": _outputs >= threshold}

    if isinstance(_outputs, (dict, str)):
        return _outputs

    raise InvalidOutputsV0Error(expected=["dict", "str"], got=_outputs)
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
@instrument(annotate=True)
async def auto_custom_code_run_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Custom code execution evaluator for running arbitrary code to evaluate outputs.

    Args:
        inputs: Testcase data with ground truth
        outputs: Output from the workflow execution
        parameters: Configuration for the evaluator with code to execute

    Returns:
        Evaluation result with score from the custom code

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If code or correct_answer_key is missing.
        InvalidConfigurationParameterV0Error: If threshold is not a number in (0.0, 1.0].
        InvalidInputsV0Error: If inputs is not a dict.
        MissingInputV0Error: If the correct answer key is absent from inputs.
        InvalidOutputsV0Error: If outputs or the code result has an unsupported type.
        CustomCodeServerV0Error: If the user code raises.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "code" not in parameters:
        raise MissingConfigurationParameterV0Error(path="code")

    code = str(parameters["code"])

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if correct_answer_key not in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    # `or` (not `.get(..., 0.5)`) so falsy values such as 0 or "" also fall
    # back to the default threshold.
    threshold = parameters.get("threshold") or 0.5

    # Accept int as well as float (an integer threshold of 1 is valid); bool is
    # excluded explicitly because it is a subclass of int.
    if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if not 0.0 < threshold <= 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    try:
        _outputs = execute_code_safely(
            app_params={},
            inputs=inputs,
            output=outputs,
            correct_answer=correct_answer,
            code=code,
        )
    except Exception as e:
        raise CustomCodeServerV0Error(
            message=str(e),
            stacktrace=traceback.format_exc(),
        ) from e
    # --------------------------------------------------------------------------

    # bool MUST be checked before int/float: bool is a subclass of int, so the
    # previous ordering routed booleans into the numeric branch and made this
    # branch unreachable.
    if isinstance(_outputs, bool):
        return {"success": _outputs}

    if isinstance(_outputs, (int, float)):
        return {"score": _outputs, "success": _outputs >= threshold}

    if isinstance(_outputs, (dict, str)):
        return _outputs

    raise InvalidOutputsV0Error(expected=["dict", "str"], got=_outputs)
|
|
730
|
+
|
|
731
|
+
|
|
732
|
+
@instrument(annotate=True)
async def auto_ai_critique_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    AI critique evaluator for using an LLM to evaluate outputs.

    Args:
        inputs: Testcase data with ground truth
        outputs: Output from the workflow execution
        parameters: Configuration for the evaluator with prompt_template and model

    Returns:
        Evaluation result with score from the AI

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If prompt_template is missing.
        InvalidConfigurationParameterV0Error: If a configuration value has the wrong type.
        InvalidInputsV0Error: If inputs is provided but not a dict.
        InvalidSecretsV0Error: If the retrieved secrets are not a list.
        PromptFormattingV0Error: If the prompt template cannot be formatted.
        PromptCompletionV0Error: If the LLM call fails.
        InvalidOutputsV0Error: If the LLM result has an unsupported type.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    correct_answer_key = parameters.get("correct_answer_key")

    if "prompt_template" not in parameters:
        raise MissingConfigurationParameterV0Error(path="prompt_template")

    prompt_template = parameters.get("prompt_template")

    if not isinstance(prompt_template, list):
        raise InvalidConfigurationParameterV0Error(
            path="prompt_template",
            expected="list",
            got=prompt_template,
        )

    # Template versions: v2 used Python f-string placeholders, later versions
    # use curly/mustache-style placeholders; v4 defaults to JSON-schema output.
    template_version = parameters.get("version") or "3"

    default_format = "fstring" if template_version == "2" else "curly"

    template_format = str(parameters.get("template_format") or default_format)

    model = parameters.get("model") or "gpt-3.5-turbo"

    if not isinstance(model, str):
        raise InvalidConfigurationParameterV0Error(
            path="model",
            expected="str",
            got=model,
        )

    response_type = parameters.get("response_type") or (
        "json_schema" if template_version == "4" else "text"
    )

    if response_type not in ["text", "json_object", "json_schema"]:
        raise InvalidConfigurationParameterV0Error(
            path="response_type",
            expected=["text", "json_object", "json_schema"],
            got=response_type,
        )

    json_schema = parameters.get("json_schema") or None

    # A schema is only meaningful (and required) for json_schema responses.
    json_schema = json_schema if response_type == "json_schema" else None

    if response_type == "json_schema" and not isinstance(json_schema, dict):
        raise InvalidConfigurationParameterV0Error(
            path="json_schema",
            expected="dict",
            got=json_schema,
        )

    response_format: dict = dict(type=response_type)

    if response_type == "json_schema":
        response_format["json_schema"] = json_schema

    correct_answer = None

    if inputs:
        if not isinstance(inputs, dict):
            raise InvalidInputsV0Error(expected="dict", got=inputs)

        if correct_answer_key:
            if correct_answer_key in inputs:
                correct_answer = inputs[correct_answer_key]

    secrets = await SecretsManager.retrieve_secrets()

    if secrets is None or not isinstance(secrets, list):
        raise InvalidSecretsV0Error(expected="list", got=secrets)

    # Extract the latest non-empty API key per provider; a later secret only
    # overrides an earlier one when its key is truthy.
    provider_api_keys: Dict[str, Any] = {
        "openai": None,
        "anthropic": None,
        "openrouter": None,
        "cohere": None,
        "azure": None,
        "groq": None,
    }

    for secret in secrets:
        if secret.get("kind") != "provider_key":
            continue
        secret_data = secret.get("data", {})
        provider_kind = secret_data.get("kind")
        if provider_kind in provider_api_keys:
            provider_data = secret_data.get("provider", {})
            provider_api_keys[provider_kind] = (
                provider_data.get("key") or provider_api_keys[provider_kind]
            )

    # `or` (not `.get(..., 0.5)`) so falsy values such as 0 or "" also fall
    # back to the default threshold.
    threshold = parameters.get("threshold") or 0.5

    # Accept int as well as float; bool is excluded explicitly because it is a
    # subclass of int.
    if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    litellm.openai_key = provider_api_keys["openai"]
    litellm.anthropic_key = provider_api_keys["anthropic"]
    litellm.openrouter_key = provider_api_keys["openrouter"]
    litellm.cohere_key = provider_api_keys["cohere"]
    litellm.azure_key = provider_api_keys["azure"]
    litellm.groq_key = provider_api_keys["groq"]

    # Build the template context; several aliases are exposed for the same
    # value so templates written against different conventions keep working.
    context: Dict[str, Any] = dict()

    if parameters:
        context.update(
            **{
                "parameters": parameters,
            }
        )

    if correct_answer:
        context.update(
            **{
                "ground_truth": correct_answer,
                "correct_answer": correct_answer,
                "reference": correct_answer,
            }
        )

    if outputs:
        context.update(
            **{
                "prediction": outputs,
                "outputs": outputs,
            }
        )

    if inputs:
        context.update(**inputs)
        context.update(
            **{
                "inputs": inputs,
            }
        )

    try:
        formatted_prompt_template = [
            {
                "role": message["role"],
                "content": _format_with_template(
                    content=message["content"],
                    format=template_format,
                    kwargs=context,
                ),
            }
            for message in prompt_template
        ]
    except Exception as e:
        raise PromptFormattingV0Error(
            message=str(e),
            stacktrace=traceback.format_exc(),
        ) from e

    try:
        response = await litellm.acompletion(
            model=model,
            messages=formatted_prompt_template,
            temperature=0.01,
            response_format=response_format,
        )

        _outputs = response.choices[0].message.content.strip()  # type: ignore

    except litellm.AuthenticationError as e:  # type: ignore
        # Strip litellm's noisy prefix so only the provider message surfaces.
        e.message = e.message.replace(
            "litellm.AuthenticationError: AuthenticationError: ", ""
        )
        raise e

    except Exception as e:
        raise PromptCompletionV0Error(
            message=str(e),
            stacktrace=traceback.format_exc(),
        ) from e
    # --------------------------------------------------------------------------

    # Best effort: a plain-text answer that happens to be JSON is decoded.
    try:
        _outputs = json.loads(_outputs)
    except (ValueError, TypeError):
        pass

    # bool MUST be checked before int/float: bool is a subclass of int, so the
    # previous ordering routed booleans into the numeric branch and made this
    # branch unreachable.
    if isinstance(_outputs, bool):
        return {
            "success": _outputs,
        }

    if isinstance(_outputs, (int, float)):
        return {
            "score": _outputs,
            "success": _outputs >= threshold,
        }

    if isinstance(_outputs, dict):
        return _outputs

    raise InvalidOutputsV0Error(expected=["dict", "str", "int", "float"], got=_outputs)
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
@instrument(annotate=True)
def auto_starts_with_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Starts with evaluator for checking if output starts with a specific prefix.

    Args:
        outputs: Output from the workflow execution (str, or dict serialized to JSON)
        parameters: Configuration for the evaluator with prefix and case sensitivity setting

    Returns:
        Evaluation result with success flag

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If prefix is missing.
        InvalidConfigurationParameterV0Error: If prefix is not a str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "prefix" not in parameters:
        raise MissingConfigurationParameterV0Error(path="prefix")

    prefix = parameters["prefix"]

    if not isinstance(prefix, str):
        raise InvalidConfigurationParameterV0Error(
            path="prefix",
            expected="str",
            got=prefix,
        )

    # Only an explicit boolean True counts as case-sensitive.
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        prefix = prefix.lower()

    success = outputs_str.startswith(prefix)
    # --------------------------------------------------------------------------

    return {"success": success}
|
|
1017
|
+
|
|
1018
|
+
|
|
1019
|
+
@instrument(annotate=True)
def auto_ends_with_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Ends with evaluator for checking if output ends with a specific suffix.

    Args:
        outputs: Output from the workflow execution (str, or dict serialized to JSON)
        parameters: Configuration for the evaluator with suffix and case sensitivity setting

    Returns:
        Evaluation result with success flag

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If suffix is missing.
        InvalidConfigurationParameterV0Error: If suffix is not a str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "suffix" not in parameters:
        raise MissingConfigurationParameterV0Error(path="suffix")

    suffix = parameters["suffix"]

    if not isinstance(suffix, str):
        raise InvalidConfigurationParameterV0Error(
            path="suffix",
            expected="str",
            got=suffix,
        )

    # Only an explicit boolean True counts as case-sensitive.
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        suffix = suffix.lower()

    success = outputs_str.endswith(suffix)
    # --------------------------------------------------------------------------

    return {"success": success}
|
|
1066
|
+
|
|
1067
|
+
|
|
1068
|
+
@instrument(annotate=True)
def auto_contains_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Contains evaluator for checking if output contains a specific substring.

    Args:
        outputs: Output from the workflow execution (str, or dict serialized to JSON)
        parameters: Configuration for the evaluator with substring and case sensitivity setting

    Returns:
        Evaluation result with success flag

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If substring is missing.
        InvalidConfigurationParameterV0Error: If substring is not a str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "substring" not in parameters:
        raise MissingConfigurationParameterV0Error(path="substring")

    substring = parameters["substring"]

    if not isinstance(substring, str):
        raise InvalidConfigurationParameterV0Error(
            path="substring",
            expected="str",
            got=substring,
        )

    # Only an explicit boolean True counts as case-sensitive.
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        substring = substring.lower()

    success = substring in outputs_str
    # --------------------------------------------------------------------------

    return {"success": success}
|
|
1115
|
+
|
|
1116
|
+
|
|
1117
|
+
@instrument(annotate=True)
def auto_contains_any_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Contains any evaluator for checking if output contains any of the specified substrings.

    Args:
        outputs: Output from the workflow execution (str, or dict serialized to JSON)
        parameters: Configuration for the evaluator with substrings list and case sensitivity setting

    Returns:
        Evaluation result with success flag

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If substrings is missing.
        InvalidConfigurationParameterV0Error: If substrings is not a list of str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "substrings" not in parameters:
        raise MissingConfigurationParameterV0Error(path="substrings")

    substrings = parameters["substrings"]

    if not isinstance(substrings, list):
        raise InvalidConfigurationParameterV0Error(
            path="substrings",
            expected="list",
            got=substrings,
        )

    # Validate element types BEFORE stripping: stripping first would raise
    # AttributeError on a non-str element instead of this configuration error.
    if not all(isinstance(s, str) for s in substrings):
        raise InvalidConfigurationParameterV0Error(
            path="substrings",
            expected="list[str]",
            got=substrings,
        )

    substrings = [s.strip() for s in substrings]

    # Only an explicit boolean True counts as case-sensitive.
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        substrings = [s.lower() for s in substrings]

    success = any(substring in outputs_str for substring in substrings)
    # --------------------------------------------------------------------------

    return {"success": success}
|
|
1173
|
+
|
|
1174
|
+
|
|
1175
|
+
@instrument(annotate=True)
def auto_contains_all_v0(
    parameters: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Contains all evaluator for checking if output contains all of the specified substrings.

    Args:
        outputs: Output from the workflow execution (str, or dict serialized to JSON)
        parameters: Configuration for the evaluator with substrings list and case sensitivity setting

    Returns:
        Evaluation result with success flag

    Raises:
        InvalidConfigurationParametersV0Error: If parameters is not a dict.
        MissingConfigurationParameterV0Error: If substrings is missing.
        InvalidConfigurationParameterV0Error: If substrings is not a list of str.
        InvalidOutputsV0Error: If outputs is neither a dict nor a str.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "substrings" not in parameters:
        raise MissingConfigurationParameterV0Error(path="substrings")

    substrings = parameters["substrings"]

    if not isinstance(substrings, list):
        raise InvalidConfigurationParameterV0Error(
            path="substrings",
            expected="list",
            got=substrings,
        )

    # Validate element types BEFORE stripping: stripping first would raise
    # AttributeError on a non-str element instead of this configuration error.
    if not all(isinstance(s, str) for s in substrings):
        raise InvalidConfigurationParameterV0Error(
            path="substrings",
            expected="list[str]",
            got=substrings,
        )

    substrings = [s.strip() for s in substrings]

    # Only an explicit boolean True counts as case-sensitive.
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    if not case_sensitive:
        outputs_str = outputs_str.lower()
        substrings = [s.lower() for s in substrings]

    success = all(substring in outputs_str for substring in substrings)
    # --------------------------------------------------------------------------

    return {"success": success}
|
|
1231
|
+
|
|
1232
|
+
|
|
1233
|
+
@instrument(annotate=True)
def auto_contains_json_v0(
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Contains JSON evaluator for checking if output contains valid JSON content.

    The candidate JSON is the span from the first '{' to the last '}' in the
    stringified output; it must parse with json.loads for the check to pass.

    Args:
        outputs: Output from the workflow execution (dict or str)

    Returns:
        Evaluation result with success flag
    """
    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    text = outputs if isinstance(outputs, str) else dumps(outputs)

    # --------------------------------------------------------------------------
    try:
        # Missing braces raise ValueError; invalid JSON raises JSONDecodeError —
        # either way the output does not contain a valid JSON object.
        candidate = text[text.index("{") : text.rindex("}") + 1]
        json.loads(candidate)
        success = True
    except Exception:  # pylint: disable=broad-exception-caught
        success = False
    # --------------------------------------------------------------------------

    return {"success": success}
|
|
1272
|
+
|
|
1273
|
+
|
|
1274
|
+
@instrument(annotate=True)
def auto_json_diff_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    JSON diff evaluator for finding differences between JSON structures.

    Args:
        inputs: Testcase data with reference JSON
        outputs: Output from the workflow execution
        parameters: Configuration for the evaluator

    Returns:
        Evaluation result with score only (no diff explanation)
    """
    if not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    answer_key = str(parameters["correct_answer_key"])

    if not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if answer_key not in inputs:
        raise MissingInputV0Error(path=answer_key)

    reference = inputs[answer_key]

    if not isinstance(reference, (str, dict)):
        raise InvalidInputV0Error(
            path=answer_key, expected=["dict", "str"], got=reference
        )

    # Reference may arrive either pre-parsed or as a JSON string.
    reference_dict = loads(reference) if isinstance(reference, str) else reference

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    if isinstance(outputs, str):
        try:
            candidate_dict = loads(outputs)
        except json.JSONDecodeError as e:
            raise InvalidOutputsV0Error(expected="dict", got=outputs) from e
    else:
        candidate_dict = outputs

    # Falsy thresholds (absent key) fall back to the 0.5 default.
    threshold = parameters.get("threshold") or 0.5

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if threshold <= 0.0 or threshold > 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    score = None

    # --------------------------------------------------------------------------
    try:
        score = _compare_jsons(
            ground_truth=reference_dict,
            app_output=candidate_dict,  # type: ignore
            settings_values=parameters,
        )
    except Exception as e:
        raise JSONDiffV0Error(message=str(e), stacktrace=traceback.format_exc()) from e
    # --------------------------------------------------------------------------

    if isinstance(score, (int, float)):
        return {"score": score, "success": score >= threshold}

    raise JSONDiffV0Error(
        message=f"json-diff error: got ({type(score)}) {score}, expected (int, float)."
    )
|
|
1362
|
+
|
|
1363
|
+
|
|
1364
|
+
@instrument(annotate=True)
def auto_levenshtein_distance_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Levenshtein distance evaluator using pure Python implementation.
    Measures edit distance and returns normalized similarity score.

    Args:
        inputs: Testcase data with reference string.
        outputs: Output from the workflow execution.
        parameters: Configuration for the evaluator.

    Returns:
        Dictionary with normalized similarity score (0 to 1),
        or error message if evaluation fails.
    """
    if not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    answer_key = str(parameters["correct_answer_key"])

    # Comparison is case-sensitive unless explicitly configured otherwise.
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if answer_key not in inputs:
        raise MissingInputV0Error(path=answer_key)

    reference = inputs[answer_key]

    if not isinstance(reference, (str, dict)):
        raise InvalidInputV0Error(
            path=answer_key, expected=["dict", "str"], got=reference
        )

    reference_str = dumps(reference) if isinstance(reference, dict) else reference

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    candidate_str = dumps(outputs) if isinstance(outputs, dict) else outputs

    # Falsy thresholds (absent key) fall back to the 0.5 default.
    threshold = parameters.get("threshold") or 0.5

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if threshold <= 0.0 or threshold > 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    score = None

    # --------------------------------------------------------------------------
    if not case_sensitive:
        candidate_str = candidate_str.lower()
        reference_str = reference_str.lower()

    try:
        # Row-by-row Wagner-Fischer dynamic program, O(len(reference)) memory.
        if not reference_str:
            distance = len(candidate_str)
        else:
            prev_row = list(range(len(reference_str) + 1))
            for row, ch_out in enumerate(candidate_str, start=1):
                curr_row = [row]
                for col, ch_ref in enumerate(reference_str, start=1):
                    cost = 0 if ch_out == ch_ref else 1
                    curr_row.append(
                        min(
                            prev_row[col] + 1,  # insertion
                            curr_row[col - 1] + 1,  # deletion
                            prev_row[col - 1] + cost,  # substitution
                        )
                    )
                prev_row = curr_row
            distance = prev_row[-1]

        # Normalize the edit distance into a similarity score in [0, 1].
        longest = max(len(candidate_str), len(reference_str))
        score = 1.0 if longest == 0 else 1.0 - (distance / longest)
    except Exception as e:
        raise LevenshteinDistanceV0Error(
            message=str(e), stacktrace=traceback.format_exc()
        ) from e
    # --------------------------------------------------------------------------

    if isinstance(score, (int, float)):
        return {"score": score, "success": score >= threshold}

    raise LevenshteinDistanceV0Error(
        message=f"levenshtein-distance error: got ({type(score)}) {score}, expected (int, float)."
    )
|
|
1469
|
+
|
|
1470
|
+
|
|
1471
|
+
@instrument(annotate=True)
def auto_similarity_match_v0(
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Similarity match evaluator for measuring string similarity between output
    and reference, using difflib's SequenceMatcher ratio.

    Args:
        inputs: Testcase data with reference string
        outputs: Output from the workflow execution
        parameters: Configuration for the evaluator

    Returns:
        Evaluation result with similarity score
    """
    if not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if "correct_answer_key" not in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    answer_key = str(parameters["correct_answer_key"])

    # Comparison is case-sensitive unless explicitly configured otherwise.
    case_sensitive = parameters.get("case_sensitive", True) is True

    if not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if answer_key not in inputs:
        raise MissingInputV0Error(path=answer_key)

    reference = inputs[answer_key]

    if not isinstance(reference, (str, dict)):
        raise InvalidInputV0Error(
            path=answer_key, expected=["dict", "str"], got=reference
        )

    reference_str = dumps(reference) if isinstance(reference, dict) else reference

    if not isinstance(outputs, (str, dict)):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    candidate_str = dumps(outputs) if isinstance(outputs, dict) else outputs

    # "threshold" takes precedence over the legacy "similarity_threshold" name;
    # both fall back to the 0.5 default when absent or falsy.
    threshold = (
        parameters.get("threshold") or parameters.get("similarity_threshold") or 0.5
    )

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if threshold <= 0.0 or threshold > 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    score = None

    # --------------------------------------------------------------------------
    if not case_sensitive:
        candidate_str = candidate_str.lower()
        reference_str = reference_str.lower()

    try:
        score = SequenceMatcher(None, candidate_str, reference_str).ratio()
    except Exception as e:
        raise SyntacticSimilarityV0Error(
            message=str(e), stacktrace=traceback.format_exc()
        ) from e
    # --------------------------------------------------------------------------

    if isinstance(score, (int, float)):
        return {"score": score, "success": score >= threshold}

    raise SyntacticSimilarityV0Error(
        message=f"syntactic-similarity-match error: got ({type(score)}) {score}, expected (int, float)."
    )
|
|
1561
|
+
|
|
1562
|
+
|
|
1563
|
+
@instrument(annotate=True)
async def auto_semantic_similarity_v0(
    *,
    parameters: Optional[Data] = None,
    inputs: Optional[Data] = None,
    outputs: Optional[Union[Data, str]] = None,
) -> Any:
    """
    Semantic similarity evaluator for measuring semantic similarity between output and reference using embeddings.

    Embeds both the workflow output and the reference answer with an OpenAI
    embedding model, then compares the two embeddings via _compute_similarity.

    Args:
        inputs: Testcase data with reference string
        outputs: Output from the workflow execution
        parameters: Configuration for the evaluator with embedding model and credentials

    Returns:
        Evaluation result with cosine similarity score

    Raises:
        InvalidConfigurationParametersV0Error: parameters is not a dict, or
            embedding_model is not a string.
        MissingConfigurationParameterV0Error: "correct_answer_key" is absent.
        InvalidInputsV0Error / MissingInputV0Error / InvalidInputV0Error:
            inputs are missing or the reference value has the wrong type.
        InvalidOutputsV0Error: outputs is neither a dict nor a str.
        InvalidSecretsV0Error: SecretsManager did not return a list.
        InvalidConfigurationParameterV0Error: threshold is not a float in (0, 1].
        SemanticSimilarityV0Error: the computed similarity is not numeric.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if not "correct_answer_key" in parameters:
        raise MissingConfigurationParameterV0Error(path="correct_answer_key")

    correct_answer_key = str(parameters["correct_answer_key"])

    embedding_model = parameters.get("embedding_model", "text-embedding-3-small")

    if not isinstance(embedding_model, str):
        raise InvalidConfigurationParametersV0Error(expected="str", got=embedding_model)

    if inputs is None or not isinstance(inputs, dict):
        raise InvalidInputsV0Error(expected="dict", got=inputs)

    if not correct_answer_key in inputs:
        raise MissingInputV0Error(path=correct_answer_key)

    correct_answer = inputs[correct_answer_key]

    if not isinstance(correct_answer, str) and not isinstance(correct_answer, dict):
        raise InvalidInputV0Error(
            path=correct_answer_key, expected=["dict", "str"], got=correct_answer
        )

    # Dict references are JSON-serialized so both sides are embedded as text.
    correct_answer_str = (
        correct_answer if isinstance(correct_answer, str) else dumps(correct_answer)
    )

    if not isinstance(outputs, str) and not isinstance(outputs, dict):
        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)

    outputs_str = outputs if isinstance(outputs, str) else dumps(outputs)

    secrets = await SecretsManager.retrieve_secrets()

    if secrets is None or not isinstance(secrets, list):
        raise InvalidSecretsV0Error(expected="list", got=secrets)

    # Scan the secrets list for an OpenAI provider key.
    openai_api_key = None  # secrets.get("OPENAI_API_KEY")

    for secret in secrets:
        if secret.get("kind") == "provider_key":
            secret_data = secret.get("data", {})
            if secret_data.get("kind") == "openai":
                provider_data = secret_data.get("provider", {})
                # The last secret with a truthy key wins; falsy keys keep the
                # previously found value.
                openai_api_key = provider_data.get("key") or openai_api_key

    # Falsy thresholds (absent key) fall back to the 0.5 default.
    threshold = parameters.get("threshold") or 0.5

    if not isinstance(threshold, float):
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float",
            got=threshold,
        )

    if not 0.0 < threshold <= 1.0:
        raise InvalidConfigurationParameterV0Error(
            path="threshold",
            expected="float[0.0, 1.0]",
            got=threshold,
        )

    _outputs = None

    # --------------------------------------------------------------------------
    try:
        openai = AsyncOpenAI(api_key=openai_api_key)
    except OpenAIError as e:
        # NOTE(review): re-raise is not chained with `from e` and indexes
        # e.args[0] directly (IndexError if args is empty) — consider fixing.
        raise OpenAIError("OpenAIException - " + e.args[0])

    output_embedding = await _compute_embedding(
        openai,
        embedding_model,
        outputs_str,
    )

    reference_embedding = await _compute_embedding(
        openai,
        embedding_model,
        correct_answer_str,
    )

    _outputs = float(
        _compute_similarity(
            output_embedding,
            reference_embedding,
        )
    )
    # --------------------------------------------------------------------------

    if isinstance(_outputs, (int, float)):
        return {"score": _outputs, "success": _outputs >= threshold}

    raise SemanticSimilarityV0Error(
        message=f"semantic-similarity error: got ({type(_outputs)}) {_outputs}, expected (int, float)."
    )
|
|
1680
|
+
|
|
1681
|
+
|
|
1682
|
+
class SinglePromptConfig(BaseModel):
    """Workflow configuration holding a single prompt template."""

    # Default is a demo geography prompt; callers normally override it by
    # passing a "prompt" entry in the workflow parameters.
    prompt: PromptTemplate = Field(
        default=PromptTemplate(
            system_prompt="You are an expert in geography",
            user_prompt="What is the capital of {{country}}?",
        )
    )
|
|
1689
|
+
|
|
1690
|
+
|
|
1691
|
+
@instrument()
async def completion_v0(
    parameters: Data,
    inputs: Dict[str, str],
) -> Any:
    """
    Run a single-prompt LLM completion workflow.

    Validates that `parameters` carries a prompt config and that `inputs`
    exactly matches the prompt's declared input keys, resolves provider
    credentials, then calls the LLM and returns the first choice's payload.

    Args:
        parameters: Workflow configuration; must be a dict with a "prompt" key
            compatible with SinglePromptConfig.
        inputs: Template variables for the prompt; keys must exactly match
            config.prompt.input_keys when those are declared.

    Returns:
        The first choice's content, refusal, parsed payload, or tool-call
        dicts — whichever is present first, in that order. Implicitly returns
        None when none of these attributes is set.

    Raises:
        InvalidConfigurationParametersV0Error: parameters is not a dict.
        MissingConfigurationParameterV0Error: "prompt" is absent.
        InvalidInputsV0Error: input keys do not match the prompt's declared keys.
        InvalidSecretsV0Error: no provider settings resolved for the model.
    """
    if parameters is None or not isinstance(parameters, dict):
        raise InvalidConfigurationParametersV0Error(expected="dict", got=parameters)

    if not "prompt" in parameters:
        raise MissingConfigurationParameterV0Error(path="prompt")

    params: Dict[str, Any] = {**(parameters or {})}

    config = SinglePromptConfig(**params)
    if config.prompt.input_keys is not None:
        required_keys = set(config.prompt.input_keys)
        provided_keys = set(inputs.keys())

        # Inputs must match the declared keys exactly — no extras, no missing.
        if required_keys != provided_keys:
            raise InvalidInputsV0Error(
                expected=sorted(required_keys),
                got=sorted(provided_keys),
            )

    await SecretsManager.ensure_secrets_in_workflow()

    provider_settings = SecretsManager.get_provider_settings_from_workflow(
        config.prompt.llm_config.model
    )

    if not provider_settings:
        raise InvalidSecretsV0Error(expected="dict", got=provider_settings)

    with mockllm.user_aws_credentials_from(provider_settings):
        # "model" is dropped from the prompt kwargs so the model configured in
        # provider_settings takes precedence.
        response = await mockllm.acompletion(
            **{
                k: v
                for k, v in config.prompt.format(**inputs).to_openai_kwargs().items()
                if k != "model"
            },
            **provider_settings,
        )

    message = response.choices[0].message  # type: ignore

    # Return the first available payload in priority order.
    if message.content is not None:
        return message.content
    if hasattr(message, "refusal") and message.refusal is not None:  # type: ignore
        return message.refusal  # type: ignore
    if hasattr(message, "parsed") and message.parsed is not None:  # type: ignore
        return message.parsed  # type: ignore
    if hasattr(message, "tool_calls") and message.tool_calls is not None:
        return [tool_call.dict() for tool_call in message.tool_calls]
|
|
1744
|
+
|
|
1745
|
+
|
|
1746
|
+
@instrument()
async def chat_v0(
    parameters: Data,
    inputs: Optional[Dict[str, str]] = None,
    messages: Optional[List[Message]] = None,
):
    """
    Run a chat LLM workflow built from a single prompt template.

    Formats the configured prompt with `inputs` (when given), appends any
    extra chat `messages`, resolves provider credentials, then calls the LLM
    and returns the first choice's message as a dict.

    Args:
        parameters: Workflow configuration compatible with SinglePromptConfig.
        inputs: Optional template variables; when the prompt declares input
            keys, these must match them exactly.
        messages: Optional extra chat messages appended after the formatted
            prompt's messages.

    Returns:
        The first choice's message serialized via model_dump(exclude_none=True).

    Raises:
        InvalidInputsV0Error: provided input keys do not match the declared keys.
        InvalidSecretsV0Error: no provider settings resolved for the model.
    """
    # NOTE(review): unlike completion_v0, there is no explicit check that
    # `parameters` is a dict containing "prompt" — SinglePromptConfig(**params)
    # raises a pydantic error instead. Confirm whether this asymmetry is intended.
    params: Dict[str, Any] = {**(parameters or {})}

    config = SinglePromptConfig(**params)
    if config.prompt.input_keys is not None:
        required_keys = set(config.prompt.input_keys)
        provided_keys = set(inputs.keys()) if inputs is not None else set()

        # Inputs must match the declared keys exactly — no extras, no missing.
        if required_keys != provided_keys:
            raise InvalidInputsV0Error(
                expected=sorted(required_keys),
                got=sorted(provided_keys),
            )

    if inputs is not None:
        formatted_prompt = config.prompt.format(**inputs)
    else:
        formatted_prompt = config.prompt
    openai_kwargs = formatted_prompt.to_openai_kwargs()

    if messages is not None:
        openai_kwargs["messages"].extend(messages)

    await SecretsManager.ensure_secrets_in_workflow()

    provider_settings = SecretsManager.get_provider_settings_from_workflow(
        config.prompt.llm_config.model
    )

    if not provider_settings:
        raise InvalidSecretsV0Error(expected="dict", got=provider_settings)

    with mockllm.user_aws_credentials_from(provider_settings):
        response = await mockllm.acompletion(
            **{
                k: v for k, v in openai_kwargs.items() if k != "model"
            },  # we should use the model_name from provider_settings
            **provider_settings,
        )

    return response.choices[0].message.model_dump(exclude_none=True)  # type: ignore
|