agenta 0.52.6__py3-none-any.whl → 0.63.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- agenta/__init__.py +12 -3
- agenta/client/__init__.py +4 -4
- agenta/client/backend/__init__.py +4 -4
- agenta/client/backend/api_keys/client.py +2 -2
- agenta/client/backend/billing/client.py +2 -2
- agenta/client/backend/billing/raw_client.py +2 -2
- agenta/client/backend/client.py +56 -48
- agenta/client/backend/core/client_wrapper.py +2 -2
- agenta/client/backend/core/file.py +3 -1
- agenta/client/backend/core/http_client.py +3 -3
- agenta/client/backend/core/pydantic_utilities.py +13 -3
- agenta/client/backend/human_evaluations/client.py +2 -2
- agenta/client/backend/human_evaluations/raw_client.py +2 -2
- agenta/client/backend/organization/client.py +46 -34
- agenta/client/backend/organization/raw_client.py +32 -26
- agenta/client/backend/raw_client.py +26 -26
- agenta/client/backend/testsets/client.py +18 -18
- agenta/client/backend/testsets/raw_client.py +30 -30
- agenta/client/backend/types/__init__.py +4 -4
- agenta/client/backend/types/account_request.py +3 -1
- agenta/client/backend/types/account_response.py +3 -1
- agenta/client/backend/types/agenta_node_dto.py +3 -1
- agenta/client/backend/types/agenta_nodes_response.py +3 -1
- agenta/client/backend/types/agenta_root_dto.py +3 -1
- agenta/client/backend/types/agenta_roots_response.py +3 -1
- agenta/client/backend/types/agenta_tree_dto.py +3 -1
- agenta/client/backend/types/agenta_trees_response.py +3 -1
- agenta/client/backend/types/aggregated_result.py +3 -1
- agenta/client/backend/types/analytics_response.py +3 -1
- agenta/client/backend/types/annotation.py +6 -4
- agenta/client/backend/types/annotation_create.py +3 -1
- agenta/client/backend/types/annotation_edit.py +3 -1
- agenta/client/backend/types/annotation_link.py +3 -1
- agenta/client/backend/types/annotation_link_response.py +3 -1
- agenta/client/backend/types/annotation_query.py +3 -1
- agenta/client/backend/types/annotation_query_request.py +3 -1
- agenta/client/backend/types/annotation_reference.py +3 -1
- agenta/client/backend/types/annotation_references.py +3 -1
- agenta/client/backend/types/annotation_response.py +3 -1
- agenta/client/backend/types/annotations_response.py +3 -1
- agenta/client/backend/types/app.py +3 -1
- agenta/client/backend/types/app_variant_response.py +3 -1
- agenta/client/backend/types/app_variant_revision.py +3 -1
- agenta/client/backend/types/artifact.py +6 -4
- agenta/client/backend/types/base_output.py +3 -1
- agenta/client/backend/types/body_fetch_workflow_revision.py +3 -1
- agenta/client/backend/types/body_import_testset.py +3 -1
- agenta/client/backend/types/bucket_dto.py +3 -1
- agenta/client/backend/types/collect_status_response.py +3 -1
- agenta/client/backend/types/config_db.py +3 -1
- agenta/client/backend/types/config_dto.py +3 -1
- agenta/client/backend/types/config_response_model.py +3 -1
- agenta/client/backend/types/correct_answer.py +3 -1
- agenta/client/backend/types/create_app_output.py +3 -1
- agenta/client/backend/types/custom_model_settings_dto.py +3 -1
- agenta/client/backend/types/custom_provider_dto.py +3 -1
- agenta/client/backend/types/custom_provider_kind.py +1 -1
- agenta/client/backend/types/custom_provider_settings_dto.py +3 -1
- agenta/client/backend/types/delete_evaluation.py +3 -1
- agenta/client/backend/types/environment_output.py +3 -1
- agenta/client/backend/types/environment_output_extended.py +3 -1
- agenta/client/backend/types/environment_revision.py +3 -1
- agenta/client/backend/types/error.py +3 -1
- agenta/client/backend/types/evaluation.py +3 -1
- agenta/client/backend/types/evaluation_scenario.py +3 -1
- agenta/client/backend/types/evaluation_scenario_input.py +3 -1
- agenta/client/backend/types/evaluation_scenario_output.py +3 -1
- agenta/client/backend/types/evaluation_scenario_result.py +3 -1
- agenta/client/backend/types/evaluator.py +6 -4
- agenta/client/backend/types/evaluator_config.py +6 -4
- agenta/client/backend/types/evaluator_flags.py +3 -1
- agenta/client/backend/types/evaluator_mapping_output_interface.py +3 -1
- agenta/client/backend/types/evaluator_output_interface.py +3 -1
- agenta/client/backend/types/evaluator_query.py +3 -1
- agenta/client/backend/types/evaluator_query_request.py +3 -1
- agenta/client/backend/types/evaluator_request.py +3 -1
- agenta/client/backend/types/evaluator_response.py +3 -1
- agenta/client/backend/types/evaluators_response.py +3 -1
- agenta/client/backend/types/exception_dto.py +3 -1
- agenta/client/backend/types/extended_o_tel_tracing_response.py +3 -1
- agenta/client/backend/types/get_config_response.py +3 -1
- agenta/client/backend/types/header.py +3 -1
- agenta/client/backend/types/http_validation_error.py +3 -1
- agenta/client/backend/types/human_evaluation.py +3 -1
- agenta/client/backend/types/human_evaluation_scenario.py +3 -1
- agenta/client/backend/types/human_evaluation_scenario_input.py +3 -1
- agenta/client/backend/types/human_evaluation_scenario_output.py +3 -1
- agenta/client/backend/types/invite_request.py +3 -1
- agenta/client/backend/types/legacy_analytics_response.py +3 -1
- agenta/client/backend/types/legacy_data_point.py +3 -1
- agenta/client/backend/types/legacy_evaluator.py +3 -1
- agenta/client/backend/types/legacy_scope_request.py +3 -1
- agenta/client/backend/types/legacy_scopes_response.py +3 -1
- agenta/client/backend/types/legacy_subscription_request.py +3 -1
- agenta/client/backend/types/legacy_user_request.py +3 -1
- agenta/client/backend/types/legacy_user_response.py +3 -1
- agenta/client/backend/types/lifecycle_dto.py +3 -1
- agenta/client/backend/types/link_dto.py +3 -1
- agenta/client/backend/types/list_api_keys_response.py +3 -1
- agenta/client/backend/types/llm_run_rate_limit.py +3 -1
- agenta/client/backend/types/meta_request.py +3 -1
- agenta/client/backend/types/metrics_dto.py +3 -1
- agenta/client/backend/types/new_testset.py +3 -1
- agenta/client/backend/types/node_dto.py +3 -1
- agenta/client/backend/types/o_tel_context_dto.py +3 -1
- agenta/client/backend/types/o_tel_event.py +6 -4
- agenta/client/backend/types/o_tel_event_dto.py +3 -1
- agenta/client/backend/types/o_tel_extra_dto.py +3 -1
- agenta/client/backend/types/o_tel_flat_span.py +6 -4
- agenta/client/backend/types/o_tel_link.py +6 -4
- agenta/client/backend/types/o_tel_link_dto.py +3 -1
- agenta/client/backend/types/o_tel_links_response.py +3 -1
- agenta/client/backend/types/o_tel_span.py +1 -1
- agenta/client/backend/types/o_tel_span_dto.py +3 -1
- agenta/client/backend/types/o_tel_spans_tree.py +3 -1
- agenta/client/backend/types/o_tel_tracing_data_response.py +3 -1
- agenta/client/backend/types/o_tel_tracing_request.py +3 -1
- agenta/client/backend/types/o_tel_tracing_response.py +3 -1
- agenta/client/backend/types/organization.py +3 -1
- agenta/client/backend/types/organization_details.py +3 -1
- agenta/client/backend/types/organization_membership_request.py +3 -1
- agenta/client/backend/types/organization_output.py +3 -1
- agenta/client/backend/types/organization_request.py +3 -1
- agenta/client/backend/types/parent_dto.py +3 -1
- agenta/client/backend/types/project_membership_request.py +3 -1
- agenta/client/backend/types/project_request.py +3 -1
- agenta/client/backend/types/project_scope.py +3 -1
- agenta/client/backend/types/projects_response.py +3 -1
- agenta/client/backend/types/reference.py +6 -4
- agenta/client/backend/types/reference_dto.py +3 -1
- agenta/client/backend/types/reference_request_model.py +3 -1
- agenta/client/backend/types/result.py +3 -1
- agenta/client/backend/types/root_dto.py +3 -1
- agenta/client/backend/types/scopes_response_model.py +3 -1
- agenta/client/backend/types/secret_dto.py +3 -1
- agenta/client/backend/types/secret_response_dto.py +3 -1
- agenta/client/backend/types/simple_evaluation_output.py +3 -1
- agenta/client/backend/types/span_dto.py +6 -4
- agenta/client/backend/types/standard_provider_dto.py +3 -1
- agenta/client/backend/types/standard_provider_settings_dto.py +3 -1
- agenta/client/backend/types/status_dto.py +3 -1
- agenta/client/backend/types/tags_request.py +3 -1
- agenta/client/backend/types/testcase_response.py +6 -4
- agenta/client/backend/types/testset.py +6 -4
- agenta/client/backend/types/{test_set_output_response.py → testset_output_response.py} +4 -2
- agenta/client/backend/types/testset_request.py +3 -1
- agenta/client/backend/types/testset_response.py +3 -1
- agenta/client/backend/types/{test_set_simple_response.py → testset_simple_response.py} +4 -2
- agenta/client/backend/types/testsets_response.py +3 -1
- agenta/client/backend/types/time_dto.py +3 -1
- agenta/client/backend/types/tree_dto.py +3 -1
- agenta/client/backend/types/update_app_output.py +3 -1
- agenta/client/backend/types/user_request.py +3 -1
- agenta/client/backend/types/validation_error.py +3 -1
- agenta/client/backend/types/workflow_artifact.py +6 -4
- agenta/client/backend/types/workflow_data.py +3 -1
- agenta/client/backend/types/workflow_flags.py +3 -1
- agenta/client/backend/types/workflow_request.py +3 -1
- agenta/client/backend/types/workflow_response.py +3 -1
- agenta/client/backend/types/workflow_revision.py +6 -4
- agenta/client/backend/types/workflow_revision_request.py +3 -1
- agenta/client/backend/types/workflow_revision_response.py +3 -1
- agenta/client/backend/types/workflow_revisions_response.py +3 -1
- agenta/client/backend/types/workflow_variant.py +6 -4
- agenta/client/backend/types/workflow_variant_request.py +3 -1
- agenta/client/backend/types/workflow_variant_response.py +3 -1
- agenta/client/backend/types/workflow_variants_response.py +3 -1
- agenta/client/backend/types/workflows_response.py +3 -1
- agenta/client/backend/types/workspace.py +3 -1
- agenta/client/backend/types/workspace_member_response.py +3 -1
- agenta/client/backend/types/workspace_membership_request.py +3 -1
- agenta/client/backend/types/workspace_permission.py +3 -1
- agenta/client/backend/types/workspace_request.py +3 -1
- agenta/client/backend/types/workspace_response.py +3 -1
- agenta/client/backend/vault/raw_client.py +4 -4
- agenta/client/backend/workspace/client.py +2 -2
- agenta/client/client.py +102 -88
- agenta/sdk/__init__.py +52 -3
- agenta/sdk/agenta_init.py +43 -16
- agenta/sdk/assets.py +23 -15
- agenta/sdk/context/serving.py +20 -8
- agenta/sdk/context/tracing.py +40 -22
- agenta/sdk/contexts/__init__.py +0 -0
- agenta/sdk/contexts/routing.py +38 -0
- agenta/sdk/contexts/running.py +57 -0
- agenta/sdk/contexts/tracing.py +86 -0
- agenta/sdk/decorators/__init__.py +1 -0
- agenta/sdk/decorators/routing.py +284 -0
- agenta/sdk/decorators/running.py +692 -98
- agenta/sdk/decorators/serving.py +20 -21
- agenta/sdk/decorators/tracing.py +176 -131
- agenta/sdk/engines/__init__.py +0 -0
- agenta/sdk/engines/running/__init__.py +0 -0
- agenta/sdk/engines/running/utils.py +17 -0
- agenta/sdk/engines/tracing/__init__.py +1 -0
- agenta/sdk/engines/tracing/attributes.py +185 -0
- agenta/sdk/engines/tracing/conventions.py +49 -0
- agenta/sdk/engines/tracing/exporters.py +130 -0
- agenta/sdk/engines/tracing/inline.py +1154 -0
- agenta/sdk/engines/tracing/processors.py +190 -0
- agenta/sdk/engines/tracing/propagation.py +102 -0
- agenta/sdk/engines/tracing/spans.py +136 -0
- agenta/sdk/engines/tracing/tracing.py +324 -0
- agenta/sdk/evaluations/__init__.py +2 -0
- agenta/sdk/evaluations/metrics.py +37 -0
- agenta/sdk/evaluations/preview/__init__.py +0 -0
- agenta/sdk/evaluations/preview/evaluate.py +765 -0
- agenta/sdk/evaluations/preview/utils.py +861 -0
- agenta/sdk/evaluations/results.py +66 -0
- agenta/sdk/evaluations/runs.py +153 -0
- agenta/sdk/evaluations/scenarios.py +48 -0
- agenta/sdk/litellm/litellm.py +12 -0
- agenta/sdk/litellm/mockllm.py +6 -8
- agenta/sdk/litellm/mocks/__init__.py +5 -5
- agenta/sdk/managers/applications.py +304 -0
- agenta/sdk/managers/config.py +2 -2
- agenta/sdk/managers/evaluations.py +0 -0
- agenta/sdk/managers/evaluators.py +303 -0
- agenta/sdk/managers/secrets.py +161 -24
- agenta/sdk/managers/shared.py +3 -1
- agenta/sdk/managers/testsets.py +441 -0
- agenta/sdk/managers/vault.py +3 -3
- agenta/sdk/middleware/auth.py +0 -176
- agenta/sdk/middleware/config.py +27 -9
- agenta/sdk/middleware/vault.py +204 -9
- agenta/sdk/middlewares/__init__.py +0 -0
- agenta/sdk/middlewares/routing/__init__.py +0 -0
- agenta/sdk/middlewares/routing/auth.py +263 -0
- agenta/sdk/middlewares/routing/cors.py +30 -0
- agenta/sdk/middlewares/routing/otel.py +29 -0
- agenta/sdk/middlewares/running/__init__.py +0 -0
- agenta/sdk/middlewares/running/normalizer.py +321 -0
- agenta/sdk/middlewares/running/resolver.py +161 -0
- agenta/sdk/middlewares/running/vault.py +140 -0
- agenta/sdk/models/__init__.py +0 -0
- agenta/sdk/models/blobs.py +33 -0
- agenta/sdk/models/evaluations.py +119 -0
- agenta/sdk/models/git.py +126 -0
- agenta/sdk/models/shared.py +167 -0
- agenta/sdk/models/testsets.py +163 -0
- agenta/sdk/models/tracing.py +202 -0
- agenta/sdk/models/workflows.py +753 -0
- agenta/sdk/tracing/attributes.py +4 -4
- agenta/sdk/tracing/exporters.py +67 -17
- agenta/sdk/tracing/inline.py +37 -45
- agenta/sdk/tracing/processors.py +97 -0
- agenta/sdk/tracing/propagation.py +3 -1
- agenta/sdk/tracing/spans.py +4 -0
- agenta/sdk/tracing/tracing.py +13 -15
- agenta/sdk/types.py +222 -22
- agenta/sdk/utils/cache.py +1 -1
- agenta/sdk/utils/client.py +38 -0
- agenta/sdk/utils/helpers.py +13 -12
- agenta/sdk/utils/logging.py +18 -78
- agenta/sdk/utils/references.py +23 -0
- agenta/sdk/workflows/builtin.py +600 -0
- agenta/sdk/workflows/configurations.py +22 -0
- agenta/sdk/workflows/errors.py +292 -0
- agenta/sdk/workflows/handlers.py +1791 -0
- agenta/sdk/workflows/interfaces.py +948 -0
- agenta/sdk/workflows/sandbox.py +118 -0
- agenta/sdk/workflows/utils.py +303 -6
- {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/METADATA +37 -33
- agenta-0.63.2.dist-info/RECORD +421 -0
- {agenta-0.52.6.dist-info → agenta-0.63.2.dist-info}/WHEEL +1 -1
- agenta/sdk/middleware/adapt.py +0 -253
- agenta/sdk/middleware/base.py +0 -40
- agenta/sdk/middleware/flags.py +0 -40
- agenta/sdk/workflows/types.py +0 -472
- agenta-0.52.6.dist-info/RECORD +0 -371
- /agenta/sdk/{workflows → engines/running}/registry.py +0 -0
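Several modules were moved or renamed in this release, as the entries above show (for example test_set_simple_response.py → testset_simple_response.py and agenta/sdk/{workflows → engines/running}/registry.py). Below is a minimal sketch of the import-path changes these moves imply, inferred from the file list alone; any names re-exported inside these modules are not visible in this diff and may have changed as well:

# agenta 0.52.6
import agenta.sdk.workflows.registry
from agenta.client.backend.types import test_set_simple_response

# agenta 0.63.2
import agenta.sdk.engines.running.registry
from agenta.client.backend.types import testset_simple_response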
agenta/sdk/evaluations/preview/evaluate.py

@@ -0,0 +1,765 @@
from typing import Dict, List, Any, Union, Optional, Tuple
from uuid import UUID
from copy import deepcopy
from datetime import datetime

from pydantic import BaseModel

from agenta.sdk.models.evaluations import (
    Origin,
    Target,
    Link,
    Reference,
    SimpleEvaluationData,
)
from agenta.sdk.models.workflows import (
    ApplicationRevision,
    EvaluatorRevision,
    WorkflowServiceRequestData,
    ApplicationServiceRequest,
    EvaluatorServiceRequest,
)
from agenta.sdk.models.testsets import TestsetRevision

from agenta.sdk.utils.references import get_slug_from_name_and_id
from agenta.sdk.evaluations.preview.utils import fetch_trace_data

from agenta.sdk.managers.testsets import (
    acreate as acreate_testset,
    aretrieve as aretrieve_testset,
)
from agenta.sdk.managers.applications import (
    aupsert as aupsert_application,
    aretrieve as aretrieve_application,
)
from agenta.sdk.managers.evaluators import (
    aupsert as aupsert_evaluator,
    aretrieve as aretrieve_evaluator,
)
from agenta.sdk.evaluations.runs import (
    acreate as acreate_run,
    aclose as aclose_run,
    aurl as aget_url,
)
from agenta.sdk.evaluations.scenarios import (
    acreate as aadd_scenario,
)
from agenta.sdk.evaluations.results import (
    acreate as alog_result,
)
from agenta.sdk.evaluations.metrics import (
    arefresh as acompute_metrics,
)


from agenta.sdk.models.workflows import (
    WorkflowServiceInterface,
    WorkflowServiceConfiguration,
)
from agenta.sdk.decorators.running import (
    invoke_application,
    invoke_evaluator,
)


class EvaluateSpecs(BaseModel):
    testsets: Optional[Target] = None
    applications: Optional[Target] = None
    evaluators: Optional[Target] = None

    repeats: Optional[int] = None


async def _parse_evaluate_kwargs(
    *,
    testsets: Optional[Target] = None,
    applications: Optional[Target] = None,
    evaluators: Optional[Target] = None,
    #
    repeats: Optional[int] = None,
    #
    specs: Optional[Union[EvaluateSpecs, Dict[str, Any]]] = None,
) -> SimpleEvaluationData:
    _specs = deepcopy(specs)
    if isinstance(_specs, dict):
        _specs = EvaluateSpecs(**_specs)
    if _specs and not isinstance(_specs, EvaluateSpecs):
        _specs = None

    simple_evaluation_data = SimpleEvaluationData(
        testset_steps=testsets or (_specs.testsets if _specs else None),
        application_steps=applications or (_specs.applications if _specs else None),
        evaluator_steps=evaluators or (_specs.evaluators if _specs else None),
        #
        repeats=repeats or (_specs.repeats if _specs else None),
    )

    if not simple_evaluation_data.testset_steps:
        raise ValueError("Invalid 'evaluate()' specs: missing testsets")
    if not simple_evaluation_data.application_steps:
        raise ValueError("Invalid 'evaluate()' specs: missing applications")
    if not simple_evaluation_data.evaluator_steps:
        raise ValueError("Invalid 'evaluate()' specs: missing evaluators")

    return simple_evaluation_data


async def _upsert_entities(
    simple_evaluation_data: SimpleEvaluationData,
) -> SimpleEvaluationData:
    if simple_evaluation_data.testset_steps:
        if isinstance(simple_evaluation_data.testset_steps, list):
            testset_steps: Dict[str, Origin] = {}

            if all(
                isinstance(testset_revision_id, UUID)
                for testset_revision_id in simple_evaluation_data.testset_steps
            ):
                for testset_revision_id in simple_evaluation_data.testset_steps:
                    if isinstance(testset_revision_id, UUID):
                        testset_steps[str(testset_revision_id)] = "custom"

            elif all(
                isinstance(testcases_data, List)
                for testcases_data in simple_evaluation_data.testset_steps
            ):
                for testcases_data in simple_evaluation_data.testset_steps:
                    if isinstance(testcases_data, List):
                        if all(isinstance(step, Dict) for step in testcases_data):
                            testset_revision_id = await acreate_testset(
                                data=testcases_data,
                            )
                            testset_steps[str(testset_revision_id)] = "custom"

            simple_evaluation_data.testset_steps = testset_steps

    if not simple_evaluation_data.testset_steps or not isinstance(
        simple_evaluation_data.testset_steps, dict
    ):
        raise ValueError(
            "Invalid 'evaluate()' specs: missing or invalid testset steps",
        )

    if simple_evaluation_data.application_steps:
        if isinstance(simple_evaluation_data.application_steps, list):
            application_steps: Dict[str, Origin] = {}

            if all(
                isinstance(application_revision_id, UUID)
                for application_revision_id in simple_evaluation_data.application_steps
            ):
                for application_revision_id in simple_evaluation_data.application_steps:
                    if isinstance(application_revision_id, UUID):
                        application_steps[str(application_revision_id)] = "custom"

            elif all(
                callable(application_handler)
                for application_handler in simple_evaluation_data.application_steps
            ):
                for application_handler in simple_evaluation_data.application_steps:
                    if callable(application_handler):
                        application_revision_id = await aupsert_application(
                            handler=application_handler,
                        )
                        application_steps[str(application_revision_id)] = "custom"

            simple_evaluation_data.application_steps = application_steps

    if not simple_evaluation_data.application_steps or not isinstance(
        simple_evaluation_data.application_steps, dict
    ):
        raise ValueError(
            "Invalid 'evaluate()' specs: missing or invalid application steps",
        )

    if simple_evaluation_data.evaluator_steps:
        if isinstance(simple_evaluation_data.evaluator_steps, list):
            evaluator_steps: Dict[str, Origin] = {}

            if all(
                isinstance(evaluator_revision_id, UUID)
                for evaluator_revision_id in simple_evaluation_data.evaluator_steps
            ):
                for evaluator_revision_id in simple_evaluation_data.evaluator_steps:
                    if isinstance(evaluator_revision_id, UUID):
                        evaluator_steps[str(evaluator_revision_id)] = "custom"

            elif all(
                callable(evaluator_handler)
                for evaluator_handler in simple_evaluation_data.evaluator_steps
            ):
                for evaluator_handler in simple_evaluation_data.evaluator_steps:
                    if callable(evaluator_handler):
                        evaluator_revision_id = await aupsert_evaluator(
                            handler=evaluator_handler,
                        )
                        evaluator_steps[str(evaluator_revision_id)] = "custom"

            simple_evaluation_data.evaluator_steps = evaluator_steps

    if not simple_evaluation_data.evaluator_steps or not isinstance(
        simple_evaluation_data.evaluator_steps, dict
    ):
        raise ValueError(
            "Invalid 'evaluate()' specs: missing or invalid evaluator steps",
        )

    return simple_evaluation_data


async def _retrieve_entities(
    simple_evaluation_data: SimpleEvaluationData,
) -> Tuple[
    Dict[UUID, TestsetRevision],
    Dict[UUID, ApplicationRevision],
    Dict[UUID, EvaluatorRevision],
]:
    testset_revisions: Dict[UUID, TestsetRevision] = {}
    # for testset_revision_id, origin in simple_evaluation_data.testset_steps.items():
    #     testset_revision = await retrieve_testset(
    #         testset_revision_id=testset_revision_id,
    #     )
    for testset_id, origin in simple_evaluation_data.testset_steps.items():
        testset_revision = await aretrieve_testset(
            testset_id=testset_id,
        )

        if not testset_revision or not testset_revision.id:
            continue

        testset_revisions[testset_revision.id] = testset_revision

    application_revisions: Dict[UUID, ApplicationRevision] = {}
    for (
        application_revision_id,
        origin,
    ) in simple_evaluation_data.application_steps.items():
        application_revision = await aretrieve_application(
            application_revision_id=application_revision_id,
        )

        if not application_revision:
            continue

        application_revisions[application_revision_id] = application_revision

    evaluator_revisions: Dict[UUID, EvaluatorRevision] = {}
    for evaluator_revision_id, origin in simple_evaluation_data.evaluator_steps.items():
        evaluator_revision = await aretrieve_evaluator(
            evaluator_revision_id=evaluator_revision_id,
        )

        if not evaluator_revision:
            continue

        evaluator_revisions[evaluator_revision_id] = evaluator_revision

    return testset_revisions, application_revisions, evaluator_revisions


def _timestamp_suffix():
    suffix = datetime.now().strftime("%y-%m-%d · %H:%M")
    return f" [{suffix}]"


UNICODE = {
    "here": "• ",
    "root": "┌─ ",
    "next": "├─ ",
    "last": "└─ ",
    "pipe": "│  ",
    "skip": "   ",
    "this": "── ",
}


# @debug
async def aevaluate(
    *,
    name: Optional[str] = None,
    description: Optional[str] = None,
    #
    testsets: Optional[Target] = None,
    applications: Optional[Target] = None,
    evaluators: Optional[Target] = None,
    #
    repeats: Optional[int] = None,
    #
    specs: Optional[Union[EvaluateSpecs, Dict[str, Any]]] = None,
):
    simple_evaluation_data = await _parse_evaluate_kwargs(
        testsets=testsets,
        applications=applications,
        evaluators=evaluators,
        repeats=repeats,
        specs=specs,
    )

    simple_evaluation_data = await _upsert_entities(
        simple_evaluation_data=simple_evaluation_data,
    )

    print()
    print(
        "────────────────────────────────────────────────────────────────────────────"
    )
    print(f"Evaluation running...")
    print(
        "────────────────────────────────────────────────────────────────────────────"
    )

    suffix = _timestamp_suffix()
    name = f"{name}{suffix}"

    run = await acreate_run(
        name=name,
        description=description,
        #
        testset_steps=simple_evaluation_data.testset_steps,
        application_steps=simple_evaluation_data.application_steps,
        evaluator_steps=simple_evaluation_data.evaluator_steps,
        #
        repeats=simple_evaluation_data.repeats,
    )

    print(
        f"{UNICODE['here']}"
        f"{UNICODE['skip']}"
        f"{UNICODE['skip']}"
        f"{UNICODE['skip']}"
        f"{UNICODE['skip']}"
        f" run_id={str(run.id)}",
    )

    if not run.id:
        print("[failure] could not create evaluation")
        return None

    (
        testset_revisions,
        application_revisions,
        evaluator_revisions,
    ) = await _retrieve_entities(
        simple_evaluation_data=simple_evaluation_data,
    )

    scenarios = list()

    metrics = dict()

    for testset_revision in testset_revisions.values():
        if not testset_revision.data or not testset_revision.data.testcases:
            continue

        testcases = testset_revision.data.testcases

        print(
            f"{UNICODE['next']}"
            f"{UNICODE['here']}"
            f"{UNICODE['skip']}"
            f"{UNICODE['skip']}"
            f"{UNICODE['skip']}"
            f" testset_id={str(testset_revision.testset_id)}",
        )

        for testcase_idx, testcase in enumerate(testcases):
            print(
                f"{UNICODE['pipe']}"
                f"{UNICODE['pipe']}"
                f"{UNICODE['skip']}"
                f"{UNICODE['skip']}"
                f"{UNICODE['skip']}"
                "-----------------------"
                "--------------------------------------"
            )

            print(
                f"{UNICODE['pipe']}"
                f"{UNICODE['next' if testcase_idx < len(testcases) - 1 else 'last']}"
                f"{UNICODE['here']}"
                f"{UNICODE['skip']}"
                f"{UNICODE['skip']}"
                f"testcase_id={str(testcase.id)}",
            )

            scenario = await aadd_scenario(
                run_id=run.id,
            )

            print(
                f"{UNICODE['pipe']}"
                f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
                f"{UNICODE['next']}"
                f"{UNICODE['here']}"
                f"{UNICODE['skip']}"
                f"scenario_id={str(scenario.id)}",
            )

            results = dict()

            result = await alog_result(
                run_id=run.id,
                scenario_id=scenario.id,
                step_key="testset-" + testset_revision.slug,  # type: ignore
                testcase_id=testcase.id,
            )

            print(
                f"{UNICODE['pipe']}"
                f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
                f"{UNICODE['pipe']}"
                f"{UNICODE['next']}"
                f"{UNICODE['here']}"
                f" result_id={str(result.id)} (testcase)",
            )

            results[testset_revision.slug] = result

            _testcase = testcase.model_dump(
                mode="json",
                exclude_none=True,
            )  # type: ignore
            inputs = testcase.data
            if isinstance(inputs, dict):
                if "testcase_dedup_id" in inputs:
                    del inputs["testcase_dedup_id"]

            for application_revision in application_revisions.values():
                if not application_revision or not application_revision.data:
                    print("Missing or invalid application revision")
                    if application_revision:
                        print(application_revision.model_dump(exclude_none=True))
                    continue

                # print(f" Application {application_revision.model_dump(exclude_none=True)}")  # type: ignore

                references = dict(
                    testset=Reference(
                        id=testset_revision.testset_id,
                    ),
                    testset_variant=Reference(
                        id=testset_revision.testset_variant_id,
                    ),
                    testset_revision=Reference(
                        id=testset_revision.id,
                        slug=testset_revision.slug,
                        version=testset_revision.version,
                    ),
                    application=Reference(
                        id=application_revision.application_id,
                    ),
                    application_variant=Reference(
                        id=application_revision.application_variant_id,
                    ),
                    application_revision=Reference(
                        id=application_revision.id,
                        slug=application_revision.slug,
                        version=application_revision.version,
                    ),
                )
                links = None

                _revision = application_revision.model_dump(
                    mode="json",
                    exclude_none=True,
                )
                interface = WorkflowServiceInterface(
                    **(
                        application_revision.data.model_dump()
                        if application_revision.data
                        else {}
                    )
                )
                configuration = WorkflowServiceConfiguration(
                    **(
                        application_revision.data.model_dump()
                        if application_revision.data
                        else {}
                    )
                )
                parameters = application_revision.data.parameters

                _trace = None
                outputs = None

                workflow_service_request_data = WorkflowServiceRequestData(
                    revision=_revision,
                    parameters=parameters,
                    #
                    testcase=_testcase,
                    inputs=inputs,
                    #
                    trace=_trace,
                    outputs=outputs,
                )

                application_request = ApplicationServiceRequest(
                    interface=interface,
                    configuration=configuration,
                    #
                    data=workflow_service_request_data,
                    #
                    references=references,  # type: ignore
                    links=links,  # type: ignore
                )

                application_response = await invoke_application(
                    request=application_request,
                )

                if (
                    not application_response
                    or not application_response.data
                    or not application_response.trace_id
                ):
                    print("Missing or invalid application response")
                    if application_response:
                        print(application_response.model_dump(exclude_none=True))
                    continue

                trace_id = application_response.trace_id

                if not application_revision.id or not application_revision.name:
                    print("Missing application revision ID or name")
                    continue

                application_slug = get_slug_from_name_and_id(
                    name=application_revision.name,
                    id=application_revision.id,
                )

                trace = fetch_trace_data(trace_id, max_retries=30, delay=1.0)

                result = await alog_result(
                    run_id=run.id,
                    scenario_id=scenario.id,
                    step_key="application-" + application_slug,  # type: ignore
                    trace_id=trace_id,
                )

                print(
                    f"{UNICODE['pipe']}"
                    f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
                    f"{UNICODE['pipe']}"
                    f"{UNICODE['next']}"
                    f"{UNICODE['here']}"
                    f" result_id={str(result.id)} (invocation)",
                )

                results[application_slug] = result

                trace = await trace

                if not trace:
                    print("Failed to fetch trace data for application")
                    continue

                root_span = list(trace.get("spans", {}).values())[0]
                trace_attributes: dict = root_span.get("attributes", {})
                trace_attributes_ag: dict = trace_attributes.get("ag", {})
                trace_attributes_ag_data: dict = trace_attributes_ag.get("data", {})
                outputs = trace_attributes_ag_data.get("outputs")
                inputs = inputs or trace_attributes_ag_data.get("inputs")

                for i, evaluator_revision in enumerate(evaluator_revisions.values()):
                    if not evaluator_revision or not evaluator_revision.data:
                        print("Missing or invalid evaluator revision")
                        if evaluator_revision:
                            print(evaluator_revision.model_dump(exclude_none=True))
                        continue

                    references = dict(
                        testset=Reference(
                            id=testset_revision.testset_id,
                        ),
                        testset_variant=Reference(
                            id=testset_revision.testset_variant_id,
                        ),
                        testset_revision=Reference(
                            id=testset_revision.id,
                            slug=testset_revision.slug,
                            version=testset_revision.version,
                        ),
                        evaluator=Reference(
                            id=evaluator_revision.evaluator_id,
                        ),
                        evaluator_variant=Reference(
                            id=evaluator_revision.evaluator_variant_id,
                        ),
                        evaluator_revision=Reference(
                            id=evaluator_revision.id,
                            slug=evaluator_revision.slug,
                            version=evaluator_revision.version,
                        ),
                    )
                    links = (
                        dict(
                            invocation=Link(
                                trace_id=application_response.trace_id,
                                span_id=application_response.span_id,
                            )
                        )
                        if application_response.trace_id
                        and application_response.span_id
                        else None
                    )

                    _revision = evaluator_revision.model_dump(
                        mode="json",
                        exclude_none=True,
                    )
                    interface = WorkflowServiceInterface(
                        **(
                            evaluator_revision.data.model_dump()
                            if evaluator_revision.data
                            else {}
                        )
                    )
                    configuration = WorkflowServiceConfiguration(
                        **(
                            evaluator_revision.data.model_dump()
                            if evaluator_revision.data
                            else {}
                        )
                    )
                    parameters = evaluator_revision.data.parameters

                    workflow_service_request_data = WorkflowServiceRequestData(
                        revision=_revision,
                        parameters=parameters,
                        #
                        testcase=_testcase,
                        inputs=inputs,
                        #
                        trace=trace,
                        outputs=outputs,
                    )

                    evaluator_request = EvaluatorServiceRequest(
                        version="2025.07.14",
                        #
                        interface=interface,
                        configuration=configuration,
                        #
                        data=workflow_service_request_data,
                        #
                        references=references,  # type: ignore
                        links=links,  # type: ignore
                    )

                    evaluator_response = await invoke_evaluator(
                        request=evaluator_request,
                        #
                        annotate=True,
                    )

                    if (
                        not evaluator_response
                        or not evaluator_response.data
                        or not evaluator_response.trace_id
                    ):
                        print("Missing or invalid evaluator response")
                        if evaluator_response:
                            print(evaluator_response.model_dump(exclude_none=True))
                        continue

                    trace_id = evaluator_response.trace_id

                    trace = fetch_trace_data(trace_id, max_retries=20, delay=1.0)

                    result = await alog_result(
                        run_id=run.id,
                        scenario_id=scenario.id,
                        step_key="evaluator-" + evaluator_revision.slug,  # type: ignore
                        trace_id=trace_id,
                    )

                    print(
                        f"{UNICODE['pipe']}"
                        f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
                        f"{UNICODE['pipe']}"
                        f"{UNICODE['last' if (i == len(evaluator_revisions) - 1) else 'next']}"
                        f"{UNICODE['here']}"
                        f" result_id={str(result.id)} (annotation)",
                    )

                    results[evaluator_revision.slug] = result

                    trace = await trace

                    if not trace:
                        print("Failed to fetch trace data for evaluator")
                        continue

            metrics = await acompute_metrics(
                run_id=run.id,
                scenario_id=scenario.id,
            )

            print(
                f"{UNICODE['pipe']}"
                f"{UNICODE['pipe' if testcase_idx < len(testcases) - 1 else 'skip']}"
                f"{UNICODE['last']}"
                f"{UNICODE['here']}"
                f"{UNICODE['skip']}"
                f" metrics_id={str(metrics.id)}",
            )

            scenarios.append(
                {
                    "scenario": scenario,
                    "results": results,
                    "metrics": metrics,
                },
            )

        print(
            f"{UNICODE['pipe']}"
            f"{UNICODE['skip']}"
            f"{UNICODE['skip']}"
            f"{UNICODE['skip']}"
            f"{UNICODE['skip']}"
            "-----------------------"
            "--------------------------------------"
        )

    metrics = dict()

    if len(scenarios) > 0:
        metrics = await acompute_metrics(
            run_id=run.id,
        )

    print(
        f"{UNICODE['last']}"
        f"{UNICODE['here']}"
        f"{UNICODE['skip']}"
        f"{UNICODE['skip']}"
        f"{UNICODE['skip']}"
        f" metrics_id={str(metrics.id)}",
    )

    run = await aclose_run(
        run_id=run.id,
    )

    run_url = await aget_url(run_id=run.id)

    print(
        "────────────────────────────────────────────────────────────────────────────"
    )
    print(f"Evaluation finished.")
    print(
        "----------------------------------------------------------------------------"
    )
    print(f"Evaluation URL: {run_url or '[unavailable]'}")
    print(
        "────────────────────────────────────────────────────────────────────────────"
    )
    print()

    return dict(
        run=run,
        scenarios=scenarios,
        metrics=metrics,
    )
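For context, here is a minimal sketch of how the new aevaluate entry point might be called, based only on the signature and parsing logic shown above (inline testcase lists create a testset, and callables are upserted as application/evaluator revisions). The handler names, their signatures, and the testcase fields are hypothetical placeholders, not part of this diff:

import asyncio

from agenta.sdk.evaluations.preview.evaluate import aevaluate


async def capital_app(country: str) -> str:
    # hypothetical application handler; callables passed to aevaluate
    # are upserted as application revisions (see _upsert_entities above)
    return f"The capital of {country} is ..."


async def exact_match(outputs: str) -> float:
    # hypothetical evaluator handler; scores one application invocation
    return 1.0 if "Paris" in outputs else 0.0


async def main():
    report = await aevaluate(
        name="capitals-smoke-test",
        # a list of lists of dicts is turned into a testset via acreate_testset
        testsets=[[{"country": "France"}, {"country": "Germany"}]],
        applications=[capital_app],
        evaluators=[exact_match],
    )
    # aevaluate returns dict(run=..., scenarios=..., metrics=...)
    print(report["metrics"])


asyncio.run(main())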