veadk-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of veadk-python might be problematic.
- veadk/agent.py +3 -13
- veadk/agents/loop_agent.py +55 -0
- veadk/agents/parallel_agent.py +60 -0
- veadk/agents/sequential_agent.py +55 -0
- veadk/cli/cli_deploy.py +11 -0
- veadk/cli/cli_web.py +27 -0
- veadk/evaluation/adk_evaluator/__init__.py +4 -0
- veadk/evaluation/adk_evaluator/adk_evaluator.py +170 -217
- veadk/evaluation/base_evaluator.py +26 -20
- veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +8 -5
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/app.py +37 -7
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/run.sh +2 -6
- veadk/integrations/ve_faas/ve_faas.py +5 -1
- veadk/runner.py +55 -5
- veadk/tracing/base_tracer.py +25 -200
- veadk/tracing/telemetry/{metrics/__init__.py → attributes/attributes.py} +16 -0
- veadk/tracing/telemetry/attributes/extractors/common_attributes_extractors.py +71 -0
- veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +392 -0
- veadk/tracing/telemetry/attributes/extractors/tool_attributes_extractors.py +70 -0
- veadk/tracing/telemetry/attributes/extractors/types.py +75 -0
- veadk/tracing/telemetry/exporters/apmplus_exporter.py +97 -38
- veadk/tracing/telemetry/exporters/base_exporter.py +10 -10
- veadk/tracing/telemetry/exporters/cozeloop_exporter.py +20 -13
- veadk/tracing/telemetry/exporters/inmemory_exporter.py +46 -32
- veadk/tracing/telemetry/exporters/tls_exporter.py +18 -12
- veadk/tracing/telemetry/opentelemetry_tracer.py +102 -102
- veadk/tracing/telemetry/telemetry.py +149 -0
- veadk/types.py +6 -1
- veadk/utils/misc.py +1 -1
- veadk/utils/patches.py +25 -0
- veadk/version.py +1 -1
- veadk_python-0.2.4.dist-info/METADATA +345 -0
- veadk_python-0.2.4.dist-info/RECORD +122 -0
- veadk/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/__pycache__/agent.cpython-310.pyc +0 -0
- veadk/__pycache__/config.cpython-310.pyc +0 -0
- veadk/__pycache__/consts.cpython-310.pyc +0 -0
- veadk/__pycache__/runner.cpython-310.pyc +0 -0
- veadk/__pycache__/types.cpython-310.pyc +0 -0
- veadk/__pycache__/version.cpython-310.pyc +0 -0
- veadk/a2a/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/a2a/__pycache__/agent_card.cpython-310.pyc +0 -0
- veadk/a2a/__pycache__/remote_ve_agent.cpython-310.pyc +0 -0
- veadk/a2a/__pycache__/ve_a2a_server.cpython-310.pyc +0 -0
- veadk/a2a/__pycache__/ve_agent_executor.cpython-310.pyc +0 -0
- veadk/cli/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/cli/__pycache__/cli.cpython-310.pyc +0 -0
- veadk/cli/__pycache__/cli_deploy.cpython-310.pyc +0 -0
- veadk/cli/__pycache__/cli_init.cpython-310.pyc +0 -0
- veadk/cli/__pycache__/cli_prompt.cpython-310.pyc +0 -0
- veadk/cli/__pycache__/cli_studio.cpython-310.pyc +0 -0
- veadk/cli/__pycache__/cli_web.cpython-310.pyc +0 -0
- veadk/cli/__pycache__/main.cpython-310.pyc +0 -0
- veadk/cloud/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/cloud/__pycache__/cloud_agent_engine.cpython-310.pyc +0 -0
- veadk/cloud/__pycache__/cloud_app.cpython-310.pyc +0 -0
- veadk/database/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/database/__pycache__/base_database.cpython-310.pyc +0 -0
- veadk/database/__pycache__/database_adapter.cpython-310.pyc +0 -0
- veadk/database/__pycache__/database_factory.cpython-310.pyc +0 -0
- veadk/database/__pycache__/local_database.cpython-310.pyc +0 -0
- veadk/database/kv/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/database/relational/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/database/vector/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/database/vector/__pycache__/opensearch_vector_database.cpython-310.pyc +0 -0
- veadk/database/vector/__pycache__/type.cpython-310.pyc +0 -0
- veadk/database/viking/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/evaluation/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/evaluation/__pycache__/base_evaluator.cpython-310.pyc +0 -0
- veadk/evaluation/__pycache__/eval_set_file_loader.cpython-310.pyc +0 -0
- veadk/evaluation/__pycache__/eval_set_recorder.cpython-310.pyc +0 -0
- veadk/evaluation/__pycache__/types.cpython-310.pyc +0 -0
- veadk/evaluation/adk_evaluator/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/evaluation/deepeval_evaluator/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/evaluation/deepeval_evaluator/__pycache__/deepeval_evaluator.cpython-310.pyc +0 -0
- veadk/evaluation/utils/__pycache__/prometheus.cpython-310.pyc +0 -0
- veadk/integrations/ve_apig/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/integrations/ve_apig/__pycache__/apig.cpython-310.pyc +0 -0
- veadk/integrations/ve_apig/__pycache__/ve_apig.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/__pycache__/types.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/__pycache__/ve_faas.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/__pycache__/ve_faas_utils.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/__pycache__/vefaas.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/__pycache__/vefaas_utils.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__pycache__/agent.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__pycache__/app.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__pycache__/studio_app.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name|replace('-', '_') }}/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name|replace('-', '_') }}/__pycache__/agent.cpython-310.pyc +0 -0
- veadk/integrations/ve_prompt_pilot/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/integrations/ve_prompt_pilot/__pycache__/agentpilot.cpython-310.pyc +0 -0
- veadk/knowledgebase/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/knowledgebase/__pycache__/knowledgebase.cpython-310.pyc +0 -0
- veadk/knowledgebase/__pycache__/knowledgebase_database_adapter.cpython-310.pyc +0 -0
- veadk/memory/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/memory/__pycache__/long_term_memory.cpython-310.pyc +0 -0
- veadk/memory/__pycache__/memory_database_adapter.cpython-310.pyc +0 -0
- veadk/memory/__pycache__/short_term_memory.cpython-310.pyc +0 -0
- veadk/memory/__pycache__/short_term_memory_processor.cpython-310.pyc +0 -0
- veadk/prompts/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/prompts/__pycache__/agent_default_prompt.cpython-310.pyc +0 -0
- veadk/prompts/__pycache__/prompt_memory_processor.cpython-310.pyc +0 -0
- veadk/prompts/__pycache__/prompt_optimization.cpython-310.pyc +0 -0
- veadk/tools/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/tools/__pycache__/demo_tools.cpython-310.pyc +0 -0
- veadk/tools/__pycache__/load_knowledgebase_tool.cpython-310.pyc +0 -0
- veadk/tools/builtin_tools/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/tools/builtin_tools/__pycache__/lark.cpython-310.pyc +0 -0
- veadk/tools/builtin_tools/__pycache__/vesearch.cpython-310.pyc +0 -0
- veadk/tools/builtin_tools/__pycache__/web_search.cpython-310.pyc +0 -0
- veadk/tools/sandbox/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/tracing/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/tracing/__pycache__/base_tracer.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/__pycache__/opentelemetry_tracer.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/exporters/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/exporters/__pycache__/apiserver_exporter.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/exporters/__pycache__/apmplus_exporter.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/exporters/__pycache__/base_exporter.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/exporters/__pycache__/cozeloop_exporter.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/exporters/__pycache__/inmemory_exporter.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/exporters/__pycache__/tls_exporter.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/metrics/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/metrics/__pycache__/opentelemetry_metrics.cpython-310.pyc +0 -0
- veadk/tracing/telemetry/metrics/opentelemetry_metrics.py +0 -73
- veadk/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- veadk/utils/__pycache__/logger.cpython-310.pyc +0 -0
- veadk/utils/__pycache__/mcp_utils.cpython-310.pyc +0 -0
- veadk/utils/__pycache__/misc.cpython-310.pyc +0 -0
- veadk/utils/__pycache__/patches.cpython-310.pyc +0 -0
- veadk/utils/__pycache__/volcengine_sign.cpython-310.pyc +0 -0
- veadk_python-0.2.2.dist-info/METADATA +0 -144
- veadk_python-0.2.2.dist-info/RECORD +0 -213
- {veadk_python-0.2.2.dist-info → veadk_python-0.2.4.dist-info}/WHEEL +0 -0
- {veadk_python-0.2.2.dist-info → veadk_python-0.2.4.dist-info}/entry_points.txt +0 -0
- {veadk_python-0.2.2.dist-info → veadk_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {veadk_python-0.2.2.dist-info → veadk_python-0.2.4.dist-info}/top_level.txt +0 -0
veadk/evaluation/adk_evaluator/adk_evaluator.py

@@ -16,30 +16,24 @@ import os
 import time
 import uuid
 from os import path
-from typing import Any, Optional
 
-from google.adk import Runner
-from google.adk.agents.base_agent import BaseAgent
-from google.adk.artifacts import BaseArtifactService, InMemoryArtifactService
 from google.adk.evaluation.agent_evaluator import (
-    NUM_RUNS,
     RESPONSE_MATCH_SCORE_KEY,
    TOOL_TRAJECTORY_SCORE_KEY,
     AgentEvaluator,
 )
-from google.adk.evaluation.eval_case import IntermediateData, Invocation
-from google.adk.evaluation.
-from google.adk.evaluation.evaluation_generator import (
-    EvalCaseResponses,
-    EvaluationGenerator,
-)
-from google.adk.evaluation.evaluator import EvalStatus, EvaluationResult
-from google.adk.sessions import BaseSessionService, InMemorySessionService
+from google.adk.evaluation.eval_case import IntermediateData, Invocation
+from google.adk.evaluation.evaluator import EvalStatus
 from typing_extensions import override
+from veadk.evaluation.base_evaluator import BaseEvaluator
+from types import SimpleNamespace
+from google.genai import types as genai_types
 
-from 
-
-
+from google.adk.evaluation.eval_metrics import EvalMetric
+from google.adk.evaluation.metric_evaluator_registry import (
+    DEFAULT_METRIC_EVALUATOR_REGISTRY,
+)
+import inspect
 
 
 def formatted_timestamp():

@@ -47,186 +41,6 @@ def formatted_timestamp():
     return time.strftime("%Y%m%d%H%M%S", time.localtime())
 
 
-class VeEvaluationGenerator(EvaluationGenerator):
-    @staticmethod
-    async def _ve_process_query(  # done
-        invocations: list[Invocation],
-        agent: Agent,
-        agent_name: Optional[str] = None,
-        initial_session: Optional[SessionInput] = None,
-    ):
-        agent_to_evaluate = agent
-        if agent_name:
-            agent_to_evaluate = agent.find_agent(agent_name)
-            assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."
-
-        return await VeEvaluationGenerator._ve_generate_inferences_from_root_agent(
-            invocations, agent_to_evaluate, None, initial_session
-        )
-
-    @staticmethod
-    async def ve_generate_responses(  # done
-        eval_set: EvalSet,
-        agent: Agent,
-        repeat_num: int = 3,
-        agent_name: str | None = None,
-    ):
-        results = []
-
-        for eval_case in eval_set.eval_cases:
-            responses = []
-            for _ in range(repeat_num):
-                response_invocations = await VeEvaluationGenerator._ve_process_query(
-                    invocations=eval_case.conversation,
-                    agent=agent,
-                    agent_name=agent_name,
-                    initial_session=eval_case.session_input,
-                )
-                responses.append(response_invocations)
-
-            results.append(EvalCaseResponses(eval_case=eval_case, responses=responses))
-
-        return results
-
-    @staticmethod
-    async def _ve_generate_inferences_from_root_agent(
-        invocations: list[Invocation],
-        root_agent: BaseAgent,
-        reset_func: Any,
-        initial_session: Optional[SessionInput] = None,
-        session_id: Optional[str] = None,
-        session_service: Optional[BaseSessionService] = None,
-        artifact_service: Optional[BaseArtifactService] = None,
-    ) -> list[Invocation]:
-        """Scrapes the root agent given the list of Invocations."""
-        if not session_service:
-            session_service = InMemorySessionService()
-
-        app_name = (
-            initial_session.app_name if initial_session else "EvaluationGenerator"
-        )
-        user_id = initial_session.user_id if initial_session else "test_user_id"
-        session_id = session_id if session_id else str(uuid.uuid4())
-
-        _ = await session_service.create_session(
-            app_name=app_name,
-            user_id=user_id,
-            state=initial_session.state if initial_session else {},
-            session_id=session_id,
-        )
-
-        if not artifact_service:
-            artifact_service = InMemoryArtifactService()
-
-        runner = Runner(
-            app_name=app_name,
-            agent=root_agent,
-            artifact_service=artifact_service,
-            session_service=session_service,
-            memory_service=root_agent.long_term_memory
-            if isinstance(root_agent, Agent)
-            else None,
-        )
-
-        # Reset agent state for each query
-        if callable(reset_func):
-            reset_func()
-
-        response_invocations = []
-
-        for invocation in invocations:
-            final_response = None
-            user_content = invocation.user_content
-            tool_uses = []
-            invocation_id = ""
-
-            async for event in runner.run_async(
-                user_id=user_id, session_id=session_id, new_message=user_content
-            ):
-                invocation_id = (
-                    event.invocation_id if not invocation_id else invocation_id
-                )
-
-                if event.is_final_response() and event.content and event.content.parts:
-                    final_response = event.content
-                elif event.get_function_calls():
-                    for call in event.get_function_calls():
-                        tool_uses.append(call)
-
-            response_invocations.append(
-                Invocation(
-                    invocation_id=invocation_id,
-                    user_content=user_content,
-                    final_response=final_response,
-                    intermediate_data=IntermediateData(tool_uses=tool_uses),
-                )
-            )
-
-        return response_invocations
-
-
-class VeAgentEvaluator(AgentEvaluator):
-    def __init__(
-        self,
-    ):
-        super().__init__()
-
-    @staticmethod
-    async def ve_evaluate_eval_set(
-        agent: Agent,
-        eval_set: EvalSet,
-        criteria: dict[str, float],
-        num_runs=NUM_RUNS,
-        agent_name=None,
-        print_detailed_results: bool = True,
-    ):
-        eval_case_responses_list = await VeEvaluationGenerator.ve_generate_responses(
-            eval_set=eval_set,
-            agent=agent,
-            repeat_num=num_runs,
-            agent_name=agent_name,
-        )
-        failures = []
-        evaluation_result_list = []
-
-        for eval_case_responses in eval_case_responses_list:
-            actual_invocations = [
-                invocation
-                for invocations in eval_case_responses.responses
-                for invocation in invocations
-            ]
-            expected_invocations = eval_case_responses.eval_case.conversation * num_runs
-
-            for metric_name, threshold in criteria.items():
-                metric_evaluator = AgentEvaluator._get_metric_evaluator(
-                    metric_name=metric_name, threshold=threshold
-                )
-
-                evaluation_result: EvaluationResult = (
-                    metric_evaluator.evaluate_invocations(
-                        actual_invocations=actual_invocations,
-                        expected_invocations=expected_invocations,
-                    )
-                )
-
-                if print_detailed_results:
-                    AgentEvaluator._print_details(
-                        evaluation_result=evaluation_result,
-                        metric_name=metric_name,
-                        threshold=threshold,
-                    )
-
-                # Gather all the failures.
-                if evaluation_result.overall_eval_status != EvalStatus.PASSED:
-                    failures.append(
-                        f"{metric_name} for {agent.name} Failed. Expected {threshold},"
-                        f" but got {evaluation_result.overall_score}."
-                    )
-                evaluation_result_list.append(evaluation_result)
-
-        return evaluation_result_list, failures
-
-
 class ADKEvaluator(BaseEvaluator):
     def __init__(
         self,

@@ -235,10 +49,8 @@ class ADKEvaluator(BaseEvaluator):
     ):
         super().__init__(agent=agent, name=name)
 
-        # TODO: implement
-
     @override
-    async def 
+    async def evaluate(
         self,
        eval_set_file_path: str,
         eval_id: str = f"test_{formatted_timestamp()}",

@@ -247,6 +59,26 @@ class ADKEvaluator(BaseEvaluator):
         num_runs: int = 2,
         print_detailed_results: bool = True,
     ):
+        """
+        End-to-end evaluation flow:
+        1) Discover test files (.test.json) or accept a single path.
+        2) Build metric criteria (metric_name -> threshold).
+        3) For each file, build in-memory eval cases via BaseEvaluator.
+        4) For each eval case, construct expected ADK Invocations from expected data.
+        5) Repeat for num_runs:
+           - Reset all session_ids to isolate state.
+           - Generate actual outputs via BaseEvaluator and convert to ADK Invocations.
+        6) Repeat expected invocations to match num_runs for 1:1 alignment.
+        7) For each metric:
+           - Create EvalMetric and get the evaluator from ADK's registry.
+           - Call evaluate_invocations (await if async) to get EvaluationResult with:
+             overall_score/overall_eval_status + per_invocation_results.
+           - Optionally pretty print via AgentEvaluator._print_details.
+           - Record failure if overall status != PASSED.
+        8) Return (all evaluation_result objects, failures) to the caller.
+        """
+
+        # Resolve eval files: accept a directory (scan *.test.json) or a single file
         test_files = []
         eval_dataset_file_path_or_dir = eval_set_file_path
         if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir(

@@ -259,28 +91,149 @@ class ADKEvaluator(BaseEvaluator):
         else:
             test_files = [eval_dataset_file_path_or_dir]
 
-
+        # Build metric criteria (metric_name -> threshold)
+        criteria = {
+            TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold,  # 1-point scale; 1.0 means perfect tool call trajectory
+            RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold,  # Rouge-1 text match; 0.8 default threshold
+        }
 
+        # Aggregate all evaluation results and failures across files
         result = []
         failures = []
+
+        # Iterate each test file and evaluate per-case, per-metric
         for test_file in test_files:
-
-
-
-
-
-
-            )
+            # Build in-memory evaluation cases via BaseEvaluator from the provided file
+            self.build_eval_set(test_file)
+
+            evaluation_result_list = []
+
+            # For each eval case, generate actual outputs num_runs times using BaseEvaluator
+            for case_idx, eval_case_data in enumerate(self.invocation_list):
+                # Convert BaseEvaluator's expected data into ADK Invocation list
+                expected_invocations: list[Invocation] = []
+                for inv in eval_case_data.invocations:
+                    user_content = genai_types.Content(
+                        role="user",
+                        parts=[genai_types.Part(text=inv.input or "")],
+                    )
+                    expected_final = genai_types.Content(
+                        role=None,
+                        parts=[genai_types.Part(text=inv.expected_output or "")],
+                    )
+                    expected_tool_calls = [
+                        SimpleNamespace(name=t.get("name"), args=t.get("args", {}))
+                        for t in (inv.expected_tool or [])
+                    ]
+                    # Pack a full expected Invocation for ADK metrics
+                    expected_invocations.append(
+                        Invocation(
+                            invocation_id=inv.invocation_id,
+                            user_content=user_content,
+                            final_response=expected_final,
+                            intermediate_data=IntermediateData(
+                                tool_uses=expected_tool_calls
+                            ),
+                        )
+                    )
+
+                # Collect actual invocations across runs
+                actual_invocations_all_runs: list[Invocation] = []
+                for _ in range(num_runs):
+                    for agent_information in self.agent_information_list:
+                        agent_information["session_id"] = str(uuid.uuid4())
+
+                    # Generate actual outputs for all cases in this run via BaseEvaluator
+                    await self.generate_actual_outputs()
+
+                    # Convert BaseEvaluator's actual data into ADK Invocation list
+                    for inv in eval_case_data.invocations:
+                        user_content = genai_types.Content(
+                            role="user",
+                            parts=[genai_types.Part(text=inv.input or "")],
+                        )
+                        actual_final = genai_types.Content(
+                            role=None,
+                            parts=[genai_types.Part(text=inv.actual_output or "")],
+                        )
+                        # Collect the tool calls observed during actual execution
+                        actual_tool_calls = [
+                            SimpleNamespace(name=t.get("name"), args=t.get("args", {}))
+                            for t in (inv.actual_tool or [])
+                        ]
+                        # Pack a full actual Invocation for ADK metrics
+                        actual_invocations_all_runs.append(
+                            Invocation(
+                                invocation_id=inv.invocation_id,
+                                user_content=user_content,
+                                final_response=actual_final,
+                                intermediate_data=IntermediateData(
+                                    tool_uses=actual_tool_calls
+                                ),
+                            )
+                        )
+
+                # Repeat expected invocations to align with num_runs
+                expected_invocations_repeated = expected_invocations * num_runs
+
+                # Evaluate per metric via ADK metric evaluators obtained from the registry
+                for metric_name, threshold in criteria.items():
+                    eval_metric = EvalMetric(
+                        metric_name=metric_name, threshold=threshold
+                    )
+                    metric_evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(
+                        eval_metric=eval_metric
+                    )
 
-
-
-
-
-
-
-
-
-
-
+                    if inspect.iscoroutinefunction(
+                        metric_evaluator.evaluate_invocations
+                    ):
+                        evaluation_result = await metric_evaluator.evaluate_invocations(
+                            actual_invocations=actual_invocations_all_runs,
+                            expected_invocations=expected_invocations_repeated,
+                        )
+                    else:
+                        evaluation_result = metric_evaluator.evaluate_invocations(
+                            actual_invocations=actual_invocations_all_runs,
+                            expected_invocations=expected_invocations_repeated,
+                        )
+
+                    if print_detailed_results:
+                        per_items = []
+                        for i, per in enumerate(
+                            getattr(evaluation_result, "per_invocation_results", [])
+                            or []
+                        ):
+                            per_items.append(
+                                SimpleNamespace(
+                                    actual_invocation=actual_invocations_all_runs[i],
+                                    expected_invocation=expected_invocations_repeated[
+                                        i
+                                    ],
+                                    eval_metric_result=SimpleNamespace(
+                                        eval_status=per.eval_status,
+                                        score=per.score,
+                                        threshold=threshold,
+                                    ),
+                                )
+                            )
+
+                        AgentEvaluator._print_details(
+                            eval_metric_result_with_invocations=per_items,
+                            overall_eval_status=evaluation_result.overall_eval_status,
+                            overall_score=evaluation_result.overall_score,
+                            metric_name=metric_name,
+                            threshold=threshold,
+                        )
+
+                    if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+                        failures.append(
+                            f"{metric_name} for {self.agent.name} Failed. Expected {threshold},"
+                            f" but got {evaluation_result.overall_score}."
+                        )
+
+                    evaluation_result_list.append(evaluation_result)
+
+            result.append(evaluation_result_list)
 
         return result, failures
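For orientation, here is a minimal usage sketch of the reworked `ADKEvaluator.evaluate` entry point, based only on the signature and docstring visible in the hunks above. The agent construction, the import of `Agent` from `veadk.agent`, and the eval-set path are illustrative assumptions, not part of the diff; threshold parameters are left at their defaults.

```python
# Hypothetical usage sketch of the 0.2.4 ADKEvaluator API shown above.
# Agent wiring and the eval-set file path are assumptions for illustration.
import asyncio

from veadk.agent import Agent  # assumed import path for the veadk Agent
from veadk.evaluation.adk_evaluator.adk_evaluator import ADKEvaluator


async def main() -> None:
    agent = Agent(name="demo_agent")  # constructor arguments are assumed
    evaluator = ADKEvaluator(agent=agent, name="adk-eval-demo")

    # `evaluate` accepts a single *.test.json file or a directory of them,
    # runs each case `num_runs` times, and scores tool trajectory plus
    # response match (Rouge-1) against the configured thresholds.
    results, failures = await evaluator.evaluate(
        eval_set_file_path="tests/evalsets/weather.test.json",
        num_runs=2,
        print_detailed_results=True,
    )

    if failures:
        print("Some metrics fell below their thresholds:", failures)


if __name__ == "__main__":
    asyncio.run(main())
```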
veadk/evaluation/base_evaluator.py

@@ -28,7 +28,13 @@ from pydantic import BaseModel
 from veadk.utils.misc import formatted_timestamp
 
 
-class 
+class ToolInvocation(BaseModel):
+    tool_name: str
+    tool_args: dict[str, Any] = {}
+    tool_result: Any = None
+
+
+class Invocation(BaseModel):
     invocation_id: str = ""
     input: str
     actual_output: str

@@ -38,8 +44,8 @@ class InvocationTestData(BaseModel):
     latency: str = ""  # ms
 
 
-class 
-    invocations: list[
+class EvalTestCase(BaseModel):
+    invocations: list[Invocation]
 
 
 class MetricResult(BaseModel):

@@ -78,23 +84,23 @@ class BaseEvaluator:
     ):
         self.name = name
         self.agent = agent
-        self.invocation_list: list[
+        self.invocation_list: list[EvalTestCase] = []
         self.result_list: list[EvalResultData] = []
         self.agent_information_list: list[dict] = []
 
-    def 
-        from .eval_set_file_loader import load_eval_set_from_file
+    def _build_eval_set_from_eval_json(self, eval_json_path: str) -> EvalSet:
+        from veadk.evaluation.eval_set_file_loader import load_eval_set_from_file
 
-        return load_eval_set_from_file(
+        return load_eval_set_from_file(eval_json_path)
 
-    def 
+    def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
         try:
-            with open(
+            with open(tracing_json_path, "r") as f:
                 tracing_data = json.load(f)
         except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON format in file {
+            raise ValueError(f"Invalid JSON format in file {tracing_json_path}: {e}")
         except Exception as e:
-            raise ValueError(f"Error reading file {
+            raise ValueError(f"Error reading file {tracing_json_path}: {e}")
 
         # Group spans by trace_id
         trace_groups = {}

@@ -188,9 +194,9 @@ class BaseEvaluator:
 
         return evalset
 
-    def 
+    def build_eval_set(self, file_path: str):
         """Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
-        eval_case_data_list: list[
+        eval_case_data_list: list[EvalTestCase] = []
 
         try:
             with open(file_path, "r") as f:

@@ -201,7 +207,7 @@ class BaseEvaluator:
             raise ValueError(f"Error reading file {file_path}: {e}")
 
         if isinstance(file_content, dict) and "eval_cases" in file_content:
-            eval_cases = self.
+            eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases
         elif (
             isinstance(file_content, list)
             and len(file_content) > 0

@@ -209,14 +215,14 @@ class BaseEvaluator:
                 isinstance(span, dict) and "trace_id" in span for span in file_content
             )
         ):
-            eval_cases = self.
+            eval_cases = self._build_eval_set_from_tracing_json(file_path).eval_cases
         else:
             raise ValueError(
                 f"Unsupported file format in {file_path}. Please provide a valid file."
             )
 
         for eval_case in eval_cases:
-            eval_case_data = 
+            eval_case_data = EvalTestCase(invocations=[])
             if eval_case.session_input:
                 self.agent_information_list.append(
                     {

@@ -247,7 +253,7 @@ class BaseEvaluator:
             )
 
             eval_case_data.invocations.append(
-
+                Invocation(
                     invocation_id=invocation.invocation_id,
                     input=_input,
                     actual_output="",

@@ -261,7 +267,7 @@ class BaseEvaluator:
             eval_case_data_list.append(eval_case_data)
         self.invocation_list = eval_case_data_list
 
-    async def 
+    async def generate_actual_outputs(self):
        for eval_case_data, agent_information in zip(
             self.invocation_list, self.agent_information_list
         ):

@@ -333,7 +339,7 @@ class BaseEvaluator:
                 invocation.actual_tool = _actual_tool
                 invocation.latency = _latency
 
-    def 
+    def get_eval_set_information(self) -> list[list[dict[str, Any]]]:
         """Merge the evaluation data and return it in the format of list[list[dict]]"""
         result = []
         for i, eval_case in enumerate(self.invocation_list):

@@ -360,7 +366,7 @@ class BaseEvaluator:
         return result
 
     @abstractmethod
-    async def 
+    async def evaluate(
         self,
         eval_set_file_path: str,
         metrics: list[Any],
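A small sketch of how the reworked `base_evaluator` data models fit together, inferred from the hunks above. Only the fields visible in this diff are used; the `Invocation` model may declare additional fields (and validation requirements) that are not shown here, so treat this as an illustration rather than the full schema.

```python
# Sketch of the evaluation data models introduced in base_evaluator.py.
# Field names mirror the diff above; anything not shown there is omitted.
from veadk.evaluation.base_evaluator import EvalTestCase, Invocation, ToolInvocation

# A recorded tool call; values here are purely illustrative.
tool_call = ToolInvocation(
    tool_name="get_weather",
    tool_args={"city": "Beijing"},
    tool_result={"temperature_c": 21},
)

# One eval case groups the invocations of a single conversation.
case = EvalTestCase(
    invocations=[
        Invocation(
            invocation_id="inv-001",
            input="What is the weather in Beijing?",
            expected_output="It is 21 degrees Celsius in Beijing.",
            actual_output="",  # filled in later by generate_actual_outputs()
        )
    ]
)

# BaseEvaluator.build_eval_set() produces a list of such cases and stores it
# on self.invocation_list; generate_actual_outputs() then runs the agent and
# fills in the actual outputs, tool calls, and latency for each invocation.
```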
veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py

@@ -27,8 +27,11 @@ from veadk.config import getenv
 from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
 from veadk.utils.logger import get_logger
 
-from 
-from 
+from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult
+from veadk.evaluation.utils.prometheus import (
+    PrometheusPushgatewayConfig,
+    push_to_prometheus,
+)
 
 logger = get_logger(__name__)
 

@@ -66,7 +69,7 @@ class DeepevalEvaluator(BaseEvaluator):
         self.prometheus_config = prometheus_config
 
     @override
-    async def 
+    async def evaluate(
         self,
         eval_set_file_path: str,
         metrics: list[BaseMetric],

@@ -74,11 +77,11 @@ class DeepevalEvaluator(BaseEvaluator):
     ):
         """Target to Google ADK, we will use the same evaluation case format as Google ADK."""
         # Get evaluation data by parsing eval set file
-        self.
+        self.build_eval_set(eval_set_file_path)
 
         # Get actual data by running agent
         logger.info("Start to run agent for actual data.")
-        await self.
+        await self.generate_actual_outputs()
         eval_case_data_list = self.invocation_list
 
         # Build test cases in Deepeval format
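The DeepevalEvaluator hunks show the same two-phase pattern now exposed by BaseEvaluator: `build_eval_set()` parses the eval set, `generate_actual_outputs()` runs the agent, and the Deepeval metrics are then applied to the collected cases. A hedged invocation sketch follows; the metric choice, the `Agent` construction, and the file path are assumptions for illustration, not part of the diff.

```python
# Hypothetical invocation of the renamed DeepevalEvaluator.evaluate API.
# The metric comes from the public deepeval package; agent wiring is assumed.
import asyncio

from deepeval.metrics import AnswerRelevancyMetric
from veadk.agent import Agent  # assumed import path for the veadk Agent
from veadk.evaluation.deepeval_evaluator.deepeval_evaluator import DeepevalEvaluator


async def main() -> None:
    agent = Agent(name="demo_agent")  # constructor arguments are assumed
    evaluator = DeepevalEvaluator(agent=agent)  # name/prometheus_config left at defaults

    # Internally this calls build_eval_set() to load the cases and
    # generate_actual_outputs() to run the agent before scoring.
    await evaluator.evaluate(
        eval_set_file_path="tests/evalsets/weather.test.json",
        metrics=[AnswerRelevancyMetric(threshold=0.7)],
    )


if __name__ == "__main__":
    asyncio.run(main())
```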
|