veadk-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of veadk-python might be problematic.

Files changed (138)
  1. veadk/agent.py +3 -13
  2. veadk/agents/loop_agent.py +55 -0
  3. veadk/agents/parallel_agent.py +60 -0
  4. veadk/agents/sequential_agent.py +55 -0
  5. veadk/cli/cli_deploy.py +11 -0
  6. veadk/cli/cli_web.py +27 -0
  7. veadk/evaluation/adk_evaluator/__init__.py +4 -0
  8. veadk/evaluation/adk_evaluator/adk_evaluator.py +170 -217
  9. veadk/evaluation/base_evaluator.py +26 -20
  10. veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +8 -5
  11. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/app.py +37 -7
  12. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/run.sh +2 -6
  13. veadk/integrations/ve_faas/ve_faas.py +5 -1
  14. veadk/runner.py +55 -5
  15. veadk/tracing/base_tracer.py +25 -200
  16. veadk/tracing/telemetry/{metrics/__init__.py → attributes/attributes.py} +16 -0
  17. veadk/tracing/telemetry/attributes/extractors/common_attributes_extractors.py +71 -0
  18. veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +392 -0
  19. veadk/tracing/telemetry/attributes/extractors/tool_attributes_extractors.py +70 -0
  20. veadk/tracing/telemetry/attributes/extractors/types.py +75 -0
  21. veadk/tracing/telemetry/exporters/apmplus_exporter.py +97 -38
  22. veadk/tracing/telemetry/exporters/base_exporter.py +10 -10
  23. veadk/tracing/telemetry/exporters/cozeloop_exporter.py +20 -13
  24. veadk/tracing/telemetry/exporters/inmemory_exporter.py +46 -32
  25. veadk/tracing/telemetry/exporters/tls_exporter.py +18 -12
  26. veadk/tracing/telemetry/opentelemetry_tracer.py +102 -102
  27. veadk/tracing/telemetry/telemetry.py +149 -0
  28. veadk/types.py +6 -1
  29. veadk/utils/misc.py +1 -1
  30. veadk/utils/patches.py +25 -0
  31. veadk/version.py +1 -1
  32. veadk_python-0.2.4.dist-info/METADATA +345 -0
  33. veadk_python-0.2.4.dist-info/RECORD +122 -0
  34. veadk/__pycache__/__init__.cpython-310.pyc +0 -0
  35. veadk/__pycache__/agent.cpython-310.pyc +0 -0
  36. veadk/__pycache__/config.cpython-310.pyc +0 -0
  37. veadk/__pycache__/consts.cpython-310.pyc +0 -0
  38. veadk/__pycache__/runner.cpython-310.pyc +0 -0
  39. veadk/__pycache__/types.cpython-310.pyc +0 -0
  40. veadk/__pycache__/version.cpython-310.pyc +0 -0
  41. veadk/a2a/__pycache__/__init__.cpython-310.pyc +0 -0
  42. veadk/a2a/__pycache__/agent_card.cpython-310.pyc +0 -0
  43. veadk/a2a/__pycache__/remote_ve_agent.cpython-310.pyc +0 -0
  44. veadk/a2a/__pycache__/ve_a2a_server.cpython-310.pyc +0 -0
  45. veadk/a2a/__pycache__/ve_agent_executor.cpython-310.pyc +0 -0
  46. veadk/cli/__pycache__/__init__.cpython-310.pyc +0 -0
  47. veadk/cli/__pycache__/cli.cpython-310.pyc +0 -0
  48. veadk/cli/__pycache__/cli_deploy.cpython-310.pyc +0 -0
  49. veadk/cli/__pycache__/cli_init.cpython-310.pyc +0 -0
  50. veadk/cli/__pycache__/cli_prompt.cpython-310.pyc +0 -0
  51. veadk/cli/__pycache__/cli_studio.cpython-310.pyc +0 -0
  52. veadk/cli/__pycache__/cli_web.cpython-310.pyc +0 -0
  53. veadk/cli/__pycache__/main.cpython-310.pyc +0 -0
  54. veadk/cloud/__pycache__/__init__.cpython-310.pyc +0 -0
  55. veadk/cloud/__pycache__/cloud_agent_engine.cpython-310.pyc +0 -0
  56. veadk/cloud/__pycache__/cloud_app.cpython-310.pyc +0 -0
  57. veadk/database/__pycache__/__init__.cpython-310.pyc +0 -0
  58. veadk/database/__pycache__/base_database.cpython-310.pyc +0 -0
  59. veadk/database/__pycache__/database_adapter.cpython-310.pyc +0 -0
  60. veadk/database/__pycache__/database_factory.cpython-310.pyc +0 -0
  61. veadk/database/__pycache__/local_database.cpython-310.pyc +0 -0
  62. veadk/database/kv/__pycache__/__init__.cpython-310.pyc +0 -0
  63. veadk/database/relational/__pycache__/__init__.cpython-310.pyc +0 -0
  64. veadk/database/vector/__pycache__/__init__.cpython-310.pyc +0 -0
  65. veadk/database/vector/__pycache__/opensearch_vector_database.cpython-310.pyc +0 -0
  66. veadk/database/vector/__pycache__/type.cpython-310.pyc +0 -0
  67. veadk/database/viking/__pycache__/__init__.cpython-310.pyc +0 -0
  68. veadk/evaluation/__pycache__/__init__.cpython-310.pyc +0 -0
  69. veadk/evaluation/__pycache__/base_evaluator.cpython-310.pyc +0 -0
  70. veadk/evaluation/__pycache__/eval_set_file_loader.cpython-310.pyc +0 -0
  71. veadk/evaluation/__pycache__/eval_set_recorder.cpython-310.pyc +0 -0
  72. veadk/evaluation/__pycache__/types.cpython-310.pyc +0 -0
  73. veadk/evaluation/adk_evaluator/__pycache__/__init__.cpython-310.pyc +0 -0
  74. veadk/evaluation/deepeval_evaluator/__pycache__/__init__.cpython-310.pyc +0 -0
  75. veadk/evaluation/deepeval_evaluator/__pycache__/deepeval_evaluator.cpython-310.pyc +0 -0
  76. veadk/evaluation/utils/__pycache__/prometheus.cpython-310.pyc +0 -0
  77. veadk/integrations/ve_apig/__pycache__/__init__.cpython-310.pyc +0 -0
  78. veadk/integrations/ve_apig/__pycache__/apig.cpython-310.pyc +0 -0
  79. veadk/integrations/ve_apig/__pycache__/ve_apig.cpython-310.pyc +0 -0
  80. veadk/integrations/ve_faas/__pycache__/__init__.cpython-310.pyc +0 -0
  81. veadk/integrations/ve_faas/__pycache__/types.cpython-310.pyc +0 -0
  82. veadk/integrations/ve_faas/__pycache__/ve_faas.cpython-310.pyc +0 -0
  83. veadk/integrations/ve_faas/__pycache__/ve_faas_utils.cpython-310.pyc +0 -0
  84. veadk/integrations/ve_faas/__pycache__/vefaas.cpython-310.pyc +0 -0
  85. veadk/integrations/ve_faas/__pycache__/vefaas_utils.cpython-310.pyc +0 -0
  86. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__pycache__/agent.cpython-310.pyc +0 -0
  87. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__pycache__/app.cpython-310.pyc +0 -0
  88. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__pycache__/studio_app.cpython-310.pyc +0 -0
  89. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name|replace('-', '_') }}/__pycache__/__init__.cpython-310.pyc +0 -0
  90. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name|replace('-', '_') }}/__pycache__/agent.cpython-310.pyc +0 -0
  91. veadk/integrations/ve_prompt_pilot/__pycache__/__init__.cpython-310.pyc +0 -0
  92. veadk/integrations/ve_prompt_pilot/__pycache__/agentpilot.cpython-310.pyc +0 -0
  93. veadk/knowledgebase/__pycache__/__init__.cpython-310.pyc +0 -0
  94. veadk/knowledgebase/__pycache__/knowledgebase.cpython-310.pyc +0 -0
  95. veadk/knowledgebase/__pycache__/knowledgebase_database_adapter.cpython-310.pyc +0 -0
  96. veadk/memory/__pycache__/__init__.cpython-310.pyc +0 -0
  97. veadk/memory/__pycache__/long_term_memory.cpython-310.pyc +0 -0
  98. veadk/memory/__pycache__/memory_database_adapter.cpython-310.pyc +0 -0
  99. veadk/memory/__pycache__/short_term_memory.cpython-310.pyc +0 -0
  100. veadk/memory/__pycache__/short_term_memory_processor.cpython-310.pyc +0 -0
  101. veadk/prompts/__pycache__/__init__.cpython-310.pyc +0 -0
  102. veadk/prompts/__pycache__/agent_default_prompt.cpython-310.pyc +0 -0
  103. veadk/prompts/__pycache__/prompt_memory_processor.cpython-310.pyc +0 -0
  104. veadk/prompts/__pycache__/prompt_optimization.cpython-310.pyc +0 -0
  105. veadk/tools/__pycache__/__init__.cpython-310.pyc +0 -0
  106. veadk/tools/__pycache__/demo_tools.cpython-310.pyc +0 -0
  107. veadk/tools/__pycache__/load_knowledgebase_tool.cpython-310.pyc +0 -0
  108. veadk/tools/builtin_tools/__pycache__/__init__.cpython-310.pyc +0 -0
  109. veadk/tools/builtin_tools/__pycache__/lark.cpython-310.pyc +0 -0
  110. veadk/tools/builtin_tools/__pycache__/vesearch.cpython-310.pyc +0 -0
  111. veadk/tools/builtin_tools/__pycache__/web_search.cpython-310.pyc +0 -0
  112. veadk/tools/sandbox/__pycache__/__init__.cpython-310.pyc +0 -0
  113. veadk/tracing/__pycache__/__init__.cpython-310.pyc +0 -0
  114. veadk/tracing/__pycache__/base_tracer.cpython-310.pyc +0 -0
  115. veadk/tracing/telemetry/__pycache__/__init__.cpython-310.pyc +0 -0
  116. veadk/tracing/telemetry/__pycache__/opentelemetry_tracer.cpython-310.pyc +0 -0
  117. veadk/tracing/telemetry/exporters/__pycache__/__init__.cpython-310.pyc +0 -0
  118. veadk/tracing/telemetry/exporters/__pycache__/apiserver_exporter.cpython-310.pyc +0 -0
  119. veadk/tracing/telemetry/exporters/__pycache__/apmplus_exporter.cpython-310.pyc +0 -0
  120. veadk/tracing/telemetry/exporters/__pycache__/base_exporter.cpython-310.pyc +0 -0
  121. veadk/tracing/telemetry/exporters/__pycache__/cozeloop_exporter.cpython-310.pyc +0 -0
  122. veadk/tracing/telemetry/exporters/__pycache__/inmemory_exporter.cpython-310.pyc +0 -0
  123. veadk/tracing/telemetry/exporters/__pycache__/tls_exporter.cpython-310.pyc +0 -0
  124. veadk/tracing/telemetry/metrics/__pycache__/__init__.cpython-310.pyc +0 -0
  125. veadk/tracing/telemetry/metrics/__pycache__/opentelemetry_metrics.cpython-310.pyc +0 -0
  126. veadk/tracing/telemetry/metrics/opentelemetry_metrics.py +0 -73
  127. veadk/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  128. veadk/utils/__pycache__/logger.cpython-310.pyc +0 -0
  129. veadk/utils/__pycache__/mcp_utils.cpython-310.pyc +0 -0
  130. veadk/utils/__pycache__/misc.cpython-310.pyc +0 -0
  131. veadk/utils/__pycache__/patches.cpython-310.pyc +0 -0
  132. veadk/utils/__pycache__/volcengine_sign.cpython-310.pyc +0 -0
  133. veadk_python-0.2.2.dist-info/METADATA +0 -144
  134. veadk_python-0.2.2.dist-info/RECORD +0 -213
  135. {veadk_python-0.2.2.dist-info → veadk_python-0.2.4.dist-info}/WHEEL +0 -0
  136. {veadk_python-0.2.2.dist-info → veadk_python-0.2.4.dist-info}/entry_points.txt +0 -0
  137. {veadk_python-0.2.2.dist-info → veadk_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
  138. {veadk_python-0.2.2.dist-info → veadk_python-0.2.4.dist-info}/top_level.txt +0 -0
veadk/evaluation/adk_evaluator/adk_evaluator.py

@@ -16,30 +16,24 @@ import os
 import time
 import uuid
 from os import path
-from typing import Any, Optional

-from google.adk import Runner
-from google.adk.agents.base_agent import BaseAgent
-from google.adk.artifacts import BaseArtifactService, InMemoryArtifactService
 from google.adk.evaluation.agent_evaluator import (
-    NUM_RUNS,
     RESPONSE_MATCH_SCORE_KEY,
     TOOL_TRAJECTORY_SCORE_KEY,
     AgentEvaluator,
 )
-from google.adk.evaluation.eval_case import IntermediateData, Invocation, SessionInput
-from google.adk.evaluation.eval_set import EvalSet
-from google.adk.evaluation.evaluation_generator import (
-    EvalCaseResponses,
-    EvaluationGenerator,
-)
-from google.adk.evaluation.evaluator import EvalStatus, EvaluationResult
-from google.adk.sessions import BaseSessionService, InMemorySessionService
+from google.adk.evaluation.eval_case import IntermediateData, Invocation
+from google.adk.evaluation.evaluator import EvalStatus
 from typing_extensions import override
+from veadk.evaluation.base_evaluator import BaseEvaluator
+from types import SimpleNamespace
+from google.genai import types as genai_types

-from veadk.agent import Agent
-
-from ..base_evaluator import BaseEvaluator
+from google.adk.evaluation.eval_metrics import EvalMetric
+from google.adk.evaluation.metric_evaluator_registry import (
+    DEFAULT_METRIC_EVALUATOR_REGISTRY,
+)
+import inspect


 def formatted_timestamp():
@@ -47,186 +41,6 @@ def formatted_timestamp():
     return time.strftime("%Y%m%d%H%M%S", time.localtime())


-class VeEvaluationGenerator(EvaluationGenerator):
-    @staticmethod
-    async def _ve_process_query(  # done
-        invocations: list[Invocation],
-        agent: Agent,
-        agent_name: Optional[str] = None,
-        initial_session: Optional[SessionInput] = None,
-    ):
-        agent_to_evaluate = agent
-        if agent_name:
-            agent_to_evaluate = agent.find_agent(agent_name)
-            assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."
-
-        return await VeEvaluationGenerator._ve_generate_inferences_from_root_agent(
-            invocations, agent_to_evaluate, None, initial_session
-        )
-
-    @staticmethod
-    async def ve_generate_responses(  # done
-        eval_set: EvalSet,
-        agent: Agent,
-        repeat_num: int = 3,
-        agent_name: str | None = None,
-    ):
-        results = []
-
-        for eval_case in eval_set.eval_cases:
-            responses = []
-            for _ in range(repeat_num):
-                response_invocations = await VeEvaluationGenerator._ve_process_query(
-                    invocations=eval_case.conversation,
-                    agent=agent,
-                    agent_name=agent_name,
-                    initial_session=eval_case.session_input,
-                )
-                responses.append(response_invocations)
-
-            results.append(EvalCaseResponses(eval_case=eval_case, responses=responses))
-
-        return results
-
-    @staticmethod
-    async def _ve_generate_inferences_from_root_agent(
-        invocations: list[Invocation],
-        root_agent: BaseAgent,
-        reset_func: Any,
-        initial_session: Optional[SessionInput] = None,
-        session_id: Optional[str] = None,
-        session_service: Optional[BaseSessionService] = None,
-        artifact_service: Optional[BaseArtifactService] = None,
-    ) -> list[Invocation]:
-        """Scrapes the root agent given the list of Invocations."""
-        if not session_service:
-            session_service = InMemorySessionService()
-
-        app_name = (
-            initial_session.app_name if initial_session else "EvaluationGenerator"
-        )
-        user_id = initial_session.user_id if initial_session else "test_user_id"
-        session_id = session_id if session_id else str(uuid.uuid4())
-
-        _ = await session_service.create_session(
-            app_name=app_name,
-            user_id=user_id,
-            state=initial_session.state if initial_session else {},
-            session_id=session_id,
-        )
-
-        if not artifact_service:
-            artifact_service = InMemoryArtifactService()
-
-        runner = Runner(
-            app_name=app_name,
-            agent=root_agent,
-            artifact_service=artifact_service,
-            session_service=session_service,
-            memory_service=root_agent.long_term_memory
-            if isinstance(root_agent, Agent)
-            else None,
-        )
-
-        # Reset agent state for each query
-        if callable(reset_func):
-            reset_func()
-
-        response_invocations = []
-
-        for invocation in invocations:
-            final_response = None
-            user_content = invocation.user_content
-            tool_uses = []
-            invocation_id = ""
-
-            async for event in runner.run_async(
-                user_id=user_id, session_id=session_id, new_message=user_content
-            ):
-                invocation_id = (
-                    event.invocation_id if not invocation_id else invocation_id
-                )
-
-                if event.is_final_response() and event.content and event.content.parts:
-                    final_response = event.content
-                elif event.get_function_calls():
-                    for call in event.get_function_calls():
-                        tool_uses.append(call)
-
-            response_invocations.append(
-                Invocation(
-                    invocation_id=invocation_id,
-                    user_content=user_content,
-                    final_response=final_response,
-                    intermediate_data=IntermediateData(tool_uses=tool_uses),
-                )
-            )
-
-        return response_invocations
-
-
-class VeAgentEvaluator(AgentEvaluator):
-    def __init__(
-        self,
-    ):
-        super().__init__()
-
-    @staticmethod
-    async def ve_evaluate_eval_set(
-        agent: Agent,
-        eval_set: EvalSet,
-        criteria: dict[str, float],
-        num_runs=NUM_RUNS,
-        agent_name=None,
-        print_detailed_results: bool = True,
-    ):
-        eval_case_responses_list = await VeEvaluationGenerator.ve_generate_responses(
-            eval_set=eval_set,
-            agent=agent,
-            repeat_num=num_runs,
-            agent_name=agent_name,
-        )
-        failures = []
-        evaluation_result_list = []
-
-        for eval_case_responses in eval_case_responses_list:
-            actual_invocations = [
-                invocation
-                for invocations in eval_case_responses.responses
-                for invocation in invocations
-            ]
-            expected_invocations = eval_case_responses.eval_case.conversation * num_runs
-
-            for metric_name, threshold in criteria.items():
-                metric_evaluator = AgentEvaluator._get_metric_evaluator(
-                    metric_name=metric_name, threshold=threshold
-                )
-
-                evaluation_result: EvaluationResult = (
-                    metric_evaluator.evaluate_invocations(
-                        actual_invocations=actual_invocations,
-                        expected_invocations=expected_invocations,
-                    )
-                )
-
-                if print_detailed_results:
-                    AgentEvaluator._print_details(
-                        evaluation_result=evaluation_result,
-                        metric_name=metric_name,
-                        threshold=threshold,
-                    )
-
-                # Gather all the failures.
-                if evaluation_result.overall_eval_status != EvalStatus.PASSED:
-                    failures.append(
-                        f"{metric_name} for {agent.name} Failed. Expected {threshold},"
-                        f" but got {evaluation_result.overall_score}."
-                    )
-                evaluation_result_list.append(evaluation_result)
-
-        return evaluation_result_list, failures
-
-
 class ADKEvaluator(BaseEvaluator):
     def __init__(
         self,
@@ -235,10 +49,8 @@ class ADKEvaluator(BaseEvaluator):
     ):
         super().__init__(agent=agent, name=name)

-        # TODO: implement
-
     @override
-    async def eval(
+    async def evaluate(
         self,
         eval_set_file_path: str,
         eval_id: str = f"test_{formatted_timestamp()}",
@@ -247,6 +59,26 @@
         num_runs: int = 2,
         print_detailed_results: bool = True,
     ):
+        """
+        End-to-end evaluation flow:
+        1) Discover test files (.test.json) or accept a single path.
+        2) Build metric criteria (metric_name -> threshold).
+        3) For each file, build in-memory eval cases via BaseEvaluator.
+        4) For each eval case, construct expected ADK Invocations from expected data.
+        5) Repeat for num_runs:
+            - Reset all session_ids to isolate state.
+            - Generate actual outputs via BaseEvaluator and convert to ADK Invocations.
+        6) Repeat expected invocations to match num_runs for 1:1 alignment.
+        7) For each metric:
+            - Create EvalMetric and get the evaluator from ADK's registry.
+            - Call evaluate_invocations (await if async) to get EvaluationResult with:
+              overall_score/overall_eval_status + per_invocation_results.
+            - Optionally pretty print via AgentEvaluator._print_details.
+            - Record failure if overall status != PASSED.
+        8) Return (all evaluation_result objects, failures) to the caller.
+        """
+
+        # Resolve eval files: accept a directory (scan *.test.json) or a single file
         test_files = []
         eval_dataset_file_path_or_dir = eval_set_file_path
         if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir(
@@ -259,28 +91,149 @@
         else:
             test_files = [eval_dataset_file_path_or_dir]

-        initial_session = AgentEvaluator._get_initial_session()
+        # Build metric criteria (metric_name -> threshold)
+        criteria = {
+            TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold,  # 1-point scale; 1.0 means perfect tool call trajectory
+            RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold,  # Rouge-1 text match; 0.8 default threshold
+        }

+        # Aggregate all evaluation results and failures across files
         result = []
         failures = []
+
+        # Iterate each test file and evaluate per-case, per-metric
         for test_file in test_files:
-            criteria = {
-                TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold,  # 1-point scale; 1.0 is perfect.
-                RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold,  # Rouge-1 text match; 0.8 is default.
-            }
-            eval_set = AgentEvaluator._load_eval_set_from_file(
-                test_file, criteria, initial_session
-            )
+            # Build in-memory evaluation cases via BaseEvaluator from the provided file
+            self.build_eval_set(test_file)
+
+            evaluation_result_list = []
+
+            # For each eval case, generate actual outputs num_runs times using BaseEvaluator
+            for case_idx, eval_case_data in enumerate(self.invocation_list):
+                # Convert BaseEvaluator's expected data into ADK Invocation list
+                expected_invocations: list[Invocation] = []
+                for inv in eval_case_data.invocations:
+                    user_content = genai_types.Content(
+                        role="user",
+                        parts=[genai_types.Part(text=inv.input or "")],
+                    )
+                    expected_final = genai_types.Content(
+                        role=None,
+                        parts=[genai_types.Part(text=inv.expected_output or "")],
+                    )
+                    expected_tool_calls = [
+                        SimpleNamespace(name=t.get("name"), args=t.get("args", {}))
+                        for t in (inv.expected_tool or [])
+                    ]
+                    # Pack a full expected Invocation for ADK metrics
+                    expected_invocations.append(
+                        Invocation(
+                            invocation_id=inv.invocation_id,
+                            user_content=user_content,
+                            final_response=expected_final,
+                            intermediate_data=IntermediateData(
+                                tool_uses=expected_tool_calls
+                            ),
+                        )
+                    )
+
+                # Collect actual invocations across runs
+                actual_invocations_all_runs: list[Invocation] = []
+                for _ in range(num_runs):
+                    for agent_information in self.agent_information_list:
+                        agent_information["session_id"] = str(uuid.uuid4())
+
+                    # Generate actual outputs for all cases in this run via BaseEvaluator
+                    await self.generate_actual_outputs()
+
+                    # Convert BaseEvaluator's actual data into ADK Invocation list
+                    for inv in eval_case_data.invocations:
+                        user_content = genai_types.Content(
+                            role="user",
+                            parts=[genai_types.Part(text=inv.input or "")],
+                        )
+                        actual_final = genai_types.Content(
+                            role=None,
+                            parts=[genai_types.Part(text=inv.actual_output or "")],
+                        )
+                        # Collect the tool calls observed during actual execution
+                        actual_tool_calls = [
+                            SimpleNamespace(name=t.get("name"), args=t.get("args", {}))
+                            for t in (inv.actual_tool or [])
+                        ]
+                        # Pack a full actual Invocation for ADK metrics
+                        actual_invocations_all_runs.append(
+                            Invocation(
+                                invocation_id=inv.invocation_id,
+                                user_content=user_content,
+                                final_response=actual_final,
+                                intermediate_data=IntermediateData(
+                                    tool_uses=actual_tool_calls
+                                ),
+                            )
+                        )
+
+                # Repeat expected invocations to align with num_runs
+                expected_invocations_repeated = expected_invocations * num_runs
+
+                # Evaluate per metric via ADK metric evaluators obtained from the registry
+                for metric_name, threshold in criteria.items():
+                    eval_metric = EvalMetric(
+                        metric_name=metric_name, threshold=threshold
+                    )
+                    metric_evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(
+                        eval_metric=eval_metric
+                    )

-            res, fail = await VeAgentEvaluator.ve_evaluate_eval_set(
-                agent=self.agent,
-                eval_set=eval_set,
-                criteria=criteria,
-                num_runs=num_runs,
-                agent_name=self.agent.name,
-                print_detailed_results=print_detailed_results,
-            )
-            result.append(res)
-            failures.extend(fail)
+                    if inspect.iscoroutinefunction(
+                        metric_evaluator.evaluate_invocations
+                    ):
+                        evaluation_result = await metric_evaluator.evaluate_invocations(
+                            actual_invocations=actual_invocations_all_runs,
+                            expected_invocations=expected_invocations_repeated,
+                        )
+                    else:
+                        evaluation_result = metric_evaluator.evaluate_invocations(
+                            actual_invocations=actual_invocations_all_runs,
+                            expected_invocations=expected_invocations_repeated,
+                        )
+
+                    if print_detailed_results:
+                        per_items = []
+                        for i, per in enumerate(
+                            getattr(evaluation_result, "per_invocation_results", [])
+                            or []
+                        ):
+                            per_items.append(
+                                SimpleNamespace(
+                                    actual_invocation=actual_invocations_all_runs[i],
+                                    expected_invocation=expected_invocations_repeated[
+                                        i
+                                    ],
+                                    eval_metric_result=SimpleNamespace(
+                                        eval_status=per.eval_status,
+                                        score=per.score,
+                                        threshold=threshold,
+                                    ),
+                                )
+                            )
+
+                        AgentEvaluator._print_details(
+                            eval_metric_result_with_invocations=per_items,
+                            overall_eval_status=evaluation_result.overall_eval_status,
+                            overall_score=evaluation_result.overall_score,
+                            metric_name=metric_name,
+                            threshold=threshold,
+                        )
+
+                    if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+                        failures.append(
+                            f"{metric_name} for {self.agent.name} Failed. Expected {threshold},"
+                            f" but got {evaluation_result.overall_score}."
+                        )
+
+                    evaluation_result_list.append(evaluation_result)
+
+            result.append(evaluation_result_list)

         return result, failures
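
The removed VeEvaluationGenerator/VeAgentEvaluator helpers are gone in 0.2.4; evaluation now goes through the single renamed ADKEvaluator.evaluate() entry point shown above. A minimal, hedged usage sketch follows; the Agent construction, the import path, and the eval set location are assumptions for illustration and are not confirmed by the diff itself.

# Hedged usage sketch for ADKEvaluator.evaluate() as renamed in 0.2.4.
# Assumptions: the Agent constructor call, the module path
# veadk.evaluation.adk_evaluator.adk_evaluator, and the evalset directory.
import asyncio

from veadk.agent import Agent
from veadk.evaluation.adk_evaluator.adk_evaluator import ADKEvaluator


async def main():
    agent = Agent(name="demo_agent")  # hypothetical minimal construction
    evaluator = ADKEvaluator(agent=agent, name="adk-eval-demo")

    # eval_set_file_path may be a single *.test.json file or a directory of them
    results, failures = await evaluator.evaluate(
        eval_set_file_path="evalsets/",
        num_runs=2,
        print_detailed_results=True,
    )

    # Each failure is a human-readable "metric ... Failed" message built in the loop above
    for message in failures:
        print(message)


if __name__ == "__main__":
    asyncio.run(main())
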
veadk/evaluation/base_evaluator.py

@@ -28,7 +28,13 @@ from pydantic import BaseModel
 from veadk.utils.misc import formatted_timestamp


-class InvocationTestData(BaseModel):
+class ToolInvocation(BaseModel):
+    tool_name: str
+    tool_args: dict[str, Any] = {}
+    tool_result: Any = None
+
+
+class Invocation(BaseModel):
     invocation_id: str = ""
     input: str
     actual_output: str
@@ -38,8 +44,8 @@ class InvocationTestData(BaseModel):
     latency: str = ""  # ms


-class EvalCaseData(BaseModel):
-    invocations: list[InvocationTestData]
+class EvalTestCase(BaseModel):
+    invocations: list[Invocation]


 class MetricResult(BaseModel):
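
For reference, the renamed evaluation models from the two hunks above can be constructed directly, as in the hedged sketch below. Field names not visible in these hunks (expected_output, expected_tool, actual_tool) are inferred from how ADKEvaluator reads them and should be treated as assumptions.

# Hedged sketch of the renamed Pydantic models (formerly InvocationTestData /
# EvalCaseData). Fields beyond invocation_id/input/actual_output/latency are
# inferred from surrounding hunks and may differ in the released code.
from veadk.evaluation.base_evaluator import EvalTestCase, Invocation, ToolInvocation

tool_call = ToolInvocation(
    tool_name="web_search",
    tool_args={"query": "Beijing weather"},
)

case = EvalTestCase(
    invocations=[
        Invocation(
            invocation_id="inv-001",
            input="What is the weather in Beijing?",
            actual_output="",  # populated later by generate_actual_outputs()
            expected_output="It is sunny in Beijing today.",  # assumed field name
        )
    ]
)
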
@@ -78,23 +84,23 @@ class BaseEvaluator:
     ):
         self.name = name
         self.agent = agent
-        self.invocation_list: list[EvalCaseData] = []
+        self.invocation_list: list[EvalTestCase] = []
         self.result_list: list[EvalResultData] = []
         self.agent_information_list: list[dict] = []

-    def _load_eval_set(self, eval_set_file: str) -> EvalSet:
-        from .eval_set_file_loader import load_eval_set_from_file
+    def _build_eval_set_from_eval_json(self, eval_json_path: str) -> EvalSet:
+        from veadk.evaluation.eval_set_file_loader import load_eval_set_from_file

-        return load_eval_set_from_file(eval_set_file)
+        return load_eval_set_from_file(eval_json_path)

-    def _load_eval_set_from_tracing(self, tracing_file: str) -> EvalSet:
+    def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
         try:
-            with open(tracing_file, "r") as f:
+            with open(tracing_json_path, "r") as f:
                 tracing_data = json.load(f)
         except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON format in file {tracing_file}: {e}")
+            raise ValueError(f"Invalid JSON format in file {tracing_json_path}: {e}")
         except Exception as e:
-            raise ValueError(f"Error reading file {tracing_file}: {e}")
+            raise ValueError(f"Error reading file {tracing_json_path}: {e}")

         # Group spans by trace_id
         trace_groups = {}
@@ -188,9 +194,9 @@

         return evalset

-    def generate_eval_data(self, file_path: str):
+    def build_eval_set(self, file_path: str):
         """Generate evaluation data from a given file and assign it to the class attribute `invocation_list`."""
-        eval_case_data_list: list[EvalCaseData] = []
+        eval_case_data_list: list[EvalTestCase] = []

         try:
             with open(file_path, "r") as f:
@@ -201,7 +207,7 @@
             raise ValueError(f"Error reading file {file_path}: {e}")

         if isinstance(file_content, dict) and "eval_cases" in file_content:
-            eval_cases = self._load_eval_set(file_path).eval_cases
+            eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases
         elif (
             isinstance(file_content, list)
             and len(file_content) > 0
@@ -209,14 +215,14 @@
                 isinstance(span, dict) and "trace_id" in span for span in file_content
             )
         ):
-            eval_cases = self._load_eval_set_from_tracing(file_path).eval_cases
+            eval_cases = self._build_eval_set_from_tracing_json(file_path).eval_cases
         else:
             raise ValueError(
                 f"Unsupported file format in {file_path}. Please provide a valid file."
             )

         for eval_case in eval_cases:
-            eval_case_data = EvalCaseData(invocations=[])
+            eval_case_data = EvalTestCase(invocations=[])
             if eval_case.session_input:
                 self.agent_information_list.append(
                     {
@@ -247,7 +253,7 @@
                 )

                 eval_case_data.invocations.append(
-                    InvocationTestData(
+                    Invocation(
                         invocation_id=invocation.invocation_id,
                         input=_input,
                         actual_output="",
@@ -261,7 +267,7 @@
             eval_case_data_list.append(eval_case_data)
         self.invocation_list = eval_case_data_list

-    async def _run_agent_for_actual_data(self):
+    async def generate_actual_outputs(self):
         for eval_case_data, agent_information in zip(
             self.invocation_list, self.agent_information_list
         ):
@@ -333,7 +339,7 @@
                 invocation.actual_tool = _actual_tool
                 invocation.latency = _latency

-    def get_data(self) -> list[list[dict[str, Any]]]:
+    def get_eval_set_information(self) -> list[list[dict[str, Any]]]:
         """Merge the evaluation data and return it in the format of list[list[dict]]"""
         result = []
         for i, eval_case in enumerate(self.invocation_list):
@@ -360,7 +366,7 @@
         return result

     @abstractmethod
-    async def eval(
+    async def evaluate(
        self,
        eval_set_file_path: str,
        metrics: list[Any],
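
The abstract entry point is now evaluate() rather than eval(), and the helper hooks were renamed in the hunks above (build_eval_set, generate_actual_outputs, get_eval_set_information). A hedged sketch of a custom subclass wired to the renamed hooks follows; the constructor usage and the **kwargs tail of the signature are assumptions, only the renamed method names come from the diff.

# Hedged sketch of a custom evaluator built on the renamed BaseEvaluator hooks.
from typing import Any

from typing_extensions import override

from veadk.evaluation.base_evaluator import BaseEvaluator


class PrintingEvaluator(BaseEvaluator):
    """Parses an eval set, runs the agent once, and dumps the merged data."""

    @override
    async def evaluate(self, eval_set_file_path: str, metrics: list[Any], **kwargs):
        # Parse the eval set file (ADK eval JSON or tracing JSON) into invocation_list
        self.build_eval_set(eval_set_file_path)

        # Run the agent to fill actual_output / actual_tool / latency on each invocation
        await self.generate_actual_outputs()

        # get_eval_set_information() merges expected and actual data per case
        for case in self.get_eval_set_information():
            print(case)
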
veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py

@@ -27,8 +27,11 @@ from veadk.config import getenv
 from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
 from veadk.utils.logger import get_logger

-from ..base_evaluator import BaseEvaluator, EvalResultData, MetricResult
-from ..utils.prometheus import PrometheusPushgatewayConfig, push_to_prometheus
+from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult
+from veadk.evaluation.utils.prometheus import (
+    PrometheusPushgatewayConfig,
+    push_to_prometheus,
+)

 logger = get_logger(__name__)

@@ -66,7 +69,7 @@ class DeepevalEvaluator(BaseEvaluator):
         self.prometheus_config = prometheus_config

     @override
-    async def eval(
+    async def evaluate(
         self,
         eval_set_file_path: str,
         metrics: list[BaseMetric],
@@ -74,11 +77,11 @@
     ):
         """Target to Google ADK, we will use the same evaluation case format as Google ADK."""
         # Get evaluation data by parsing eval set file
-        self.generate_eval_data(eval_set_file_path)
+        self.build_eval_set(eval_set_file_path)

         # Get actual data by running agent
         logger.info("Start to run agent for actual data.")
-        await self._run_agent_for_actual_data()
+        await self.generate_actual_outputs()
         eval_case_data_list = self.invocation_list

         # Build test cases in Deepeval format
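
DeepevalEvaluator keeps its deepeval-metric interface but picks up the renamed methods shown above. A hedged usage sketch follows; the Agent construction, the module path, the omission of prometheus_config, and the choice of AnswerRelevancyMetric are illustrative assumptions, while the evaluate() signature with a list of deepeval BaseMetric instances comes from the diff.

# Hedged usage sketch for DeepevalEvaluator.evaluate() after the rename.
import asyncio

from deepeval.metrics import AnswerRelevancyMetric

from veadk.agent import Agent
from veadk.evaluation.deepeval_evaluator.deepeval_evaluator import DeepevalEvaluator


async def main():
    agent = Agent(name="demo_agent")  # hypothetical minimal construction
    evaluator = DeepevalEvaluator(agent=agent)  # prometheus_config assumed optional

    # The eval set file uses the same evaluation case format as Google ADK
    await evaluator.evaluate(
        eval_set_file_path="evalsets/weather.test.json",
        metrics=[AnswerRelevancyMetric(threshold=0.7)],
    )


if __name__ == "__main__":
    asyncio.run(main())
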