veadk-python 0.2.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. veadk/__init__.py +37 -0
  2. veadk/a2a/__init__.py +13 -0
  3. veadk/a2a/agent_card.py +45 -0
  4. veadk/a2a/remote_ve_agent.py +390 -0
  5. veadk/a2a/utils/__init__.py +13 -0
  6. veadk/a2a/utils/agent_to_a2a.py +170 -0
  7. veadk/a2a/ve_a2a_server.py +93 -0
  8. veadk/a2a/ve_agent_executor.py +78 -0
  9. veadk/a2a/ve_middlewares.py +313 -0
  10. veadk/a2a/ve_task_store.py +37 -0
  11. veadk/agent.py +402 -0
  12. veadk/agent_builder.py +93 -0
  13. veadk/agents/loop_agent.py +68 -0
  14. veadk/agents/parallel_agent.py +72 -0
  15. veadk/agents/sequential_agent.py +64 -0
  16. veadk/auth/__init__.py +13 -0
  17. veadk/auth/base_auth.py +22 -0
  18. veadk/auth/ve_credential_service.py +203 -0
  19. veadk/auth/veauth/__init__.py +13 -0
  20. veadk/auth/veauth/apmplus_veauth.py +58 -0
  21. veadk/auth/veauth/ark_veauth.py +75 -0
  22. veadk/auth/veauth/base_veauth.py +50 -0
  23. veadk/auth/veauth/cozeloop_veauth.py +13 -0
  24. veadk/auth/veauth/opensearch_veauth.py +75 -0
  25. veadk/auth/veauth/postgresql_veauth.py +75 -0
  26. veadk/auth/veauth/prompt_pilot_veauth.py +60 -0
  27. veadk/auth/veauth/speech_veauth.py +54 -0
  28. veadk/auth/veauth/utils.py +69 -0
  29. veadk/auth/veauth/vesearch_veauth.py +62 -0
  30. veadk/auth/veauth/viking_mem0_veauth.py +91 -0
  31. veadk/cli/__init__.py +13 -0
  32. veadk/cli/cli.py +58 -0
  33. veadk/cli/cli_clean.py +87 -0
  34. veadk/cli/cli_create.py +163 -0
  35. veadk/cli/cli_deploy.py +233 -0
  36. veadk/cli/cli_eval.py +215 -0
  37. veadk/cli/cli_init.py +214 -0
  38. veadk/cli/cli_kb.py +110 -0
  39. veadk/cli/cli_pipeline.py +285 -0
  40. veadk/cli/cli_prompt.py +86 -0
  41. veadk/cli/cli_update.py +106 -0
  42. veadk/cli/cli_uploadevalset.py +139 -0
  43. veadk/cli/cli_web.py +143 -0
  44. veadk/cloud/__init__.py +13 -0
  45. veadk/cloud/cloud_agent_engine.py +485 -0
  46. veadk/cloud/cloud_app.py +475 -0
  47. veadk/config.py +115 -0
  48. veadk/configs/__init__.py +13 -0
  49. veadk/configs/auth_configs.py +133 -0
  50. veadk/configs/database_configs.py +132 -0
  51. veadk/configs/model_configs.py +78 -0
  52. veadk/configs/tool_configs.py +54 -0
  53. veadk/configs/tracing_configs.py +110 -0
  54. veadk/consts.py +74 -0
  55. veadk/evaluation/__init__.py +17 -0
  56. veadk/evaluation/adk_evaluator/__init__.py +17 -0
  57. veadk/evaluation/adk_evaluator/adk_evaluator.py +302 -0
  58. veadk/evaluation/base_evaluator.py +642 -0
  59. veadk/evaluation/deepeval_evaluator/__init__.py +17 -0
  60. veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +339 -0
  61. veadk/evaluation/eval_set_file_loader.py +48 -0
  62. veadk/evaluation/eval_set_recorder.py +146 -0
  63. veadk/evaluation/types.py +65 -0
  64. veadk/evaluation/utils/prometheus.py +196 -0
  65. veadk/integrations/__init__.py +13 -0
  66. veadk/integrations/ve_apig/__init__.py +13 -0
  67. veadk/integrations/ve_apig/ve_apig.py +349 -0
  68. veadk/integrations/ve_apig/ve_apig_utils.py +332 -0
  69. veadk/integrations/ve_code_pipeline/__init__.py +13 -0
  70. veadk/integrations/ve_code_pipeline/ve_code_pipeline.py +431 -0
  71. veadk/integrations/ve_cozeloop/__init__.py +13 -0
  72. veadk/integrations/ve_cozeloop/ve_cozeloop.py +96 -0
  73. veadk/integrations/ve_cr/__init__.py +13 -0
  74. veadk/integrations/ve_cr/ve_cr.py +220 -0
  75. veadk/integrations/ve_faas/__init__.py +13 -0
  76. veadk/integrations/ve_faas/template/cookiecutter.json +15 -0
  77. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
  78. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
  79. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/config.yaml.example +6 -0
  80. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/deploy.py +106 -0
  81. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__init__.py +13 -0
  82. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/agent.py +25 -0
  83. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/app.py +202 -0
  84. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/requirements.txt +3 -0
  85. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/run.sh +49 -0
  86. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/__init__.py +14 -0
  87. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/agent.py +27 -0
  88. veadk/integrations/ve_faas/ve_faas.py +754 -0
  89. veadk/integrations/ve_faas/ve_faas_utils.py +408 -0
  90. veadk/integrations/ve_faas/web_template/cookiecutter.json +20 -0
  91. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
  92. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
  93. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/config.yaml.example +2 -0
  94. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/deploy.py +44 -0
  95. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/Dockerfile +23 -0
  96. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/app.py +123 -0
  97. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/init_db.py +46 -0
  98. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/models.py +36 -0
  99. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/requirements.txt +4 -0
  100. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/run.sh +21 -0
  101. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/css/style.css +368 -0
  102. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/js/admin.js +0 -0
  103. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/dashboard.html +21 -0
  104. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/edit_post.html +24 -0
  105. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/login.html +21 -0
  106. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/posts.html +53 -0
  107. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/base.html +45 -0
  108. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/index.html +29 -0
  109. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/post.html +14 -0
  110. veadk/integrations/ve_identity/__init__.py +110 -0
  111. veadk/integrations/ve_identity/auth_config.py +261 -0
  112. veadk/integrations/ve_identity/auth_mixins.py +650 -0
  113. veadk/integrations/ve_identity/auth_processor.py +385 -0
  114. veadk/integrations/ve_identity/function_tool.py +158 -0
  115. veadk/integrations/ve_identity/identity_client.py +864 -0
  116. veadk/integrations/ve_identity/mcp_tool.py +181 -0
  117. veadk/integrations/ve_identity/mcp_toolset.py +431 -0
  118. veadk/integrations/ve_identity/models.py +228 -0
  119. veadk/integrations/ve_identity/token_manager.py +188 -0
  120. veadk/integrations/ve_identity/utils.py +151 -0
  121. veadk/integrations/ve_prompt_pilot/__init__.py +13 -0
  122. veadk/integrations/ve_prompt_pilot/ve_prompt_pilot.py +85 -0
  123. veadk/integrations/ve_tls/__init__.py +13 -0
  124. veadk/integrations/ve_tls/utils.py +116 -0
  125. veadk/integrations/ve_tls/ve_tls.py +212 -0
  126. veadk/integrations/ve_tos/ve_tos.py +710 -0
  127. veadk/integrations/ve_viking_db_memory/__init__.py +13 -0
  128. veadk/integrations/ve_viking_db_memory/ve_viking_db_memory.py +308 -0
  129. veadk/knowledgebase/__init__.py +17 -0
  130. veadk/knowledgebase/backends/__init__.py +13 -0
  131. veadk/knowledgebase/backends/base_backend.py +72 -0
  132. veadk/knowledgebase/backends/in_memory_backend.py +91 -0
  133. veadk/knowledgebase/backends/opensearch_backend.py +162 -0
  134. veadk/knowledgebase/backends/redis_backend.py +172 -0
  135. veadk/knowledgebase/backends/utils.py +92 -0
  136. veadk/knowledgebase/backends/vikingdb_knowledge_backend.py +608 -0
  137. veadk/knowledgebase/entry.py +25 -0
  138. veadk/knowledgebase/knowledgebase.py +307 -0
  139. veadk/memory/__init__.py +35 -0
  140. veadk/memory/long_term_memory.py +365 -0
  141. veadk/memory/long_term_memory_backends/__init__.py +13 -0
  142. veadk/memory/long_term_memory_backends/base_backend.py +35 -0
  143. veadk/memory/long_term_memory_backends/in_memory_backend.py +67 -0
  144. veadk/memory/long_term_memory_backends/mem0_backend.py +155 -0
  145. veadk/memory/long_term_memory_backends/opensearch_backend.py +124 -0
  146. veadk/memory/long_term_memory_backends/redis_backend.py +140 -0
  147. veadk/memory/long_term_memory_backends/vikingdb_memory_backend.py +189 -0
  148. veadk/memory/short_term_memory.py +252 -0
  149. veadk/memory/short_term_memory_backends/__init__.py +13 -0
  150. veadk/memory/short_term_memory_backends/base_backend.py +31 -0
  151. veadk/memory/short_term_memory_backends/mysql_backend.py +49 -0
  152. veadk/memory/short_term_memory_backends/postgresql_backend.py +49 -0
  153. veadk/memory/short_term_memory_backends/sqlite_backend.py +55 -0
  154. veadk/memory/short_term_memory_processor.py +100 -0
  155. veadk/processors/__init__.py +26 -0
  156. veadk/processors/base_run_processor.py +120 -0
  157. veadk/prompts/__init__.py +13 -0
  158. veadk/prompts/agent_default_prompt.py +30 -0
  159. veadk/prompts/prompt_evaluator.py +20 -0
  160. veadk/prompts/prompt_memory_processor.py +55 -0
  161. veadk/prompts/prompt_optimization.py +150 -0
  162. veadk/runner.py +732 -0
  163. veadk/tools/__init__.py +13 -0
  164. veadk/tools/builtin_tools/__init__.py +13 -0
  165. veadk/tools/builtin_tools/agent_authorization.py +94 -0
  166. veadk/tools/builtin_tools/generate_image.py +23 -0
  167. veadk/tools/builtin_tools/image_edit.py +300 -0
  168. veadk/tools/builtin_tools/image_generate.py +446 -0
  169. veadk/tools/builtin_tools/lark.py +67 -0
  170. veadk/tools/builtin_tools/las.py +24 -0
  171. veadk/tools/builtin_tools/link_reader.py +66 -0
  172. veadk/tools/builtin_tools/llm_shield.py +381 -0
  173. veadk/tools/builtin_tools/load_knowledgebase.py +97 -0
  174. veadk/tools/builtin_tools/mcp_router.py +29 -0
  175. veadk/tools/builtin_tools/run_code.py +113 -0
  176. veadk/tools/builtin_tools/tts.py +253 -0
  177. veadk/tools/builtin_tools/vesearch.py +49 -0
  178. veadk/tools/builtin_tools/video_generate.py +363 -0
  179. veadk/tools/builtin_tools/web_scraper.py +76 -0
  180. veadk/tools/builtin_tools/web_search.py +83 -0
  181. veadk/tools/demo_tools.py +58 -0
  182. veadk/tools/load_knowledgebase_tool.py +149 -0
  183. veadk/tools/sandbox/__init__.py +13 -0
  184. veadk/tools/sandbox/browser_sandbox.py +37 -0
  185. veadk/tools/sandbox/code_sandbox.py +40 -0
  186. veadk/tools/sandbox/computer_sandbox.py +34 -0
  187. veadk/tracing/__init__.py +13 -0
  188. veadk/tracing/base_tracer.py +58 -0
  189. veadk/tracing/telemetry/__init__.py +13 -0
  190. veadk/tracing/telemetry/attributes/attributes.py +29 -0
  191. veadk/tracing/telemetry/attributes/extractors/common_attributes_extractors.py +180 -0
  192. veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +858 -0
  193. veadk/tracing/telemetry/attributes/extractors/tool_attributes_extractors.py +152 -0
  194. veadk/tracing/telemetry/attributes/extractors/types.py +164 -0
  195. veadk/tracing/telemetry/exporters/__init__.py +13 -0
  196. veadk/tracing/telemetry/exporters/apmplus_exporter.py +558 -0
  197. veadk/tracing/telemetry/exporters/base_exporter.py +39 -0
  198. veadk/tracing/telemetry/exporters/cozeloop_exporter.py +129 -0
  199. veadk/tracing/telemetry/exporters/inmemory_exporter.py +248 -0
  200. veadk/tracing/telemetry/exporters/tls_exporter.py +139 -0
  201. veadk/tracing/telemetry/opentelemetry_tracer.py +320 -0
  202. veadk/tracing/telemetry/telemetry.py +411 -0
  203. veadk/types.py +47 -0
  204. veadk/utils/__init__.py +13 -0
  205. veadk/utils/audio_manager.py +95 -0
  206. veadk/utils/auth.py +294 -0
  207. veadk/utils/logger.py +59 -0
  208. veadk/utils/mcp_utils.py +44 -0
  209. veadk/utils/misc.py +184 -0
  210. veadk/utils/patches.py +101 -0
  211. veadk/utils/volcengine_sign.py +205 -0
  212. veadk/version.py +15 -0
  213. veadk_python-0.2.27.dist-info/METADATA +373 -0
  214. veadk_python-0.2.27.dist-info/RECORD +218 -0
  215. veadk_python-0.2.27.dist-info/WHEEL +5 -0
  216. veadk_python-0.2.27.dist-info/entry_points.txt +2 -0
  217. veadk_python-0.2.27.dist-info/licenses/LICENSE +201 -0
  218. veadk_python-0.2.27.dist-info/top_level.txt +1 -0
@@ -0,0 +1,302 @@
1
+ # Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import time
17
+ import uuid
18
+ from os import path
19
+
20
+ from google.adk.evaluation.agent_evaluator import (
21
+ RESPONSE_MATCH_SCORE_KEY,
22
+ TOOL_TRAJECTORY_SCORE_KEY,
23
+ AgentEvaluator,
24
+ )
25
+ from google.adk.evaluation.eval_case import IntermediateData, Invocation
26
+ from google.adk.evaluation.evaluator import EvalStatus
27
+ from google.adk.evaluation.eval_set import EvalSet
28
+ from typing import Optional
29
+ from typing_extensions import override
30
+ from veadk.evaluation.base_evaluator import BaseEvaluator
31
+ from types import SimpleNamespace
32
+ from google.genai import types as genai_types
33
+
34
+ from google.adk.evaluation.eval_metrics import EvalMetric
35
+ from google.adk.evaluation.metric_evaluator_registry import (
36
+ DEFAULT_METRIC_EVALUATOR_REGISTRY,
37
+ )
38
+ import inspect
39
+
40
+
41
def formatted_timestamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string.

    The value is built from ``time.localtime()``, so it reflects the
    local clock of the process rather than UTC.

    Returns:
        str: A 14-digit timestamp such as '20251028123045'.
    """
    now = time.localtime()
    # Year, month, day, hour, minute, second with no separators.
    return time.strftime("%Y%m%d%H%M%S", now)
52
+
53
+
54
class ADKEvaluator(BaseEvaluator):
    """Evaluates agents using Google ADK metrics.

    This class uses Google's Agent Development Kit (ADK) metric evaluators to
    score an agent on two criteria: the tool-call trajectory and the match
    between the final response and the expected response.

    Attributes:
        name (str): Name of this evaluator. Defaults to 'veadk_adk_evaluator'.

    Note:
        Accepts a single eval file or a folder that is scanned recursively
        for ``*.test.json`` files.
        Default thresholds: tool=1.0, response=0.8.
        The agent is run ``num_runs`` times (default 2) per eval file.

    Examples:
        ```python
        agent = Agent(tools=[get_city_weather])
        evaluator = ADKEvaluator(agent=agent)
        results, failures = await evaluator.evaluate(eval_set_file_path="test_folder")
        ```
    """

    def __init__(
        self,
        agent,
        name: str = "veadk_adk_evaluator",
    ):
        """Initializes the ADK evaluator with agent and name.

        Both arguments are forwarded unchanged to ``BaseEvaluator.__init__``.

        Args:
            agent: The agent to evaluate.
            name (str): Name of the evaluator. Defaults to 'veadk_adk_evaluator'.
        """
        super().__init__(agent=agent, name=name)

    @staticmethod
    def _adk_resolve_test_files(eval_set_file_path: Optional[str]) -> list:
        """Expand a path into the list of eval files to process.

        A directory is walked recursively and every ``*.test.json`` file is
        returned. Anything else (a file path, or ``None`` when an in-memory
        eval set is used) is returned as a single-element list so the caller
        always iterates at least once.
        """
        if isinstance(eval_set_file_path, str) and os.path.isdir(eval_set_file_path):
            discovered = []
            for root, _, file_names in os.walk(eval_set_file_path):
                for file_name in file_names:
                    if file_name.endswith(".test.json"):
                        discovered.append(path.join(root, file_name))
            return discovered
        return [eval_set_file_path]

    @staticmethod
    def _adk_to_invocations(eval_case_data, use_actual: bool) -> "list[Invocation]":
        """Convert one eval case into a list of ADK ``Invocation`` objects.

        Args:
            eval_case_data: An eval case exposing an ``invocations`` iterable.
            use_actual: When True, read ``actual_output`` / ``actual_tool``
                from each invocation; otherwise read ``expected_output`` /
                ``expected_tool``.
        """
        converted = []
        for inv in eval_case_data.invocations:
            if use_actual:
                output_text, tool_records = inv.actual_output, inv.actual_tool
            else:
                output_text, tool_records = inv.expected_output, inv.expected_tool

            # Tool records are dict-like; expose them as objects with
            # ``name`` / ``args`` attributes for the ADK metric evaluators.
            tool_uses = [
                SimpleNamespace(name=t.get("name"), args=t.get("args", {}))
                for t in (tool_records or [])
            ]
            converted.append(
                Invocation(
                    invocation_id=inv.invocation_id,
                    user_content=genai_types.Content(
                        role="user",
                        parts=[genai_types.Part(text=inv.input or "")],
                    ),
                    final_response=genai_types.Content(
                        role=None,
                        parts=[genai_types.Part(text=output_text or "")],
                    ),
                    intermediate_data=IntermediateData(tool_uses=tool_uses),
                )
            )
        return converted

    @staticmethod
    def _adk_print_details(
        evaluation_result,
        actual_invocations,
        expected_invocations,
        metric_name,
        threshold,
    ):
        """Print per-invocation scores for one metric via ADK's printer."""
        per_items = []
        for i, per in enumerate(
            getattr(evaluation_result, "per_invocation_results", []) or []
        ):
            per_items.append(
                SimpleNamespace(
                    actual_invocation=actual_invocations[i],
                    expected_invocation=expected_invocations[i],
                    eval_metric_result=SimpleNamespace(
                        eval_status=per.eval_status,
                        score=per.score,
                        threshold=threshold,
                    ),
                )
            )

        # NOTE: relies on a private ADK helper; it may change between ADK releases.
        AgentEvaluator._print_details(
            eval_metric_result_with_invocations=per_items,
            overall_eval_status=evaluation_result.overall_eval_status,
            overall_score=evaluation_result.overall_score,
            metric_name=metric_name,
            threshold=threshold,
        )

    @staticmethod
    async def _adk_score_metric(
        metric_name, threshold, actual_invocations, expected_invocations
    ):
        """Run one ADK metric evaluator over aligned actual/expected lists."""
        eval_metric = EvalMetric(metric_name=metric_name, threshold=threshold)
        metric_evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(
            eval_metric=eval_metric
        )

        # Metric evaluators may be sync or async; support both.
        if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
            return await metric_evaluator.evaluate_invocations(
                actual_invocations=actual_invocations,
                expected_invocations=expected_invocations,
            )
        return metric_evaluator.evaluate_invocations(
            actual_invocations=actual_invocations,
            expected_invocations=expected_invocations,
        )

    @override
    async def evaluate(
        self,
        eval_set: Optional[EvalSet] = None,
        eval_set_file_path: Optional[str] = None,
        eval_id: str = f"test_{formatted_timestamp()}",
        tool_score_threshold: float = 1.0,
        response_match_score_threshold: float = 0.8,
        num_runs: int = 2,
        print_detailed_results: bool = True,
    ):
        """Tests agent using ADK metrics on test cases.

        This method does these steps for every resolved eval file:
            1. Builds the in-memory eval cases via ``build_eval_set``
            2. Converts the expected data of each case to ADK invocations
            3. Runs the agent ``num_runs`` times; each run generates outputs
               for all cases once, under fresh session ids
            4. Converts the actual data of each case to ADK invocations
            5. Scores every case on tool trajectory and response match
            6. Collects results and failure messages

        Args:
            eval_set: Test cases in memory. Passed to ``build_eval_set``
                together with each resolved file path.
            eval_set_file_path: Path to a test file, or a folder scanned
                recursively for ``*.test.json`` files.
            eval_id: Name for this test run. Not read by this method. Its
                default is computed once, when the class body is executed,
                so it is not unique per call.
            tool_score_threshold: Minimum score for tool usage. 1.0 means perfect.
            response_match_score_threshold: Minimum score for response match.
                0.8 is default.
            num_runs: How many times to run each test.
            print_detailed_results: If True, shows detailed scores for each test.

        Returns:
            tuple[list, list]: Two lists:
                - One inner list per eval file, holding one ADK evaluation
                  result per (case, metric) pair in case order
                - Failure messages for every metric whose overall status is
                  not PASSED

        Raises:
            Nothing is raised or caught here; exceptions from
            ``build_eval_set``, ``generate_actual_outputs`` or the ADK metric
            evaluators propagate to the caller.

        Examples:
            ```python
            results, failures = await evaluator.evaluate(
                eval_set_file_path="tests/",
                tool_score_threshold=0.9,
                num_runs=3)
            print(f"Results: {len(results)}, Failures: {len(failures)}")
            ```
        """
        # Resolve eval files: accept a directory (scan *.test.json) or a single file.
        # NOTE(review): when eval_set is given together with a directory, the same
        # eval_set is handed to build_eval_set once per discovered file - confirm
        # this is intended.
        test_files = self._adk_resolve_test_files(eval_set_file_path)

        # Build metric criteria (metric_name -> threshold)
        criteria = {
            TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold,  # 1-point scale; 1.0 means perfect tool call trajectory
            RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold,  # Rouge-1 text match; 0.8 default threshold
        }

        # Aggregate all evaluation results and failures across files
        result = []
        failures = []

        for test_file in test_files:
            # Build in-memory evaluation cases via BaseEvaluator from the provided file
            self.build_eval_set(eval_set, test_file)

            evaluation_result_list = []
            eval_cases = list(self.invocation_list)

            if eval_cases:
                # Expected invocations, repeated so they align index-by-index
                # with the actual invocations collected over num_runs runs.
                expected_per_case = [
                    self._adk_to_invocations(case, use_actual=False) * num_runs
                    for case in eval_cases
                ]
                actual_per_case = [[] for _ in eval_cases]

                for _ in range(num_runs):
                    # Fresh session per run so runs do not share conversation state.
                    for agent_information in self.agent_information_list:
                        agent_information["session_id"] = str(uuid.uuid4())

                    # One call generates actual outputs for ALL cases, so it is
                    # issued once per run rather than once per case per run.
                    # Assumes the case objects are updated in place - verify
                    # against BaseEvaluator.generate_actual_outputs.
                    await self.generate_actual_outputs()

                    for collected, case in zip(actual_per_case, eval_cases):
                        collected.extend(
                            self._adk_to_invocations(case, use_actual=True)
                        )

                # Evaluate per case, per metric
                for actual_invocations, expected_invocations in zip(
                    actual_per_case, expected_per_case
                ):
                    for metric_name, threshold in criteria.items():
                        evaluation_result = await self._adk_score_metric(
                            metric_name,
                            threshold,
                            actual_invocations,
                            expected_invocations,
                        )

                        if print_detailed_results:
                            self._adk_print_details(
                                evaluation_result,
                                actual_invocations,
                                expected_invocations,
                                metric_name,
                                threshold,
                            )

                        if evaluation_result.overall_eval_status != EvalStatus.PASSED:
                            failures.append(
                                f"{metric_name} for {self.agent.name} Failed. Expected {threshold},"
                                f" but got {evaluation_result.overall_score}."
                            )

                        evaluation_result_list.append(evaluation_result)

            result.append(evaluation_result_list)

        return result, failures