veadk-python 0.2.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. veadk/__init__.py +37 -0
  2. veadk/a2a/__init__.py +13 -0
  3. veadk/a2a/agent_card.py +45 -0
  4. veadk/a2a/remote_ve_agent.py +390 -0
  5. veadk/a2a/utils/__init__.py +13 -0
  6. veadk/a2a/utils/agent_to_a2a.py +170 -0
  7. veadk/a2a/ve_a2a_server.py +93 -0
  8. veadk/a2a/ve_agent_executor.py +78 -0
  9. veadk/a2a/ve_middlewares.py +313 -0
  10. veadk/a2a/ve_task_store.py +37 -0
  11. veadk/agent.py +402 -0
  12. veadk/agent_builder.py +93 -0
  13. veadk/agents/loop_agent.py +68 -0
  14. veadk/agents/parallel_agent.py +72 -0
  15. veadk/agents/sequential_agent.py +64 -0
  16. veadk/auth/__init__.py +13 -0
  17. veadk/auth/base_auth.py +22 -0
  18. veadk/auth/ve_credential_service.py +203 -0
  19. veadk/auth/veauth/__init__.py +13 -0
  20. veadk/auth/veauth/apmplus_veauth.py +58 -0
  21. veadk/auth/veauth/ark_veauth.py +75 -0
  22. veadk/auth/veauth/base_veauth.py +50 -0
  23. veadk/auth/veauth/cozeloop_veauth.py +13 -0
  24. veadk/auth/veauth/opensearch_veauth.py +75 -0
  25. veadk/auth/veauth/postgresql_veauth.py +75 -0
  26. veadk/auth/veauth/prompt_pilot_veauth.py +60 -0
  27. veadk/auth/veauth/speech_veauth.py +54 -0
  28. veadk/auth/veauth/utils.py +69 -0
  29. veadk/auth/veauth/vesearch_veauth.py +62 -0
  30. veadk/auth/veauth/viking_mem0_veauth.py +91 -0
  31. veadk/cli/__init__.py +13 -0
  32. veadk/cli/cli.py +58 -0
  33. veadk/cli/cli_clean.py +87 -0
  34. veadk/cli/cli_create.py +163 -0
  35. veadk/cli/cli_deploy.py +233 -0
  36. veadk/cli/cli_eval.py +215 -0
  37. veadk/cli/cli_init.py +214 -0
  38. veadk/cli/cli_kb.py +110 -0
  39. veadk/cli/cli_pipeline.py +285 -0
  40. veadk/cli/cli_prompt.py +86 -0
  41. veadk/cli/cli_update.py +106 -0
  42. veadk/cli/cli_uploadevalset.py +139 -0
  43. veadk/cli/cli_web.py +143 -0
  44. veadk/cloud/__init__.py +13 -0
  45. veadk/cloud/cloud_agent_engine.py +485 -0
  46. veadk/cloud/cloud_app.py +475 -0
  47. veadk/config.py +115 -0
  48. veadk/configs/__init__.py +13 -0
  49. veadk/configs/auth_configs.py +133 -0
  50. veadk/configs/database_configs.py +132 -0
  51. veadk/configs/model_configs.py +78 -0
  52. veadk/configs/tool_configs.py +54 -0
  53. veadk/configs/tracing_configs.py +110 -0
  54. veadk/consts.py +74 -0
  55. veadk/evaluation/__init__.py +17 -0
  56. veadk/evaluation/adk_evaluator/__init__.py +17 -0
  57. veadk/evaluation/adk_evaluator/adk_evaluator.py +302 -0
  58. veadk/evaluation/base_evaluator.py +642 -0
  59. veadk/evaluation/deepeval_evaluator/__init__.py +17 -0
  60. veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +339 -0
  61. veadk/evaluation/eval_set_file_loader.py +48 -0
  62. veadk/evaluation/eval_set_recorder.py +146 -0
  63. veadk/evaluation/types.py +65 -0
  64. veadk/evaluation/utils/prometheus.py +196 -0
  65. veadk/integrations/__init__.py +13 -0
  66. veadk/integrations/ve_apig/__init__.py +13 -0
  67. veadk/integrations/ve_apig/ve_apig.py +349 -0
  68. veadk/integrations/ve_apig/ve_apig_utils.py +332 -0
  69. veadk/integrations/ve_code_pipeline/__init__.py +13 -0
  70. veadk/integrations/ve_code_pipeline/ve_code_pipeline.py +431 -0
  71. veadk/integrations/ve_cozeloop/__init__.py +13 -0
  72. veadk/integrations/ve_cozeloop/ve_cozeloop.py +96 -0
  73. veadk/integrations/ve_cr/__init__.py +13 -0
  74. veadk/integrations/ve_cr/ve_cr.py +220 -0
  75. veadk/integrations/ve_faas/__init__.py +13 -0
  76. veadk/integrations/ve_faas/template/cookiecutter.json +15 -0
  77. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
  78. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
  79. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/config.yaml.example +6 -0
  80. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/deploy.py +106 -0
  81. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__init__.py +13 -0
  82. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/agent.py +25 -0
  83. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/app.py +202 -0
  84. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/requirements.txt +3 -0
  85. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/run.sh +49 -0
  86. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/__init__.py +14 -0
  87. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/agent.py +27 -0
  88. veadk/integrations/ve_faas/ve_faas.py +754 -0
  89. veadk/integrations/ve_faas/ve_faas_utils.py +408 -0
  90. veadk/integrations/ve_faas/web_template/cookiecutter.json +20 -0
  91. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
  92. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
  93. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/config.yaml.example +2 -0
  94. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/deploy.py +44 -0
  95. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/Dockerfile +23 -0
  96. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/app.py +123 -0
  97. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/init_db.py +46 -0
  98. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/models.py +36 -0
  99. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/requirements.txt +4 -0
  100. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/run.sh +21 -0
  101. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/css/style.css +368 -0
  102. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/js/admin.js +0 -0
  103. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/dashboard.html +21 -0
  104. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/edit_post.html +24 -0
  105. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/login.html +21 -0
  106. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/posts.html +53 -0
  107. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/base.html +45 -0
  108. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/index.html +29 -0
  109. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/post.html +14 -0
  110. veadk/integrations/ve_identity/__init__.py +110 -0
  111. veadk/integrations/ve_identity/auth_config.py +261 -0
  112. veadk/integrations/ve_identity/auth_mixins.py +650 -0
  113. veadk/integrations/ve_identity/auth_processor.py +385 -0
  114. veadk/integrations/ve_identity/function_tool.py +158 -0
  115. veadk/integrations/ve_identity/identity_client.py +864 -0
  116. veadk/integrations/ve_identity/mcp_tool.py +181 -0
  117. veadk/integrations/ve_identity/mcp_toolset.py +431 -0
  118. veadk/integrations/ve_identity/models.py +228 -0
  119. veadk/integrations/ve_identity/token_manager.py +188 -0
  120. veadk/integrations/ve_identity/utils.py +151 -0
  121. veadk/integrations/ve_prompt_pilot/__init__.py +13 -0
  122. veadk/integrations/ve_prompt_pilot/ve_prompt_pilot.py +85 -0
  123. veadk/integrations/ve_tls/__init__.py +13 -0
  124. veadk/integrations/ve_tls/utils.py +116 -0
  125. veadk/integrations/ve_tls/ve_tls.py +212 -0
  126. veadk/integrations/ve_tos/ve_tos.py +710 -0
  127. veadk/integrations/ve_viking_db_memory/__init__.py +13 -0
  128. veadk/integrations/ve_viking_db_memory/ve_viking_db_memory.py +308 -0
  129. veadk/knowledgebase/__init__.py +17 -0
  130. veadk/knowledgebase/backends/__init__.py +13 -0
  131. veadk/knowledgebase/backends/base_backend.py +72 -0
  132. veadk/knowledgebase/backends/in_memory_backend.py +91 -0
  133. veadk/knowledgebase/backends/opensearch_backend.py +162 -0
  134. veadk/knowledgebase/backends/redis_backend.py +172 -0
  135. veadk/knowledgebase/backends/utils.py +92 -0
  136. veadk/knowledgebase/backends/vikingdb_knowledge_backend.py +608 -0
  137. veadk/knowledgebase/entry.py +25 -0
  138. veadk/knowledgebase/knowledgebase.py +307 -0
  139. veadk/memory/__init__.py +35 -0
  140. veadk/memory/long_term_memory.py +365 -0
  141. veadk/memory/long_term_memory_backends/__init__.py +13 -0
  142. veadk/memory/long_term_memory_backends/base_backend.py +35 -0
  143. veadk/memory/long_term_memory_backends/in_memory_backend.py +67 -0
  144. veadk/memory/long_term_memory_backends/mem0_backend.py +155 -0
  145. veadk/memory/long_term_memory_backends/opensearch_backend.py +124 -0
  146. veadk/memory/long_term_memory_backends/redis_backend.py +140 -0
  147. veadk/memory/long_term_memory_backends/vikingdb_memory_backend.py +189 -0
  148. veadk/memory/short_term_memory.py +252 -0
  149. veadk/memory/short_term_memory_backends/__init__.py +13 -0
  150. veadk/memory/short_term_memory_backends/base_backend.py +31 -0
  151. veadk/memory/short_term_memory_backends/mysql_backend.py +49 -0
  152. veadk/memory/short_term_memory_backends/postgresql_backend.py +49 -0
  153. veadk/memory/short_term_memory_backends/sqlite_backend.py +55 -0
  154. veadk/memory/short_term_memory_processor.py +100 -0
  155. veadk/processors/__init__.py +26 -0
  156. veadk/processors/base_run_processor.py +120 -0
  157. veadk/prompts/__init__.py +13 -0
  158. veadk/prompts/agent_default_prompt.py +30 -0
  159. veadk/prompts/prompt_evaluator.py +20 -0
  160. veadk/prompts/prompt_memory_processor.py +55 -0
  161. veadk/prompts/prompt_optimization.py +150 -0
  162. veadk/runner.py +732 -0
  163. veadk/tools/__init__.py +13 -0
  164. veadk/tools/builtin_tools/__init__.py +13 -0
  165. veadk/tools/builtin_tools/agent_authorization.py +94 -0
  166. veadk/tools/builtin_tools/generate_image.py +23 -0
  167. veadk/tools/builtin_tools/image_edit.py +300 -0
  168. veadk/tools/builtin_tools/image_generate.py +446 -0
  169. veadk/tools/builtin_tools/lark.py +67 -0
  170. veadk/tools/builtin_tools/las.py +24 -0
  171. veadk/tools/builtin_tools/link_reader.py +66 -0
  172. veadk/tools/builtin_tools/llm_shield.py +381 -0
  173. veadk/tools/builtin_tools/load_knowledgebase.py +97 -0
  174. veadk/tools/builtin_tools/mcp_router.py +29 -0
  175. veadk/tools/builtin_tools/run_code.py +113 -0
  176. veadk/tools/builtin_tools/tts.py +253 -0
  177. veadk/tools/builtin_tools/vesearch.py +49 -0
  178. veadk/tools/builtin_tools/video_generate.py +363 -0
  179. veadk/tools/builtin_tools/web_scraper.py +76 -0
  180. veadk/tools/builtin_tools/web_search.py +83 -0
  181. veadk/tools/demo_tools.py +58 -0
  182. veadk/tools/load_knowledgebase_tool.py +149 -0
  183. veadk/tools/sandbox/__init__.py +13 -0
  184. veadk/tools/sandbox/browser_sandbox.py +37 -0
  185. veadk/tools/sandbox/code_sandbox.py +40 -0
  186. veadk/tools/sandbox/computer_sandbox.py +34 -0
  187. veadk/tracing/__init__.py +13 -0
  188. veadk/tracing/base_tracer.py +58 -0
  189. veadk/tracing/telemetry/__init__.py +13 -0
  190. veadk/tracing/telemetry/attributes/attributes.py +29 -0
  191. veadk/tracing/telemetry/attributes/extractors/common_attributes_extractors.py +180 -0
  192. veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +858 -0
  193. veadk/tracing/telemetry/attributes/extractors/tool_attributes_extractors.py +152 -0
  194. veadk/tracing/telemetry/attributes/extractors/types.py +164 -0
  195. veadk/tracing/telemetry/exporters/__init__.py +13 -0
  196. veadk/tracing/telemetry/exporters/apmplus_exporter.py +558 -0
  197. veadk/tracing/telemetry/exporters/base_exporter.py +39 -0
  198. veadk/tracing/telemetry/exporters/cozeloop_exporter.py +129 -0
  199. veadk/tracing/telemetry/exporters/inmemory_exporter.py +248 -0
  200. veadk/tracing/telemetry/exporters/tls_exporter.py +139 -0
  201. veadk/tracing/telemetry/opentelemetry_tracer.py +320 -0
  202. veadk/tracing/telemetry/telemetry.py +411 -0
  203. veadk/types.py +47 -0
  204. veadk/utils/__init__.py +13 -0
  205. veadk/utils/audio_manager.py +95 -0
  206. veadk/utils/auth.py +294 -0
  207. veadk/utils/logger.py +59 -0
  208. veadk/utils/mcp_utils.py +44 -0
  209. veadk/utils/misc.py +184 -0
  210. veadk/utils/patches.py +101 -0
  211. veadk/utils/volcengine_sign.py +205 -0
  212. veadk/version.py +15 -0
  213. veadk_python-0.2.27.dist-info/METADATA +373 -0
  214. veadk_python-0.2.27.dist-info/RECORD +218 -0
  215. veadk_python-0.2.27.dist-info/WHEEL +5 -0
  216. veadk_python-0.2.27.dist-info/entry_points.txt +2 -0
  217. veadk_python-0.2.27.dist-info/licenses/LICENSE +201 -0
  218. veadk_python-0.2.27.dist-info/top_level.txt +1 -0
@@ -0,0 +1,339 @@
1
+ # Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+ from typing import Optional
17
+
18
+ from deepeval import evaluate
19
+ from deepeval.evaluate import CacheConfig
20
+ from deepeval.evaluate.types import EvaluationResult
21
+ from deepeval.metrics import BaseMetric
22
+ from deepeval.models import LocalModel
23
+ from deepeval.test_case import LLMTestCase
24
+ from deepeval.test_case.llm_test_case import ToolCall
25
+ from google.adk.evaluation.eval_set import EvalSet
26
+ from typing_extensions import override
27
+
28
+ from veadk.config import getenv
29
+ from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult
30
+ from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
31
+ from veadk.evaluation.utils.prometheus import (
32
+ PrometheusPushgatewayConfig,
33
+ push_to_prometheus,
34
+ )
35
+ from veadk.utils.logger import get_logger
36
+
37
+ logger = get_logger(__name__)
38
+
39
+
40
def formatted_timestamp() -> str:
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string.

    Returns:
        str: Timestamp string such as ``'20251028123045'``.
    """
    # strftime() without an explicit struct_time argument formats the
    # current local time, which is exactly what localtime() would supply.
    return time.strftime("%Y%m%d%H%M%S")
51
+
52
+
53
class DeepevalEvaluator(BaseEvaluator):
    """Evaluates agents with DeepEval metrics, optionally exporting to Prometheus.

    The evaluator runs the wrapped agent against an eval set, converts each
    invocation into a DeepEval ``LLMTestCase``, scores the cases with the
    supplied metrics, and (when configured) pushes aggregate results to a
    Prometheus Pushgateway.

    Attributes:
        judge_model_name (str): Name of the model that judges the agent.
        judge_model (LocalModel): The judge model instance.
        prometheus_config (PrometheusPushgatewayConfig | None): Pushgateway
            settings. If ``None``, results are not exported.

    Note:
        Judge-model credentials are read from the environment when not given
        explicitly. The DeepEval result cache is disabled so every run
        produces fresh scores.

    Examples:
        ```python
        agent = Agent(tools=[get_city_weather])
        evaluator = DeepevalEvaluator(agent=agent)
        metrics = [GEval(threshold=0.8)]
        results = await evaluator.evaluate(metrics, eval_set_file_path="test.json")
        ```
    """

    def __init__(
        self,
        agent,
        judge_model_api_key: str = "",
        judge_model_name: str = "",
        judge_model_api_base: str = "",
        name: str = "veadk_deepeval_evaluator",
        prometheus_config: PrometheusPushgatewayConfig | None = None,
    ):
        """Sets up the DeepEval evaluator with agent and judge model.

        Args:
            agent: The agent to test.
            judge_model_api_key: API key for the judge model. If empty, falls
                back to ``MODEL_JUDGE_API_KEY`` then ``MODEL_AGENT_API_KEY``.
            judge_model_name: Judge model name. If empty, read from
                ``MODEL_JUDGE_NAME`` with a built-in default.
            judge_model_api_base: Judge model API base URL. If empty, read
                from ``MODEL_JUDGE_API_BASE`` with a built-in default.
            name: Name for this evaluator.
            prometheus_config: Pushgateway settings; ``None`` disables export.
        """
        super().__init__(agent=agent, name=name)

        # Resolve any judge-model setting the caller left empty from the
        # environment (with sensible service defaults).
        if not judge_model_api_key:
            judge_model_api_key = getenv("MODEL_JUDGE_API_KEY") or getenv(
                "MODEL_AGENT_API_KEY"
            )
        if not judge_model_name:
            judge_model_name = getenv(
                "MODEL_JUDGE_NAME",
                "doubao-seed-1-6-250615",
            )
        if not judge_model_api_base:
            judge_model_api_base = getenv(
                "MODEL_JUDGE_API_BASE",
                "https://ark.cn-beijing.volces.com/api/v3/",
            )

        self.judge_model_name = judge_model_name
        self.judge_model = LocalModel(
            model=judge_model_name,
            base_url=judge_model_api_base,
            api_key=judge_model_api_key,
        )

        self.prometheus_config = prometheus_config

    @override
    async def evaluate(
        self,
        metrics: list[BaseMetric],
        eval_set: Optional[EvalSet] = None,
        eval_set_file_path: Optional[str] = None,
        eval_id: Optional[str] = None,
    ):
        """Tests the agent with DeepEval on the given eval set.

        Steps:
            1. Load eval cases from memory or file.
            2. Run the agent to collect actual outputs.
            3. Convert invocations to DeepEval ``LLMTestCase`` objects.
            4. Score them with the supplied metrics.
            5. Export to Prometheus when configured.

        Args:
            metrics: DeepEval metrics used for scoring.
            eval_set: In-memory eval set; takes precedence over the file path.
            eval_set_file_path: Path to an eval set file.
            eval_id: Unique name for this run. Defaults to a fresh
                ``test_<timestamp>`` computed at call time.

        Returns:
            EvaluationResult: DeepEval results with scores and details.
        """
        # BUGFIX: the original signature default f"test_{formatted_timestamp()}"
        # was evaluated once at import time, so every run that omitted eval_id
        # shared the same frozen timestamp. Compute it per call instead.
        if not eval_id:
            eval_id = f"test_{formatted_timestamp()}"

        # Get evaluation data by parsing eval set file
        self.build_eval_set(eval_set, eval_set_file_path)

        # Get actual data by running agent
        logger.info("Start to run agent for actual data.")
        await self.generate_actual_outputs()
        eval_case_data_list = self.invocation_list

        # Build test cases in Deepeval format
        logger.info("Start to build test cases in Deepeval format.")
        test_cases = []
        for eval_case_data in eval_case_data_list:
            # BUGFIX: these accumulators were previously re-initialized inside
            # the invocation loop, so every test case saw an "Empty" history
            # and the append statements below had no effect. Reset them once
            # per eval case so each invocation's context carries the preceding
            # turns of the same conversation.
            invocations_context_actual: str = ""  # {"role": "user", "content": "xxxxx"}
            invocations_context_expect: str = ""
            for invocation in eval_case_data.invocations:
                test_case = LLMTestCase(
                    input=invocation.input,
                    actual_output=invocation.actual_output,
                    expected_output=invocation.expected_output,
                    tools_called=[
                        ToolCall(name=tool["name"], input_parameters=tool["args"])
                        for tool in invocation.actual_tool
                    ],
                    expected_tools=[
                        ToolCall(name=tool["name"], input_parameters=tool["args"])
                        for tool in invocation.expected_tool
                    ],
                    additional_metadata={"latency": invocation.latency},
                    context=[
                        "actual_conversation_history: "
                        + (invocations_context_actual or "Empty"),
                        "expect_conversation_history: "
                        + (invocations_context_expect or "Empty"),
                    ],
                )
                # Append this turn to the running histories so the NEXT
                # invocation of the same case sees it as context.
                invocations_context_actual += (
                    f'{{"role": "user", "content": "{invocation.input}"}}\n'
                )
                invocations_context_actual += f'{{"role": "assistant", "content": "{invocation.actual_output}"}}\n'
                invocations_context_expect += (
                    f'{{"role": "user", "content": "{invocation.input}"}}\n'
                )
                invocations_context_expect += f'{{"role": "assistant", "content": "{invocation.expected_output}"}}\n'

                test_cases.append(test_case)

        # Run Deepeval evaluation according to metrics
        logger.info("Start to run Deepeval evaluation according to metrics.")
        test_results = evaluate(
            test_cases=test_cases,
            metrics=metrics,
            cache_config=CacheConfig(write_cache=False),
        )
        for test_result in test_results.test_results:
            eval_result_data = EvalResultData(metric_results=[])
            for metrics_data_item in test_result.metrics_data:
                metric_result = MetricResult(
                    metric_type=metrics_data_item.name,
                    success=metrics_data_item.success,
                    score=metrics_data_item.score,
                    reason=metrics_data_item.reason,
                )
                eval_result_data.metric_results.append(metric_result)

            eval_result_data.call_before_append()  # calculate average score and generate total reason
            self.result_list.append(eval_result_data)
        self.result_list.reverse()  # deepeval test_results is in reverse order

        # export to Prometheus if needed
        if self.prometheus_config is not None:
            self.export_results(eval_id, test_results)

        return test_results

    def export_results(self, eval_id: str, test_results: EvaluationResult):
        """Sends evaluation results to the Prometheus Pushgateway.

        Counts passed/failed cases, packages per-case data, and pushes the
        metrics using the configured Pushgateway credentials.

        Args:
            eval_id: Unique name for this test; used as a Prometheus label.
            test_results: Results from the DeepEval evaluation.

        Note:
            Thresholds are hard-coded for now (``case_threshold=0.5``,
            ``diff_threshold=0.2``) and may become configurable later.
        """
        # fixed attributions
        test_name = eval_id
        test_cases_total = len(test_results.test_results)
        eval_data = EvalResultMetadata(
            tested_model=self.agent.model_name,
            judge_model=self.judge_model_name,
        )
        # parsed attributions
        test_cases_failure = 0
        test_cases_pass = 0
        test_data_list = []
        # NOTE: we hard-coding the following two attributions for development
        case_threshold = 0.5
        diff_threshold = 0.2

        for idx, test_result in enumerate(test_results.test_results):
            pass_flag = "PASSED"
            if test_result.success:
                test_cases_pass += 1
            else:
                pass_flag = "FAILURE"
                test_cases_failure += 1

            test_data_list.append(
                EvalResultCaseData(
                    id=str(idx),
                    input=test_result.input,
                    actual_output=test_result.actual_output,
                    expected_output=test_result.expected_output,
                    # [temporary] score: This method is not generally applicable now and is currently only available in the GEval mode.
                    score=str(test_result.metrics_data[0].score),
                    reason=test_result.metrics_data[0].reason,
                    status=pass_flag,
                    latency=test_result.additional_metadata["latency"],
                )
            )

        exported_data = {
            "test_name": test_name,
            "test_cases_total": test_cases_total,
            "test_cases_failure": test_cases_failure,
            "test_cases_pass": test_cases_pass,
            "test_data_list": test_data_list,
            "eval_data": eval_data,
            "case_threshold": case_threshold,
            "diff_threshold": diff_threshold,
        }

        push_to_prometheus(
            **exported_data,
            url=self.prometheus_config.url,
            username=self.prometheus_config.username,
            password=self.prometheus_config.password,
        )
        logger.info(
            f"Upload to Prometheus Pushgateway ({self.prometheus_config.url}) successfully! Test name: {eval_id}"
        )
@@ -0,0 +1,48 @@
1
+ # Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from google.adk.evaluation.eval_set import EvalSet
16
+ from google.adk.evaluation.local_eval_sets_manager import (
17
+ load_eval_set_from_file as adk_load_eval_set_from_file,
18
+ )
19
+
20
+
21
def load_eval_set_from_file(eval_set_file_path: str) -> EvalSet:
    """Load an evaluation set from a JSON file via ADK's loader.

    Args:
        eval_set_file_path (str): Path to the JSON eval set file.

    Returns:
        EvalSet: The parsed evaluation set.

    Raises:
        Exception: Wraps any loader failure with the offending file path.

    Examples:
        ```python
        eval_set = load_eval_set_from_file("my_eval.json")
        print(len(eval_set.eval_cases))
        ```
    """
    try:
        # NOTE(review): the file path is also passed as the second argument
        # (the eval set id in ADK's loader) — confirm against the upstream
        # signature if the id should differ from the path.
        return adk_load_eval_set_from_file(eval_set_file_path, eval_set_file_path)
    except Exception as e:
        raise Exception(
            f"Failed to load eval set from file {eval_set_file_path}"
        ) from e
@@ -0,0 +1,146 @@
1
+ # Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+ from pathlib import Path
17
+ import os
18
+ from google.adk.cli.utils import evals
19
+ from google.adk.evaluation.eval_case import EvalCase, SessionInput
20
+ from google.adk.evaluation.local_eval_sets_manager import LocalEvalSetsManager
21
+ from google.adk.sessions import BaseSessionService
22
+
23
+ from veadk.utils.logger import get_logger
24
+ from veadk.utils.misc import formatted_timestamp, get_agents_dir
25
+
26
+ logger = get_logger(__name__)
27
+
28
+
29
class EvalSetRecorder(LocalEvalSetsManager):
    """Records sessions into eval sets for later use in testing.

    Extends ``LocalEvalSetsManager`` with helpers that convert a live session
    into an eval case and dump the resulting eval set to disk.

    Attributes:
        eval_set_id (str): ID of the eval set. Defaults to ``'default'``.
        session_service (BaseSessionService): Service used to fetch sessions.
    """

    def __init__(
        self, session_service: BaseSessionService, eval_set_id: str = "default"
    ):
        """Initializes the recorder.

        Args:
            session_service (BaseSessionService): Service to retrieve sessions.
            eval_set_id (str): ID for the eval set. An empty string falls
                back to ``'default'``.
        """
        super().__init__(agents_dir=get_agents_dir())
        # Guard against an empty id, which would yield an unusable file path.
        self.eval_set_id = eval_set_id if eval_set_id != "" else "default"
        self.session_service: BaseSessionService = session_service

    # adapted from google.adk.cli.fast_api
    async def add_session_to_eval_set(
        self,
        app_name: str,
        eval_set_id: str,
        session_id: str,
        user_id: str,
    ):
        """Converts a session into an eval case and adds it to the eval set.

        Args:
            app_name (str): Name of the app owning the session.
            eval_set_id (str): ID of the eval set to add to.
            session_id (str): ID of the session to convert.
            user_id (str): ID of the user owning the session.

        Raises:
            AssertionError: If the session cannot be found.
            ValueError: If adding the eval case fails (original error chained).
        """
        eval_id = f"veadk_eval_{formatted_timestamp()}"

        # Get the session
        session = await self.session_service.get_session(
            app_name=app_name, user_id=user_id, session_id=session_id
        )
        assert session, "Session not found."

        # Convert the session data to eval invocations
        invocations = evals.convert_session_to_eval_invocations(session)

        # TODO: populate the session with initial session state from the
        # loaded agent once an agent loader is available here.
        new_eval_case = EvalCase(
            eval_id=eval_id,
            conversation=invocations,
            session_input=SessionInput(app_name=app_name, user_id=user_id),
            creation_timestamp=time.time(),
        )

        try:
            self.add_eval_case(app_name, eval_set_id, new_eval_case)
        except ValueError as ve:
            # BUGFIX: chain the original exception so the underlying cause
            # is preserved in tracebacks.
            raise ValueError(f"Add eval case to eval set error: {ve}") from ve

    async def dump(
        self,
        app_name: str,
        user_id: str,
        session_id: str,
    ) -> str:
        """Dumps the current eval set (plus this session) to its file path.

        Creates the eval set file if it does not yet exist, then appends the
        given session as a new eval case.

        Args:
            app_name (str): Name of the app.
            user_id (str): ID of the user.
            session_id (str): ID of the session to dump.

        Returns:
            str: Path where the eval set was written.

        Raises:
            ValueError: If adding the session to the eval set fails.
        """
        dump_path = self._get_eval_set_file_path(app_name, self.eval_set_id)
        Path(dump_path).parent.mkdir(parents=True, exist_ok=True)

        # Only create the eval set when the backing file is absent;
        # create_eval_set would fail on an existing set.
        if not os.path.exists(dump_path):
            self.create_eval_set(app_name=app_name, eval_set_id=self.eval_set_id)

        await self.add_session_to_eval_set(
            app_name=app_name,
            eval_set_id=self.eval_set_id,
            session_id=session_id,
            user_id=user_id,
        )

        logger.info(f"Dump eval set to {dump_path}")

        return dump_path
@@ -0,0 +1,65 @@
1
+ # Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+
17
+
18
@dataclass
class EvalResultCaseData:
    """Result record for one evaluated test case.

    Bundles the case's input, the actual and expected agent outputs, the
    judge's score and reasoning, the pass/fail status, and the measured
    latency. Consumed by result reporting and the Prometheus exporter.

    Attributes:
        id (str): Unique identifier of the case.
        input (str): User input that was evaluated.
        actual_output (str): Response the agent actually produced.
        expected_output (str): Response the agent was expected to produce.
        score (str): Evaluation score, serialized as a string.
        reason (str): Judge's explanation for the score.
        status (str): Either ``'PASSED'`` or ``'FAILURE'``.
        latency (str): Latency for the case, serialized as a string.

    Note:
        Score and latency stay strings for compatibility with the systems
        that consume these records.
    """

    id: str
    input: str
    actual_output: str
    expected_output: str
    score: str
    reason: str
    status: str  # `PASSED` or `FAILURE`
    latency: str
47
+
48
+
49
@dataclass
class EvalResultMetadata:
    """Metadata describing which models took part in an evaluation run.

    Attributes:
        tested_model (str): Name of the model under test.
        judge_model (str): Name of the model used as judge.
    """

    tested_model: str
    judge_model: str