veadk-python 0.2.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218)
  1. veadk/__init__.py +37 -0
  2. veadk/a2a/__init__.py +13 -0
  3. veadk/a2a/agent_card.py +45 -0
  4. veadk/a2a/remote_ve_agent.py +390 -0
  5. veadk/a2a/utils/__init__.py +13 -0
  6. veadk/a2a/utils/agent_to_a2a.py +170 -0
  7. veadk/a2a/ve_a2a_server.py +93 -0
  8. veadk/a2a/ve_agent_executor.py +78 -0
  9. veadk/a2a/ve_middlewares.py +313 -0
  10. veadk/a2a/ve_task_store.py +37 -0
  11. veadk/agent.py +402 -0
  12. veadk/agent_builder.py +93 -0
  13. veadk/agents/loop_agent.py +68 -0
  14. veadk/agents/parallel_agent.py +72 -0
  15. veadk/agents/sequential_agent.py +64 -0
  16. veadk/auth/__init__.py +13 -0
  17. veadk/auth/base_auth.py +22 -0
  18. veadk/auth/ve_credential_service.py +203 -0
  19. veadk/auth/veauth/__init__.py +13 -0
  20. veadk/auth/veauth/apmplus_veauth.py +58 -0
  21. veadk/auth/veauth/ark_veauth.py +75 -0
  22. veadk/auth/veauth/base_veauth.py +50 -0
  23. veadk/auth/veauth/cozeloop_veauth.py +13 -0
  24. veadk/auth/veauth/opensearch_veauth.py +75 -0
  25. veadk/auth/veauth/postgresql_veauth.py +75 -0
  26. veadk/auth/veauth/prompt_pilot_veauth.py +60 -0
  27. veadk/auth/veauth/speech_veauth.py +54 -0
  28. veadk/auth/veauth/utils.py +69 -0
  29. veadk/auth/veauth/vesearch_veauth.py +62 -0
  30. veadk/auth/veauth/viking_mem0_veauth.py +91 -0
  31. veadk/cli/__init__.py +13 -0
  32. veadk/cli/cli.py +58 -0
  33. veadk/cli/cli_clean.py +87 -0
  34. veadk/cli/cli_create.py +163 -0
  35. veadk/cli/cli_deploy.py +233 -0
  36. veadk/cli/cli_eval.py +215 -0
  37. veadk/cli/cli_init.py +214 -0
  38. veadk/cli/cli_kb.py +110 -0
  39. veadk/cli/cli_pipeline.py +285 -0
  40. veadk/cli/cli_prompt.py +86 -0
  41. veadk/cli/cli_update.py +106 -0
  42. veadk/cli/cli_uploadevalset.py +139 -0
  43. veadk/cli/cli_web.py +143 -0
  44. veadk/cloud/__init__.py +13 -0
  45. veadk/cloud/cloud_agent_engine.py +485 -0
  46. veadk/cloud/cloud_app.py +475 -0
  47. veadk/config.py +115 -0
  48. veadk/configs/__init__.py +13 -0
  49. veadk/configs/auth_configs.py +133 -0
  50. veadk/configs/database_configs.py +132 -0
  51. veadk/configs/model_configs.py +78 -0
  52. veadk/configs/tool_configs.py +54 -0
  53. veadk/configs/tracing_configs.py +110 -0
  54. veadk/consts.py +74 -0
  55. veadk/evaluation/__init__.py +17 -0
  56. veadk/evaluation/adk_evaluator/__init__.py +17 -0
  57. veadk/evaluation/adk_evaluator/adk_evaluator.py +302 -0
  58. veadk/evaluation/base_evaluator.py +642 -0
  59. veadk/evaluation/deepeval_evaluator/__init__.py +17 -0
  60. veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +339 -0
  61. veadk/evaluation/eval_set_file_loader.py +48 -0
  62. veadk/evaluation/eval_set_recorder.py +146 -0
  63. veadk/evaluation/types.py +65 -0
  64. veadk/evaluation/utils/prometheus.py +196 -0
  65. veadk/integrations/__init__.py +13 -0
  66. veadk/integrations/ve_apig/__init__.py +13 -0
  67. veadk/integrations/ve_apig/ve_apig.py +349 -0
  68. veadk/integrations/ve_apig/ve_apig_utils.py +332 -0
  69. veadk/integrations/ve_code_pipeline/__init__.py +13 -0
  70. veadk/integrations/ve_code_pipeline/ve_code_pipeline.py +431 -0
  71. veadk/integrations/ve_cozeloop/__init__.py +13 -0
  72. veadk/integrations/ve_cozeloop/ve_cozeloop.py +96 -0
  73. veadk/integrations/ve_cr/__init__.py +13 -0
  74. veadk/integrations/ve_cr/ve_cr.py +220 -0
  75. veadk/integrations/ve_faas/__init__.py +13 -0
  76. veadk/integrations/ve_faas/template/cookiecutter.json +15 -0
  77. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
  78. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
  79. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/config.yaml.example +6 -0
  80. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/deploy.py +106 -0
  81. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__init__.py +13 -0
  82. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/agent.py +25 -0
  83. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/app.py +202 -0
  84. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/requirements.txt +3 -0
  85. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/run.sh +49 -0
  86. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/__init__.py +14 -0
  87. veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/agent.py +27 -0
  88. veadk/integrations/ve_faas/ve_faas.py +754 -0
  89. veadk/integrations/ve_faas/ve_faas_utils.py +408 -0
  90. veadk/integrations/ve_faas/web_template/cookiecutter.json +20 -0
  91. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
  92. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
  93. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/config.yaml.example +2 -0
  94. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/deploy.py +44 -0
  95. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/Dockerfile +23 -0
  96. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/app.py +123 -0
  97. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/init_db.py +46 -0
  98. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/models.py +36 -0
  99. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/requirements.txt +4 -0
  100. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/run.sh +21 -0
  101. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/css/style.css +368 -0
  102. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/js/admin.js +0 -0
  103. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/dashboard.html +21 -0
  104. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/edit_post.html +24 -0
  105. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/login.html +21 -0
  106. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/posts.html +53 -0
  107. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/base.html +45 -0
  108. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/index.html +29 -0
  109. veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/post.html +14 -0
  110. veadk/integrations/ve_identity/__init__.py +110 -0
  111. veadk/integrations/ve_identity/auth_config.py +261 -0
  112. veadk/integrations/ve_identity/auth_mixins.py +650 -0
  113. veadk/integrations/ve_identity/auth_processor.py +385 -0
  114. veadk/integrations/ve_identity/function_tool.py +158 -0
  115. veadk/integrations/ve_identity/identity_client.py +864 -0
  116. veadk/integrations/ve_identity/mcp_tool.py +181 -0
  117. veadk/integrations/ve_identity/mcp_toolset.py +431 -0
  118. veadk/integrations/ve_identity/models.py +228 -0
  119. veadk/integrations/ve_identity/token_manager.py +188 -0
  120. veadk/integrations/ve_identity/utils.py +151 -0
  121. veadk/integrations/ve_prompt_pilot/__init__.py +13 -0
  122. veadk/integrations/ve_prompt_pilot/ve_prompt_pilot.py +85 -0
  123. veadk/integrations/ve_tls/__init__.py +13 -0
  124. veadk/integrations/ve_tls/utils.py +116 -0
  125. veadk/integrations/ve_tls/ve_tls.py +212 -0
  126. veadk/integrations/ve_tos/ve_tos.py +710 -0
  127. veadk/integrations/ve_viking_db_memory/__init__.py +13 -0
  128. veadk/integrations/ve_viking_db_memory/ve_viking_db_memory.py +308 -0
  129. veadk/knowledgebase/__init__.py +17 -0
  130. veadk/knowledgebase/backends/__init__.py +13 -0
  131. veadk/knowledgebase/backends/base_backend.py +72 -0
  132. veadk/knowledgebase/backends/in_memory_backend.py +91 -0
  133. veadk/knowledgebase/backends/opensearch_backend.py +162 -0
  134. veadk/knowledgebase/backends/redis_backend.py +172 -0
  135. veadk/knowledgebase/backends/utils.py +92 -0
  136. veadk/knowledgebase/backends/vikingdb_knowledge_backend.py +608 -0
  137. veadk/knowledgebase/entry.py +25 -0
  138. veadk/knowledgebase/knowledgebase.py +307 -0
  139. veadk/memory/__init__.py +35 -0
  140. veadk/memory/long_term_memory.py +365 -0
  141. veadk/memory/long_term_memory_backends/__init__.py +13 -0
  142. veadk/memory/long_term_memory_backends/base_backend.py +35 -0
  143. veadk/memory/long_term_memory_backends/in_memory_backend.py +67 -0
  144. veadk/memory/long_term_memory_backends/mem0_backend.py +155 -0
  145. veadk/memory/long_term_memory_backends/opensearch_backend.py +124 -0
  146. veadk/memory/long_term_memory_backends/redis_backend.py +140 -0
  147. veadk/memory/long_term_memory_backends/vikingdb_memory_backend.py +189 -0
  148. veadk/memory/short_term_memory.py +252 -0
  149. veadk/memory/short_term_memory_backends/__init__.py +13 -0
  150. veadk/memory/short_term_memory_backends/base_backend.py +31 -0
  151. veadk/memory/short_term_memory_backends/mysql_backend.py +49 -0
  152. veadk/memory/short_term_memory_backends/postgresql_backend.py +49 -0
  153. veadk/memory/short_term_memory_backends/sqlite_backend.py +55 -0
  154. veadk/memory/short_term_memory_processor.py +100 -0
  155. veadk/processors/__init__.py +26 -0
  156. veadk/processors/base_run_processor.py +120 -0
  157. veadk/prompts/__init__.py +13 -0
  158. veadk/prompts/agent_default_prompt.py +30 -0
  159. veadk/prompts/prompt_evaluator.py +20 -0
  160. veadk/prompts/prompt_memory_processor.py +55 -0
  161. veadk/prompts/prompt_optimization.py +150 -0
  162. veadk/runner.py +732 -0
  163. veadk/tools/__init__.py +13 -0
  164. veadk/tools/builtin_tools/__init__.py +13 -0
  165. veadk/tools/builtin_tools/agent_authorization.py +94 -0
  166. veadk/tools/builtin_tools/generate_image.py +23 -0
  167. veadk/tools/builtin_tools/image_edit.py +300 -0
  168. veadk/tools/builtin_tools/image_generate.py +446 -0
  169. veadk/tools/builtin_tools/lark.py +67 -0
  170. veadk/tools/builtin_tools/las.py +24 -0
  171. veadk/tools/builtin_tools/link_reader.py +66 -0
  172. veadk/tools/builtin_tools/llm_shield.py +381 -0
  173. veadk/tools/builtin_tools/load_knowledgebase.py +97 -0
  174. veadk/tools/builtin_tools/mcp_router.py +29 -0
  175. veadk/tools/builtin_tools/run_code.py +113 -0
  176. veadk/tools/builtin_tools/tts.py +253 -0
  177. veadk/tools/builtin_tools/vesearch.py +49 -0
  178. veadk/tools/builtin_tools/video_generate.py +363 -0
  179. veadk/tools/builtin_tools/web_scraper.py +76 -0
  180. veadk/tools/builtin_tools/web_search.py +83 -0
  181. veadk/tools/demo_tools.py +58 -0
  182. veadk/tools/load_knowledgebase_tool.py +149 -0
  183. veadk/tools/sandbox/__init__.py +13 -0
  184. veadk/tools/sandbox/browser_sandbox.py +37 -0
  185. veadk/tools/sandbox/code_sandbox.py +40 -0
  186. veadk/tools/sandbox/computer_sandbox.py +34 -0
  187. veadk/tracing/__init__.py +13 -0
  188. veadk/tracing/base_tracer.py +58 -0
  189. veadk/tracing/telemetry/__init__.py +13 -0
  190. veadk/tracing/telemetry/attributes/attributes.py +29 -0
  191. veadk/tracing/telemetry/attributes/extractors/common_attributes_extractors.py +180 -0
  192. veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +858 -0
  193. veadk/tracing/telemetry/attributes/extractors/tool_attributes_extractors.py +152 -0
  194. veadk/tracing/telemetry/attributes/extractors/types.py +164 -0
  195. veadk/tracing/telemetry/exporters/__init__.py +13 -0
  196. veadk/tracing/telemetry/exporters/apmplus_exporter.py +558 -0
  197. veadk/tracing/telemetry/exporters/base_exporter.py +39 -0
  198. veadk/tracing/telemetry/exporters/cozeloop_exporter.py +129 -0
  199. veadk/tracing/telemetry/exporters/inmemory_exporter.py +248 -0
  200. veadk/tracing/telemetry/exporters/tls_exporter.py +139 -0
  201. veadk/tracing/telemetry/opentelemetry_tracer.py +320 -0
  202. veadk/tracing/telemetry/telemetry.py +411 -0
  203. veadk/types.py +47 -0
  204. veadk/utils/__init__.py +13 -0
  205. veadk/utils/audio_manager.py +95 -0
  206. veadk/utils/auth.py +294 -0
  207. veadk/utils/logger.py +59 -0
  208. veadk/utils/mcp_utils.py +44 -0
  209. veadk/utils/misc.py +184 -0
  210. veadk/utils/patches.py +101 -0
  211. veadk/utils/volcengine_sign.py +205 -0
  212. veadk/version.py +15 -0
  213. veadk_python-0.2.27.dist-info/METADATA +373 -0
  214. veadk_python-0.2.27.dist-info/RECORD +218 -0
  215. veadk_python-0.2.27.dist-info/WHEEL +5 -0
  216. veadk_python-0.2.27.dist-info/entry_points.txt +2 -0
  217. veadk_python-0.2.27.dist-info/licenses/LICENSE +201 -0
  218. veadk_python-0.2.27.dist-info/top_level.txt +1 -0
@@ -0,0 +1,642 @@
1
+ # Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import json
17
+ import time
18
+ import uuid
19
+ from abc import abstractmethod
20
+ from typing import Any, Optional
21
+
22
+ from google.adk import Runner
23
+ from google.adk.evaluation.eval_set import EvalSet
24
+ from google.adk.sessions import InMemorySessionService
25
+ from google.genai import types
26
+ from pydantic import BaseModel
27
+
28
+ from veadk.utils.misc import formatted_timestamp
29
+
30
+
31
class ToolInvocation(BaseModel):
    """A single tool call captured during agent execution.

    Records which tool was invoked, the arguments it received, and the
    value it returned, so tool usage can be compared during evaluation.

    Attributes:
        tool_name (str): Name of the invoked tool.
        tool_args (dict[str, Any]): Arguments the tool was called with.
            Defaults to an empty dict.
        tool_result (Any): Value returned by the tool execution.
            Defaults to None.

    Note:
        `tool_result` is intentionally untyped so any tool output fits.
    """

    tool_name: str
    tool_args: dict[str, Any] = {}
    tool_result: Any = None
49
+
50
+
51
class Invocation(BaseModel):
    """One request/response turn tracked during evaluation.

    Holds the user input, the expected versus actual outputs and tool
    calls, plus the measured latency — everything needed to compare the
    agent's behavior against the reference.

    Attributes:
        invocation_id (str): Unique identifier of the turn. Defaults to "".
        input (str): User input prompt.
        actual_output (str): Response actually produced by the agent.
        expected_output (str): Reference response.
        actual_tool (list[dict]): Tools actually invoked, as dicts.
        expected_tool (list[dict]): Tools expected to be invoked.
        latency (str): Execution time in milliseconds. Defaults to "".

    Note:
        Tool entries are dicts carrying 'name' and 'args'.
    """

    invocation_id: str = ""
    input: str
    actual_output: str
    expected_output: str
    actual_tool: list[dict] = []
    expected_tool: list[dict] = []
    latency: str = ""  # ms
77
+
78
+
79
class EvalTestCase(BaseModel):
    """Container grouping the invocations of one evaluation scenario.

    Each test case corresponds to a single session/conversation and
    holds every turn belonging to it.

    Attributes:
        invocations (list[Invocation]): Ordered turns of this case.
    """

    invocations: list[Invocation]
93
+
94
+
95
class MetricResult(BaseModel):
    """Outcome of applying a single metric.

    Captures whether the metric passed, its numeric score, and the
    explanation produced by the metric.

    Attributes:
        metric_type (str): Metric type or name.
        success (bool): Whether the metric passed.
        score (float): Numerical evaluation score (typically in [0, 1]).
        reason (str): Explanation for the score.
    """

    metric_type: str
    success: bool
    score: float
    reason: str
115
+
116
+
117
class EvalResultData(BaseModel):
    """Aggregated metric outcomes for one evaluation case.

    Collects the individual ``MetricResult`` entries and derives an
    average score plus a combined reason string from them.

    Attributes:
        metric_results (list[MetricResult]): Individual metric outcomes.
        average_score (float): Mean of the metric scores; 0.0 when there
            are no metrics. Defaults to 0.0.
        total_reason (str): Newline-joined ``metric_type:reason`` lines.
            Defaults to "".

    Note:
        Call ``call_before_append`` to refresh both derived fields
        before storing the instance in a result list.
    """

    metric_results: list[MetricResult]
    average_score: float = 0.0
    total_reason: str = ""

    def calculate_average_score(self):
        """Compute the mean score over ``metric_results``.

        Updates ``average_score`` in place. An empty ``metric_results``
        list yields 0.0 — the guard below means no division error can
        occur (the previous docstring wrongly advertised one).

        Returns:
            None: Updates internal state.
        """
        total_score = sum(result.score for result in self.metric_results)
        self.average_score = (
            total_score / len(self.metric_results) if self.metric_results else 0.0
        )

    def generate_total_reason(self):
        """Join every metric's reason into ``total_reason``.

        Each line has the form ``metric_type:reason``. Updates
        ``total_reason`` in place.

        Returns:
            None: Updates internal state.
        """
        # Note: the stray empty format spec "{...:}" from the original
        # was removed; output is unchanged.
        self.total_reason = "\n".join(
            f"{result.metric_type}:{result.reason}" for result in self.metric_results
        )

    def call_before_append(self):
        """Refresh derived fields before this result is stored.

        Runs ``calculate_average_score`` and ``generate_total_reason``
        so the instance is complete when appended to a result list.

        Returns:
            None: Updates internal state.
        """
        self.calculate_average_score()
        self.generate_total_reason()
180
+
181
+
182
class BaseEvaluator:
    """Common scaffolding shared by every evaluator implementation.

    Provides eval-set construction (from ``EvalSet`` objects, standard
    eval JSON files, or tracing JSON dumps), actual-output generation by
    running the agent, and reporting helpers. Concrete subclasses supply
    the metric logic by overriding :meth:`evaluate`.

    Attributes:
        name (str): Identifier of this evaluator.
        agent: Agent under evaluation.
        invocation_list (list[EvalTestCase]): Parsed test cases.
        result_list (list[EvalResultData]): Results produced by subclasses.
        agent_information_list (list[dict]): Per-case app/user/session info.

    Note:
        Subclasses must implement :meth:`evaluate`.
        Both eval JSON and tracing JSON input formats are supported.
    """

    def __init__(
        self,
        agent,
        name: str,
    ):
        """Store the agent under test and initialize bookkeeping lists.

        Args:
            agent: Agent instance to evaluate.
            name (str): Identifier for this evaluator.
        """
        self.name = name
        self.agent = agent
        # Filled in later by build_eval_set / generate_actual_outputs /
        # the subclass's evaluate implementation.
        self.invocation_list: list[EvalTestCase] = []
        self.result_list: list[EvalResultData] = []
        self.agent_information_list: list[dict] = []
219
+
220
+ def _build_eval_set_from_eval_json(self, eval_json_path: str) -> EvalSet:
221
+ """Builds eval set from standard eval JSON file.
222
+
223
+ This private method loads using file loader.
224
+
225
+ Args:
226
+ eval_json_path (str): Path to JSON file.
227
+
228
+ Returns:
229
+ EvalSet: Loaded set.
230
+
231
+ Raises:
232
+ ValueError: If loading fails.
233
+ """
234
+ from veadk.evaluation.eval_set_file_loader import load_eval_set_from_file
235
+
236
+ return load_eval_set_from_file(eval_json_path)
237
+
238
    def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
        """Builds an eval set from an exported tracing-span JSON file.

        Groups spans by ``trace_id``, extracts tool calls from
        ``execute_tool*`` spans and the user/model turns from
        ``call_llm`` spans, then assembles one eval case per trace.

        Args:
            tracing_json_path (str): Path to the tracing JSON file
                (a JSON list of span dicts).

        Returns:
            EvalSet: Eval set constructed from the traces.

        Raises:
            ValueError: If the file cannot be read or is not valid JSON.

        Note:
            Each span is assumed to be a dict with ``trace_id``,
            ``name``, ``attributes`` and ``start_time`` keys, with
            gen_ai.* attributes for tool and message content —
            TODO confirm against the exporter format.
        """
        try:
            with open(tracing_json_path, "r") as f:
                tracing_data = json.load(f)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON format in file {tracing_json_path}: {e}")
        except Exception as e:
            raise ValueError(f"Error reading file {tracing_json_path}: {e}")

        # Group spans by trace_id so each trace becomes one eval case.
        trace_groups = {}
        for span in tracing_data:
            trace_id = span["trace_id"]
            if trace_id not in trace_groups:
                trace_groups[trace_id] = []
            trace_groups[trace_id].append(span)

        # Convert to evalset format.
        # NOTE(review): `conversation` is created once here and the SAME
        # list object is referenced by every eval case appended below, so
        # turns accumulate across traces instead of being reset per
        # trace — verify this is intended.
        eval_cases, conversation = [], []
        app_name, user_id = "", ""
        creation_timestamp = 0
        for trace_id, spans in trace_groups.items():
            tool_uses = []

            # Extract tool_uses from spans whose name starts with "execute_tool".
            for span in spans:
                if span["name"].startswith("execute_tool"):
                    # Tool parameters come from gen_ai.tool.input (a JSON string).
                    tool_input_str = span["attributes"].get("gen_ai.tool.input", "{}")
                    try:
                        tool_input = json.loads(tool_input_str)
                        tool_args = tool_input.get("parameters", {})
                    except json.JSONDecodeError:
                        # Unparseable input: fall back to empty args.
                        tool_args = {}

                    # The tool call ID comes from gen_ai.tool.output (a JSON string).
                    tool_output_str = span["attributes"].get("gen_ai.tool.output", "{}")
                    tool_call_id = None
                    try:
                        tool_output = json.loads(tool_output_str)
                        tool_call_id = tool_output.get("id", None)
                    except json.JSONDecodeError:
                        tool_call_id = None

                    tool_uses.append(
                        {
                            "id": tool_call_id,
                            "args": tool_args,
                            "name": span["attributes"].get("gen_ai.tool.name", None),
                        }
                    )

            # Extract conversation data from call_llm spans.
            user_input = ""
            final_output = ""

            # First call_llm span holds the user input; the last one holds
            # the final model output.
            call_llm_spans = [span for span in spans if span["name"] == "call_llm"]

            if call_llm_spans:
                # User input from the first call_llm span.
                first_span = call_llm_spans[0]
                user_input = first_span["attributes"].get("gen_ai.prompt.0.content", "")

                # Final output from the last call_llm span.
                last_span = call_llm_spans[-1]
                final_output = last_span["attributes"].get(
                    "gen_ai.completion.0.content", ""
                )

                # Metadata taken from the first call_llm span; start_time
                # is presumably in nanoseconds, hence the / 1e9 — TODO confirm.
                app_name = first_span["attributes"].get("gen_ai.app.name", "")
                user_id = first_span["attributes"].get("gen_ai.user.id", "")
                creation_timestamp = first_span["start_time"] / 1e9

            if user_input and final_output:
                # Build user_content / final_response in the expected format.
                user_content = {"role": "user", "parts": [{"text": user_input}]}

                final_response = {"role": "model", "parts": [{"text": final_output}]}

                conversation.append(
                    {
                        "invocation_id": str(uuid.uuid4()),
                        "user_content": user_content,
                        "final_response": final_response,
                        "intermediate_data": {
                            "tool_uses": tool_uses,
                            "intermediate_responses": [],
                        },
                        "creation_timestamp": creation_timestamp,
                    }
                )

            # One eval case per trace; all cases share the accumulating
            # `conversation` list (see NOTE above).
            eval_cases.append(
                {
                    "eval_id": f"veadk_eval_{formatted_timestamp()}",
                    "conversation": conversation,
                    "session_input": {
                        "app_name": app_name,
                        "user_id": user_id,
                        "state": {},
                    },
                    "creation_timestamp": creation_timestamp,
                }
            )

        # creation_timestamp here is whatever the last processed trace set.
        evalset = EvalSet(
            eval_set_id="default",
            name="default",
            description=None,
            eval_cases=eval_cases,
            creation_timestamp=creation_timestamp,
        )

        return evalset
371
+
372
    def build_eval_set(
        self, eval_set: Optional[EvalSet] = None, file_path: Optional[str] = None
    ):
        """Builds the invocation list from an eval set or a file.

        Parses the input source, extracts per-turn inputs, expected
        outputs and expected tool calls, and stores the resulting test
        cases in ``self.invocation_list``. When loading from a file,
        both the standard eval JSON format (a dict with ``eval_cases``)
        and the tracing JSON format (a list of span dicts) are detected
        and supported.

        Args:
            eval_set (Optional[EvalSet]): Direct eval set object;
                takes precedence over ``file_path`` when provided.
            file_path (Optional[str]): Path to a file to load instead.

        Raises:
            ValueError: If neither argument is provided, the file is not
                valid JSON, or the file format is unsupported.

        Note:
            A fresh random session ID is generated per case so each
            evaluation runs in an isolated session. Also appends to
            ``self.agent_information_list`` as a side effect.
        """

        if eval_set is None and file_path is None:
            raise ValueError("eval_set or file_path is required")
        if eval_set:
            eval_cases = eval_set.eval_cases
        else:
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    file_content = json.load(f)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON format in file {file_path}: {e}")
            except Exception as e:
                raise ValueError(f"Error reading file {file_path}: {e}")

            # Format sniffing: a dict with "eval_cases" is eval JSON; a
            # non-empty list where every entry has "trace_id" is tracing JSON.
            if isinstance(file_content, dict) and "eval_cases" in file_content:
                eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases
            elif (
                isinstance(file_content, list)
                and len(file_content) > 0
                and all(
                    isinstance(span, dict) and "trace_id" in span
                    for span in file_content
                )
            ):
                eval_cases = self._build_eval_set_from_tracing_json(
                    file_path
                ).eval_cases
            else:
                raise ValueError(
                    f"Unsupported file format in {file_path}. Please provide a valid file."
                )

        eval_case_data_list: list[EvalTestCase] = []
        for eval_case in eval_cases:
            eval_case_data = EvalTestCase(invocations=[])
            if eval_case.session_input:
                self.agent_information_list.append(
                    {
                        "app_name": eval_case.session_input.app_name,
                        "user_id": eval_case.session_input.user_id,
                        "session_id": str(
                            uuid.uuid4()
                        ),  # random session id for evaluation isolation
                    }
                )

            for invocation in eval_case.conversation:
                _input: str = ""
                _expected_output: str = ""
                _expected_tool: list[dict] = []

                # NOTE(review): assumes user_content/final_response always
                # have a non-empty parts[0] with text — TODO confirm.
                user_content = invocation.user_content
                _input = user_content.parts[0].text
                _expected_output = invocation.final_response.parts[0].text

                # Preferred source of expected tools: explicit tool_uses.
                if (
                    hasattr(invocation.intermediate_data, "tool_uses")
                    and invocation.intermediate_data.tool_uses
                ):
                    for expected_tool_use in invocation.intermediate_data.tool_uses:
                        _expected_tool.append(
                            {
                                "name": expected_tool_use.name,
                                "args": expected_tool_use.args,
                            }
                        )

                # Fallback: mine function_call parts out of invocation events.
                elif (
                    hasattr(invocation.intermediate_data, "invocation_events")
                    and invocation.intermediate_data.invocation_events
                ):
                    for event in invocation.intermediate_data.invocation_events:
                        if hasattr(event, "content") and hasattr(
                            event.content, "parts"
                        ):
                            for part in event.content.parts:
                                if (
                                    hasattr(part, "function_call")
                                    and part.function_call is not None
                                ):
                                    _expected_tool.append(
                                        {
                                            "name": part.function_call.name,
                                            "args": part.function_call.args,
                                        }
                                    )

                # Actual output/tools/latency are left blank here; they are
                # filled in later by generate_actual_outputs.
                eval_case_data.invocations.append(
                    Invocation(
                        invocation_id=invocation.invocation_id,
                        input=_input,
                        actual_output="",
                        actual_tool=[],
                        expected_output=_expected_output,
                        expected_tool=_expected_tool,
                        latency="",
                    )
                )

            eval_case_data_list.append(eval_case_data)
        self.invocation_list = eval_case_data_list
491
+
492
    async def generate_actual_outputs(self):
        """Generates actual outputs by running the agent on each input.

        For each test case, creates an isolated in-memory session, runs
        the agent through a ``Runner``, and records the final response,
        the tool calls made, and the wall-clock latency into each
        ``Invocation``'s actual fields.

        Returns:
            None: Updates the invocations in ``self.invocation_list``
            in place.

        Raises:
            Exception: If runner construction or agent execution fails.

        Note:
            Uses ``InMemorySessionService`` for isolation. Wires in the
            agent's ``long_term_memory`` as the memory service when the
            agent has one. Assumes ``build_eval_set`` ran first so that
            ``invocation_list`` and ``agent_information_list`` are
            parallel lists — TODO confirm extra cases without
            session_input cannot desynchronize the zip.
        """
        for eval_case_data, agent_information in zip(
            self.invocation_list, self.agent_information_list
        ):
            # Fresh session per case so runs do not share state.
            session_service = InMemorySessionService()
            _ = await session_service.create_session(
                app_name=agent_information["app_name"],
                user_id=agent_information["user_id"],
                state={},
                session_id=agent_information["session_id"],
            )

            # Attach the agent's long-term memory as memory service if present.
            if getattr(self.agent, "long_term_memory", None):
                runner = Runner(
                    app_name=agent_information["app_name"],
                    agent=self.agent,
                    session_service=session_service,
                    memory_service=self.agent.long_term_memory,
                )
            else:
                runner = Runner(
                    app_name=agent_information["app_name"],
                    agent=self.agent,
                    session_service=session_service,
                )

            for invocation in eval_case_data.invocations:
                _actual_output: str = ""
                _actual_tool: list[dict] = []
                _latency: str = ""
                final_response = None
                tool_uses = []
                invocation_id = ""

                user_content = types.Content(
                    role="user", parts=[types.Part(text=invocation.input)]
                )
                # Wall-clock timing around the full event stream.
                tik = time.time()
                async for event in runner.run_async(
                    user_id=agent_information["user_id"],
                    session_id=agent_information["session_id"],
                    new_message=user_content,
                ):
                    # Keep the first invocation_id seen.
                    invocation_id = (
                        event.invocation_id if not invocation_id else invocation_id
                    )
                    if (
                        event.is_final_response()
                        and event.content
                        and event.content.parts
                    ):
                        final_response = event.content
                    elif event.get_function_calls():
                        for call in event.get_function_calls():
                            tool_uses.append(call)
                tok = time.time()
                _latency = str((tok - tik) * 1000)  # milliseconds

                if final_response and final_response.parts:
                    _actual_output = final_response.parts[0].text
                for tool_use in tool_uses:
                    _actual_tool.append(
                        {
                            "name": tool_use.name,
                            "args": tool_use.args,
                        }
                    )

                invocation.actual_output = _actual_output
                invocation.actual_tool = _actual_tool
                invocation.latency = _latency
578
+
579
+ def get_eval_set_information(self) -> list[list[dict[str, Any]]]:
580
+ """Retrieves combined evaluation information.
581
+
582
+ This method merges invocations and results into dict lists.
583
+ Useful for reporting.
584
+
585
+ Returns:
586
+ list[list[dict[str, Any]]]: Nested list of case data dicts.
587
+
588
+ Note:
589
+ Defaults to empty results if not evaluated yet.
590
+ """
591
+ result = []
592
+ for i, eval_case in enumerate(self.invocation_list):
593
+ case_data = []
594
+ # Get corresponding eval_result or use default if not available
595
+ eval_result = (
596
+ self.result_list[i]
597
+ if i < len(self.result_list)
598
+ else EvalResultData(metric_results=[])
599
+ )
600
+ for invocation in eval_case.invocations:
601
+ data = {
602
+ "input": invocation.input,
603
+ "expected_output": invocation.expected_output,
604
+ "actual_output": invocation.actual_output,
605
+ "expected_tool": invocation.expected_tool,
606
+ "actual_tool": invocation.actual_tool,
607
+ "score": eval_result.average_score,
608
+ "reason": eval_result.total_reason,
609
+ "latency": invocation.latency,
610
+ }
611
+ case_data.append(data)
612
+ result.append(case_data)
613
+ return result
614
+
615
+ @abstractmethod
616
+ async def evaluate(
617
+ self,
618
+ metrics: list[Any],
619
+ eval_set: Optional[EvalSet],
620
+ eval_set_file_path: Optional[str],
621
+ eval_id: str,
622
+ ):
623
+ """Abstract method for performing the evaluation.
624
+
625
+ Subclasses implement specific metric evaluation logic.
626
+
627
+ Args:
628
+ metrics (list[Any]): Metrics to apply.
629
+ eval_set (Optional[EvalSet]): Eval set.
630
+ eval_set_file_path (Optional[str]): File path.
631
+ eval_id (str): Evaluation ID.
632
+
633
+ Returns:
634
+ Any: Evaluation results specific to subclass.
635
+
636
+ Raises:
637
+ NotImplementedError: If not overridden.
638
+
639
+ Note:
640
+ Must populate result_list after evaluation.
641
+ """
642
+ pass
@@ -0,0 +1,17 @@
1
+ # Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .deepeval_evaluator import DeepevalEvaluator
16
+
17
+ __all__ = ["DeepevalEvaluator"]