veadk-python 0.1.0 (veadk_python-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of veadk-python might be problematic.
- veadk/__init__.py +31 -0
- veadk/a2a/__init__.py +13 -0
- veadk/a2a/agent_card.py +45 -0
- veadk/a2a/remote_ve_agent.py +19 -0
- veadk/a2a/ve_a2a_server.py +77 -0
- veadk/a2a/ve_agent_executor.py +78 -0
- veadk/a2a/ve_task_store.py +37 -0
- veadk/agent.py +253 -0
- veadk/cli/__init__.py +13 -0
- veadk/cli/main.py +278 -0
- veadk/cli/services/agentpilot/__init__.py +17 -0
- veadk/cli/services/agentpilot/agentpilot.py +77 -0
- veadk/cli/services/veapig/__init__.py +17 -0
- veadk/cli/services/veapig/apig.py +224 -0
- veadk/cli/services/veapig/apig_utils.py +332 -0
- veadk/cli/services/vefaas/__init__.py +17 -0
- veadk/cli/services/vefaas/template/deploy.py +44 -0
- veadk/cli/services/vefaas/template/src/app.py +30 -0
- veadk/cli/services/vefaas/template/src/config.py +58 -0
- veadk/cli/services/vefaas/vefaas.py +346 -0
- veadk/cli/services/vefaas/vefaas_utils.py +408 -0
- veadk/cli/services/vetls/__init__.py +17 -0
- veadk/cli/services/vetls/vetls.py +87 -0
- veadk/cli/studio/__init__.py +13 -0
- veadk/cli/studio/agent_processor.py +247 -0
- veadk/cli/studio/fast_api.py +232 -0
- veadk/cli/studio/model.py +116 -0
- veadk/cloud/__init__.py +13 -0
- veadk/cloud/cloud_agent_engine.py +144 -0
- veadk/cloud/cloud_app.py +123 -0
- veadk/cloud/template/app.py +30 -0
- veadk/cloud/template/config.py +55 -0
- veadk/config.py +131 -0
- veadk/consts.py +17 -0
- veadk/database/__init__.py +17 -0
- veadk/database/base_database.py +45 -0
- veadk/database/database_factory.py +80 -0
- veadk/database/kv/__init__.py +13 -0
- veadk/database/kv/redis_database.py +109 -0
- veadk/database/local_database.py +43 -0
- veadk/database/relational/__init__.py +13 -0
- veadk/database/relational/mysql_database.py +114 -0
- veadk/database/vector/__init__.py +13 -0
- veadk/database/vector/opensearch_vector_database.py +205 -0
- veadk/database/vector/type.py +50 -0
- veadk/database/viking/__init__.py +13 -0
- veadk/database/viking/viking_database.py +378 -0
- veadk/database/viking/viking_memory_db.py +521 -0
- veadk/evaluation/__init__.py +17 -0
- veadk/evaluation/adk_evaluator/__init__.py +13 -0
- veadk/evaluation/adk_evaluator/adk_evaluator.py +291 -0
- veadk/evaluation/base_evaluator.py +242 -0
- veadk/evaluation/deepeval_evaluator/__init__.py +17 -0
- veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +223 -0
- veadk/evaluation/eval_set_file_loader.py +28 -0
- veadk/evaluation/eval_set_recorder.py +91 -0
- veadk/evaluation/utils/prometheus.py +142 -0
- veadk/knowledgebase/__init__.py +17 -0
- veadk/knowledgebase/knowledgebase.py +83 -0
- veadk/knowledgebase/knowledgebase_database_adapter.py +259 -0
- veadk/memory/__init__.py +13 -0
- veadk/memory/long_term_memory.py +119 -0
- veadk/memory/memory_database_adapter.py +235 -0
- veadk/memory/short_term_memory.py +124 -0
- veadk/memory/short_term_memory_processor.py +90 -0
- veadk/prompts/__init__.py +13 -0
- veadk/prompts/agent_default_prompt.py +30 -0
- veadk/prompts/prompt_evaluator.py +20 -0
- veadk/prompts/prompt_memory_processor.py +55 -0
- veadk/prompts/prompt_optimization.py +158 -0
- veadk/runner.py +252 -0
- veadk/tools/__init__.py +13 -0
- veadk/tools/builtin_tools/__init__.py +13 -0
- veadk/tools/builtin_tools/lark.py +67 -0
- veadk/tools/builtin_tools/las.py +23 -0
- veadk/tools/builtin_tools/vesearch.py +49 -0
- veadk/tools/builtin_tools/web_scraper.py +76 -0
- veadk/tools/builtin_tools/web_search.py +192 -0
- veadk/tools/demo_tools.py +58 -0
- veadk/tools/load_knowledgebase_tool.py +144 -0
- veadk/tools/sandbox/__init__.py +13 -0
- veadk/tools/sandbox/browser_sandbox.py +27 -0
- veadk/tools/sandbox/code_sandbox.py +30 -0
- veadk/tools/sandbox/computer_sandbox.py +27 -0
- veadk/tracing/__init__.py +13 -0
- veadk/tracing/base_tracer.py +172 -0
- veadk/tracing/telemetry/__init__.py +13 -0
- veadk/tracing/telemetry/exporters/__init__.py +13 -0
- veadk/tracing/telemetry/exporters/apiserver_exporter.py +60 -0
- veadk/tracing/telemetry/exporters/apmplus_exporter.py +101 -0
- veadk/tracing/telemetry/exporters/base_exporter.py +28 -0
- veadk/tracing/telemetry/exporters/cozeloop_exporter.py +69 -0
- veadk/tracing/telemetry/exporters/inmemory_exporter.py +88 -0
- veadk/tracing/telemetry/exporters/tls_exporter.py +78 -0
- veadk/tracing/telemetry/metrics/__init__.py +13 -0
- veadk/tracing/telemetry/metrics/opentelemetry_metrics.py +73 -0
- veadk/tracing/telemetry/opentelemetry_tracer.py +167 -0
- veadk/types.py +23 -0
- veadk/utils/__init__.py +13 -0
- veadk/utils/logger.py +59 -0
- veadk/utils/misc.py +33 -0
- veadk/utils/patches.py +85 -0
- veadk/utils/volcengine_sign.py +199 -0
- veadk/version.py +15 -0
- veadk_python-0.1.0.dist-info/METADATA +124 -0
- veadk_python-0.1.0.dist-info/RECORD +110 -0
- veadk_python-0.1.0.dist-info/WHEEL +5 -0
- veadk_python-0.1.0.dist-info/entry_points.txt +2 -0
- veadk_python-0.1.0.dist-info/licenses/LICENSE +201 -0
- veadk_python-0.1.0.dist-info/top_level.txt +1 -0
veadk/evaluation/adk_evaluator/adk_evaluator.py
@@ -0,0 +1,291 @@

```python
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time
import uuid
from os import path
from typing import Any, Optional

from google.adk import Runner
from google.adk.artifacts import BaseArtifactService, InMemoryArtifactService
from google.adk.evaluation.agent_evaluator import (
    NUM_RUNS,
    RESPONSE_MATCH_SCORE_KEY,
    TOOL_TRAJECTORY_SCORE_KEY,
    AgentEvaluator,
)
from google.adk.evaluation.eval_case import IntermediateData, Invocation, SessionInput
from google.adk.evaluation.eval_set import EvalSet
from google.adk.evaluation.evaluation_generator import (
    EvalCaseResponses,
    EvaluationGenerator,
)
from google.adk.evaluation.evaluator import EvalStatus, EvaluationResult
from google.adk.sessions import BaseSessionService, InMemorySessionService
from typing_extensions import override

from veadk.agent import Agent

from ..base_evaluator import BaseEvaluator


def formatted_timestamp():
    # YYYYMMDDHHMMSS
    return time.strftime("%Y%m%d%H%M%S", time.localtime())


class VeEvaluationGenerator(EvaluationGenerator):
    @staticmethod
    async def _ve_process_query(
        invocations: list[Invocation],
        agent: Agent,
        agent_name: Optional[str] = None,
        initial_session: Optional[SessionInput] = None,
    ):
        agent_to_evaluate = agent
        if agent_name:
            agent_to_evaluate = agent.find_agent(agent_name)
            assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."

        return await VeEvaluationGenerator._ve_generate_inferences_from_root_agent(
            invocations, agent_to_evaluate, None, initial_session
        )

    @staticmethod
    async def ve_generate_responses(
        eval_set: EvalSet,
        agent: Agent,
        repeat_num: int = 3,
        agent_name: Optional[str] = None,
    ):
        results = []

        for eval_case in eval_set.eval_cases:
            responses = []
            for _ in range(repeat_num):
                response_invocations = await VeEvaluationGenerator._ve_process_query(
                    invocations=eval_case.conversation,
                    agent=agent,
                    agent_name=agent_name,
                    initial_session=eval_case.session_input,
                )
                responses.append(response_invocations)

            results.append(EvalCaseResponses(eval_case=eval_case, responses=responses))

        return results

    @staticmethod
    async def _ve_generate_inferences_from_root_agent(
        invocations: list[Invocation],
        root_agent: Agent,
        reset_func: Any,
        initial_session: Optional[SessionInput] = None,
        session_id: Optional[str] = None,
        session_service: Optional[BaseSessionService] = None,
        artifact_service: Optional[BaseArtifactService] = None,
    ) -> list[Invocation]:
        """Scrapes the root agent given the list of Invocations."""
        if not session_service:
            session_service = InMemorySessionService()

        app_name = (
            initial_session.app_name if initial_session else "EvaluationGenerator"
        )
        user_id = initial_session.user_id if initial_session else "test_user_id"
        session_id = session_id if session_id else str(uuid.uuid4())

        _ = await session_service.create_session(
            app_name=app_name,
            user_id=user_id,
            state=initial_session.state if initial_session else {},
            session_id=session_id,
        )

        if not artifact_service:
            artifact_service = InMemoryArtifactService()

        if getattr(root_agent, "long_term_memory", None) is not None:
            runner = Runner(
                app_name=app_name,
                agent=root_agent,
                artifact_service=artifact_service,
                session_service=session_service,
                memory_service=root_agent.long_term_memory,  # add long_term_memory
            )
        else:
            runner = Runner(
                app_name=app_name,
                agent=root_agent,
                artifact_service=artifact_service,
                session_service=session_service,
            )

        # Reset agent state for each query
        if callable(reset_func):
            reset_func()

        response_invocations = []

        for invocation in invocations:
            final_response = None
            user_content = invocation.user_content
            tool_uses = []
            invocation_id = ""

            async for event in runner.run_async(
                user_id=user_id, session_id=session_id, new_message=user_content
            ):
                invocation_id = (
                    event.invocation_id if not invocation_id else invocation_id
                )

                if event.is_final_response() and event.content and event.content.parts:
                    final_response = event.content
                elif event.get_function_calls():
                    for call in event.get_function_calls():
                        tool_uses.append(call)

            response_invocations.append(
                Invocation(
                    invocation_id=invocation_id,
                    user_content=user_content,
                    final_response=final_response,
                    intermediate_data=IntermediateData(tool_uses=tool_uses),
                )
            )

        return response_invocations


class VeAgentEvaluator(AgentEvaluator):
    def __init__(self):
        super().__init__()

    @staticmethod
    async def ve_evaluate_eval_set(
        agent: Agent,
        eval_set: EvalSet,
        criteria: dict[str, float],
        num_runs: int = NUM_RUNS,
        agent_name: Optional[str] = None,
        print_detailed_results: bool = True,
    ):
        eval_case_responses_list = await VeEvaluationGenerator.ve_generate_responses(
            eval_set=eval_set,
            agent=agent,
            repeat_num=num_runs,
            agent_name=agent_name,
        )
        failures = []
        evaluation_result_list = []

        for eval_case_responses in eval_case_responses_list:
            actual_invocations = [
                invocation
                for invocations in eval_case_responses.responses
                for invocation in invocations
            ]
            expected_invocations = eval_case_responses.eval_case.conversation * num_runs

            for metric_name, threshold in criteria.items():
                metric_evaluator = AgentEvaluator._get_metric_evaluator(
                    metric_name=metric_name, threshold=threshold
                )

                evaluation_result: EvaluationResult = (
                    metric_evaluator.evaluate_invocations(
                        actual_invocations=actual_invocations,
                        expected_invocations=expected_invocations,
                    )
                )

                if print_detailed_results:
                    AgentEvaluator._print_details(
                        evaluation_result=evaluation_result,
                        metric_name=metric_name,
                        threshold=threshold,
                    )

                # Gather all the failures.
                if evaluation_result.overall_eval_status != EvalStatus.PASSED:
                    failures.append(
                        f"{metric_name} for {agent.name} Failed. Expected {threshold},"
                        f" but got {evaluation_result.overall_score}."
                    )
                evaluation_result_list.append(evaluation_result)

        return evaluation_result_list, failures


class ADKEvaluator(BaseEvaluator):
    def __init__(
        self,
        agent,
        name: str = "veadk_adk_evaluator",
    ):
        super().__init__(agent=agent, name=name)

        # TODO: implement

    @override
    async def eval(
        self,
        eval_set_file_path: str,
        eval_id: str = f"test_{formatted_timestamp()}",  # note: default is computed once, at import time
        tool_score_threshold: float = 1.0,
        response_match_score_threshold: float = 0.8,
        num_runs: int = 2,
        print_detailed_results: bool = True,
    ):
        test_files = []
        eval_dataset_file_path_or_dir = eval_set_file_path
        if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir(
            eval_dataset_file_path_or_dir
        ):
            for root, _, files in os.walk(eval_dataset_file_path_or_dir):
                for file in files:
                    if file.endswith(".test.json"):
                        test_files.append(path.join(root, file))
        else:
            test_files = [eval_dataset_file_path_or_dir]

        initial_session = AgentEvaluator._get_initial_session()

        result = []
        failures = []
        for test_file in test_files:
            criteria = {
                TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold,  # 1-point scale; 1.0 is perfect.
                RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold,  # ROUGE-1 text match; 0.8 is the default.
            }
            eval_set = AgentEvaluator._load_eval_set_from_file(
                test_file, criteria, initial_session
            )

            res, fail = await VeAgentEvaluator.ve_evaluate_eval_set(
                agent=self.agent,
                eval_set=eval_set,
                criteria=criteria,
                num_runs=num_runs,
                agent_name=self.agent.name,
                print_detailed_results=print_detailed_results,
            )
            result.append(res)
            failures.extend(fail)

        return result, failures
```
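For orientation, here is a minimal usage sketch of the `ADKEvaluator` added above. The `ADKEvaluator` and `eval(...)` signatures come from the diff; how an `Agent` is constructed is not shown in this excerpt, so the constructor argument below is an assumption.

```python
# Hypothetical usage sketch; the Agent construction is an assumption,
# everything else follows the signatures in adk_evaluator.py above.
import asyncio

from veadk.agent import Agent
from veadk.evaluation.adk_evaluator.adk_evaluator import ADKEvaluator


async def main():
    agent = Agent(name="demo_agent")  # assumed constructor arguments

    evaluator = ADKEvaluator(agent=agent)
    # A directory is walked for *.test.json files; a single file path also works.
    results, failures = await evaluator.eval(
        eval_set_file_path="evalsets/",
        tool_score_threshold=1.0,  # tool trajectory must match exactly
        response_match_score_threshold=0.8,  # ROUGE-1 response similarity
        num_runs=2,  # each eval case is repeated twice
    )
    for failure in failures:
        print(failure)


if __name__ == "__main__":
    asyncio.run(main())
```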
veadk/evaluation/base_evaluator.py
@@ -0,0 +1,242 @@

```python
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
import uuid
from abc import abstractmethod
from typing import Any

from google.adk import Runner
from google.adk.evaluation.eval_set import EvalSet
from google.adk.sessions import InMemorySessionService
from google.genai import types
from pydantic import BaseModel


class InvocationTestData(BaseModel):
    invocation_id: str = ""
    input: str
    actual_output: str
    expected_output: str
    actual_tool: list[dict] = []
    expected_tool: list[dict] = []
    latency: str = ""  # ms


class EvalCaseData(BaseModel):
    invocations: list[InvocationTestData]


class MetricResult(BaseModel):
    metric_type: str
    success: bool
    score: float
    reason: str


class EvalResultData(BaseModel):
    metric_results: list[MetricResult]
    average_score: float = 0.0
    total_reason: str = ""

    def calculate_average_score(self):
        total_score = sum(result.score for result in self.metric_results)
        self.average_score = (
            total_score / len(self.metric_results) if self.metric_results else 0.0
        )

    def generate_total_reason(self):
        self.total_reason = "\n".join(
            f"{result.metric_type}: {result.reason}" for result in self.metric_results
        )

    def call_before_append(self):
        self.calculate_average_score()
        self.generate_total_reason()


class BaseEvaluator:
    def __init__(
        self,
        agent,
        name: str,
    ):
        self.name = name
        self.agent = agent
        self.invocation_list: list[EvalCaseData] = []
        self.result_list: list[EvalResultData] = []
        self.agent_information_list: list[dict] = []

    def load_eval_set(self, eval_set_file: str) -> EvalSet:
        from .eval_set_file_loader import load_eval_set_from_file

        return load_eval_set_from_file(eval_set_file)

    def generate_eval_data(self, eval_set_file_path: str):
        eval_case_data_list: list[EvalCaseData] = []

        eval_cases = self.load_eval_set(eval_set_file_path).eval_cases
        for eval_case in eval_cases:
            eval_case_data = EvalCaseData(invocations=[])
            self.agent_information_list.append(
                {
                    "app_name": eval_case.session_input.app_name,
                    "user_id": eval_case.session_input.user_id,
                    "session_id": str(uuid.uuid4()),  # random session id for evaluation
                }
            )

            for invocation in eval_case.conversation:
                _input: str = ""
                _expected_output: str = ""
                _expected_tool: list[dict] = []

                user_content = invocation.user_content
                _input = user_content.parts[0].text
                _expected_output = invocation.final_response.parts[0].text

                if invocation.intermediate_data.tool_uses:
                    for expected_tool_use in invocation.intermediate_data.tool_uses:
                        _expected_tool.append(
                            {
                                "name": expected_tool_use.name,
                                "args": expected_tool_use.args,
                            }
                        )

                eval_case_data.invocations.append(
                    InvocationTestData(
                        invocation_id=invocation.invocation_id,
                        input=_input,
                        actual_output="",
                        actual_tool=[],
                        expected_output=_expected_output,
                        expected_tool=_expected_tool,
                        latency="",
                    )
                )

            eval_case_data_list.append(eval_case_data)
        self.invocation_list = eval_case_data_list

    async def _run_agent_for_actual_data(self):
        for eval_case_data, agent_information in zip(
            self.invocation_list, self.agent_information_list
        ):
            session_service = InMemorySessionService()
            _ = await session_service.create_session(
                app_name=agent_information["app_name"],
                user_id=agent_information["user_id"],
                state={},
                session_id=agent_information["session_id"],
            )

            if getattr(self.agent, "long_term_memory", None):
                runner = Runner(
                    app_name=agent_information["app_name"],
                    agent=self.agent,
                    session_service=session_service,
                    memory_service=self.agent.long_term_memory,
                )
            else:
                runner = Runner(
                    app_name=agent_information["app_name"],
                    agent=self.agent,
                    session_service=session_service,
                )

            for invocation in eval_case_data.invocations:
                _actual_output: str = ""
                _actual_tool: list[dict] = []
                _latency: str = ""
                final_response = None
                tool_uses = []
                invocation_id = ""

                user_content = types.Content(
                    role="user", parts=[types.Part(text=invocation.input)]
                )
                tik = time.time()
                async for event in runner.run_async(
                    user_id=agent_information["user_id"],
                    session_id=agent_information["session_id"],
                    new_message=user_content,
                ):
                    invocation_id = (
                        event.invocation_id if not invocation_id else invocation_id
                    )
                    if (
                        event.is_final_response()
                        and event.content
                        and event.content.parts
                    ):
                        final_response = event.content
                    elif event.get_function_calls():
                        for call in event.get_function_calls():
                            tool_uses.append(call)
                tok = time.time()
                _latency = str((tok - tik) * 1000)

                if final_response and final_response.parts:
                    _actual_output = final_response.parts[0].text
                for tool_use in tool_uses:
                    _actual_tool.append(
                        {
                            "name": tool_use.name,
                            "args": tool_use.args,
                        }
                    )

                invocation.actual_output = _actual_output
                invocation.actual_tool = _actual_tool
                invocation.latency = _latency

    def get_data(self) -> list[list[dict[str, Any]]]:
        """Merge the evaluation data and return it in the format of list[list[dict]]."""
        result = []
        for i, eval_case in enumerate(self.invocation_list):
            case_data = []
            # Get the corresponding eval_result, or use a default if not available.
            eval_result = (
                self.result_list[i]
                if i < len(self.result_list)
                else EvalResultData(metric_results=[])
            )
            for invocation in eval_case.invocations:
                data = {
                    "input": invocation.input,
                    "expected_output": invocation.expected_output,
                    "actual_output": invocation.actual_output,
                    "expected_tool": invocation.expected_tool,
                    "actual_tool": invocation.actual_tool,
                    "score": eval_result.average_score,
                    "reason": eval_result.total_reason,
                    "latency": invocation.latency,
                }
                case_data.append(data)
            result.append(case_data)
        return result

    # NOTE: BaseEvaluator does not inherit abc.ABC, so @abstractmethod is
    # advisory here; subclasses such as ADKEvaluator override eval().
    @abstractmethod
    async def eval(
        self,
        eval_set_file_path: str,
        metrics: list[Any],
        eval_id: str,
    ):
        """An abstract method for evaluation based on metrics."""
        pass
```
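The `EvalResultData` model above folds per-metric results into one average score and a combined reason string via `call_before_append()`. A self-contained sketch (metric names and reasons below are illustrative, not taken from the package):

```python
from veadk.evaluation.base_evaluator import EvalResultData, MetricResult

result = EvalResultData(
    metric_results=[
        MetricResult(
            metric_type="tool_trajectory_avg_score",  # illustrative metric name
            success=True,
            score=1.0,
            reason="all expected tool calls were issued",
        ),
        MetricResult(
            metric_type="response_match_score",  # illustrative metric name
            success=False,
            score=0.6,
            reason="response diverged from the reference answer",
        ),
    ]
)
result.call_before_append()  # populates average_score and total_reason
print(result.average_score)  # 0.8
print(result.total_reason)   # one "<metric>: <reason>" line per metric
```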
veadk/evaluation/deepeval_evaluator/__init__.py
@@ -0,0 +1,17 @@

```python
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .deepeval_evaluator import DeepevalEvaluator

__all__ = ["DeepevalEvaluator"]
```