veadk-python 0.2.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veadk/__init__.py +37 -0
- veadk/a2a/__init__.py +13 -0
- veadk/a2a/agent_card.py +45 -0
- veadk/a2a/remote_ve_agent.py +390 -0
- veadk/a2a/utils/__init__.py +13 -0
- veadk/a2a/utils/agent_to_a2a.py +170 -0
- veadk/a2a/ve_a2a_server.py +93 -0
- veadk/a2a/ve_agent_executor.py +78 -0
- veadk/a2a/ve_middlewares.py +313 -0
- veadk/a2a/ve_task_store.py +37 -0
- veadk/agent.py +402 -0
- veadk/agent_builder.py +93 -0
- veadk/agents/loop_agent.py +68 -0
- veadk/agents/parallel_agent.py +72 -0
- veadk/agents/sequential_agent.py +64 -0
- veadk/auth/__init__.py +13 -0
- veadk/auth/base_auth.py +22 -0
- veadk/auth/ve_credential_service.py +203 -0
- veadk/auth/veauth/__init__.py +13 -0
- veadk/auth/veauth/apmplus_veauth.py +58 -0
- veadk/auth/veauth/ark_veauth.py +75 -0
- veadk/auth/veauth/base_veauth.py +50 -0
- veadk/auth/veauth/cozeloop_veauth.py +13 -0
- veadk/auth/veauth/opensearch_veauth.py +75 -0
- veadk/auth/veauth/postgresql_veauth.py +75 -0
- veadk/auth/veauth/prompt_pilot_veauth.py +60 -0
- veadk/auth/veauth/speech_veauth.py +54 -0
- veadk/auth/veauth/utils.py +69 -0
- veadk/auth/veauth/vesearch_veauth.py +62 -0
- veadk/auth/veauth/viking_mem0_veauth.py +91 -0
- veadk/cli/__init__.py +13 -0
- veadk/cli/cli.py +58 -0
- veadk/cli/cli_clean.py +87 -0
- veadk/cli/cli_create.py +163 -0
- veadk/cli/cli_deploy.py +233 -0
- veadk/cli/cli_eval.py +215 -0
- veadk/cli/cli_init.py +214 -0
- veadk/cli/cli_kb.py +110 -0
- veadk/cli/cli_pipeline.py +285 -0
- veadk/cli/cli_prompt.py +86 -0
- veadk/cli/cli_update.py +106 -0
- veadk/cli/cli_uploadevalset.py +139 -0
- veadk/cli/cli_web.py +143 -0
- veadk/cloud/__init__.py +13 -0
- veadk/cloud/cloud_agent_engine.py +485 -0
- veadk/cloud/cloud_app.py +475 -0
- veadk/config.py +115 -0
- veadk/configs/__init__.py +13 -0
- veadk/configs/auth_configs.py +133 -0
- veadk/configs/database_configs.py +132 -0
- veadk/configs/model_configs.py +78 -0
- veadk/configs/tool_configs.py +54 -0
- veadk/configs/tracing_configs.py +110 -0
- veadk/consts.py +74 -0
- veadk/evaluation/__init__.py +17 -0
- veadk/evaluation/adk_evaluator/__init__.py +17 -0
- veadk/evaluation/adk_evaluator/adk_evaluator.py +302 -0
- veadk/evaluation/base_evaluator.py +642 -0
- veadk/evaluation/deepeval_evaluator/__init__.py +17 -0
- veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +339 -0
- veadk/evaluation/eval_set_file_loader.py +48 -0
- veadk/evaluation/eval_set_recorder.py +146 -0
- veadk/evaluation/types.py +65 -0
- veadk/evaluation/utils/prometheus.py +196 -0
- veadk/integrations/__init__.py +13 -0
- veadk/integrations/ve_apig/__init__.py +13 -0
- veadk/integrations/ve_apig/ve_apig.py +349 -0
- veadk/integrations/ve_apig/ve_apig_utils.py +332 -0
- veadk/integrations/ve_code_pipeline/__init__.py +13 -0
- veadk/integrations/ve_code_pipeline/ve_code_pipeline.py +431 -0
- veadk/integrations/ve_cozeloop/__init__.py +13 -0
- veadk/integrations/ve_cozeloop/ve_cozeloop.py +96 -0
- veadk/integrations/ve_cr/__init__.py +13 -0
- veadk/integrations/ve_cr/ve_cr.py +220 -0
- veadk/integrations/ve_faas/__init__.py +13 -0
- veadk/integrations/ve_faas/template/cookiecutter.json +15 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/config.yaml.example +6 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/deploy.py +106 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__init__.py +13 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/agent.py +25 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/app.py +202 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/requirements.txt +3 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/run.sh +49 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/__init__.py +14 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/agent.py +27 -0
- veadk/integrations/ve_faas/ve_faas.py +754 -0
- veadk/integrations/ve_faas/ve_faas_utils.py +408 -0
- veadk/integrations/ve_faas/web_template/cookiecutter.json +20 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/config.yaml.example +2 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/deploy.py +44 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/Dockerfile +23 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/app.py +123 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/init_db.py +46 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/models.py +36 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/requirements.txt +4 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/run.sh +21 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/css/style.css +368 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/js/admin.js +0 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/dashboard.html +21 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/edit_post.html +24 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/login.html +21 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/posts.html +53 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/base.html +45 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/index.html +29 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/post.html +14 -0
- veadk/integrations/ve_identity/__init__.py +110 -0
- veadk/integrations/ve_identity/auth_config.py +261 -0
- veadk/integrations/ve_identity/auth_mixins.py +650 -0
- veadk/integrations/ve_identity/auth_processor.py +385 -0
- veadk/integrations/ve_identity/function_tool.py +158 -0
- veadk/integrations/ve_identity/identity_client.py +864 -0
- veadk/integrations/ve_identity/mcp_tool.py +181 -0
- veadk/integrations/ve_identity/mcp_toolset.py +431 -0
- veadk/integrations/ve_identity/models.py +228 -0
- veadk/integrations/ve_identity/token_manager.py +188 -0
- veadk/integrations/ve_identity/utils.py +151 -0
- veadk/integrations/ve_prompt_pilot/__init__.py +13 -0
- veadk/integrations/ve_prompt_pilot/ve_prompt_pilot.py +85 -0
- veadk/integrations/ve_tls/__init__.py +13 -0
- veadk/integrations/ve_tls/utils.py +116 -0
- veadk/integrations/ve_tls/ve_tls.py +212 -0
- veadk/integrations/ve_tos/ve_tos.py +710 -0
- veadk/integrations/ve_viking_db_memory/__init__.py +13 -0
- veadk/integrations/ve_viking_db_memory/ve_viking_db_memory.py +308 -0
- veadk/knowledgebase/__init__.py +17 -0
- veadk/knowledgebase/backends/__init__.py +13 -0
- veadk/knowledgebase/backends/base_backend.py +72 -0
- veadk/knowledgebase/backends/in_memory_backend.py +91 -0
- veadk/knowledgebase/backends/opensearch_backend.py +162 -0
- veadk/knowledgebase/backends/redis_backend.py +172 -0
- veadk/knowledgebase/backends/utils.py +92 -0
- veadk/knowledgebase/backends/vikingdb_knowledge_backend.py +608 -0
- veadk/knowledgebase/entry.py +25 -0
- veadk/knowledgebase/knowledgebase.py +307 -0
- veadk/memory/__init__.py +35 -0
- veadk/memory/long_term_memory.py +365 -0
- veadk/memory/long_term_memory_backends/__init__.py +13 -0
- veadk/memory/long_term_memory_backends/base_backend.py +35 -0
- veadk/memory/long_term_memory_backends/in_memory_backend.py +67 -0
- veadk/memory/long_term_memory_backends/mem0_backend.py +155 -0
- veadk/memory/long_term_memory_backends/opensearch_backend.py +124 -0
- veadk/memory/long_term_memory_backends/redis_backend.py +140 -0
- veadk/memory/long_term_memory_backends/vikingdb_memory_backend.py +189 -0
- veadk/memory/short_term_memory.py +252 -0
- veadk/memory/short_term_memory_backends/__init__.py +13 -0
- veadk/memory/short_term_memory_backends/base_backend.py +31 -0
- veadk/memory/short_term_memory_backends/mysql_backend.py +49 -0
- veadk/memory/short_term_memory_backends/postgresql_backend.py +49 -0
- veadk/memory/short_term_memory_backends/sqlite_backend.py +55 -0
- veadk/memory/short_term_memory_processor.py +100 -0
- veadk/processors/__init__.py +26 -0
- veadk/processors/base_run_processor.py +120 -0
- veadk/prompts/__init__.py +13 -0
- veadk/prompts/agent_default_prompt.py +30 -0
- veadk/prompts/prompt_evaluator.py +20 -0
- veadk/prompts/prompt_memory_processor.py +55 -0
- veadk/prompts/prompt_optimization.py +150 -0
- veadk/runner.py +732 -0
- veadk/tools/__init__.py +13 -0
- veadk/tools/builtin_tools/__init__.py +13 -0
- veadk/tools/builtin_tools/agent_authorization.py +94 -0
- veadk/tools/builtin_tools/generate_image.py +23 -0
- veadk/tools/builtin_tools/image_edit.py +300 -0
- veadk/tools/builtin_tools/image_generate.py +446 -0
- veadk/tools/builtin_tools/lark.py +67 -0
- veadk/tools/builtin_tools/las.py +24 -0
- veadk/tools/builtin_tools/link_reader.py +66 -0
- veadk/tools/builtin_tools/llm_shield.py +381 -0
- veadk/tools/builtin_tools/load_knowledgebase.py +97 -0
- veadk/tools/builtin_tools/mcp_router.py +29 -0
- veadk/tools/builtin_tools/run_code.py +113 -0
- veadk/tools/builtin_tools/tts.py +253 -0
- veadk/tools/builtin_tools/vesearch.py +49 -0
- veadk/tools/builtin_tools/video_generate.py +363 -0
- veadk/tools/builtin_tools/web_scraper.py +76 -0
- veadk/tools/builtin_tools/web_search.py +83 -0
- veadk/tools/demo_tools.py +58 -0
- veadk/tools/load_knowledgebase_tool.py +149 -0
- veadk/tools/sandbox/__init__.py +13 -0
- veadk/tools/sandbox/browser_sandbox.py +37 -0
- veadk/tools/sandbox/code_sandbox.py +40 -0
- veadk/tools/sandbox/computer_sandbox.py +34 -0
- veadk/tracing/__init__.py +13 -0
- veadk/tracing/base_tracer.py +58 -0
- veadk/tracing/telemetry/__init__.py +13 -0
- veadk/tracing/telemetry/attributes/attributes.py +29 -0
- veadk/tracing/telemetry/attributes/extractors/common_attributes_extractors.py +180 -0
- veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +858 -0
- veadk/tracing/telemetry/attributes/extractors/tool_attributes_extractors.py +152 -0
- veadk/tracing/telemetry/attributes/extractors/types.py +164 -0
- veadk/tracing/telemetry/exporters/__init__.py +13 -0
- veadk/tracing/telemetry/exporters/apmplus_exporter.py +558 -0
- veadk/tracing/telemetry/exporters/base_exporter.py +39 -0
- veadk/tracing/telemetry/exporters/cozeloop_exporter.py +129 -0
- veadk/tracing/telemetry/exporters/inmemory_exporter.py +248 -0
- veadk/tracing/telemetry/exporters/tls_exporter.py +139 -0
- veadk/tracing/telemetry/opentelemetry_tracer.py +320 -0
- veadk/tracing/telemetry/telemetry.py +411 -0
- veadk/types.py +47 -0
- veadk/utils/__init__.py +13 -0
- veadk/utils/audio_manager.py +95 -0
- veadk/utils/auth.py +294 -0
- veadk/utils/logger.py +59 -0
- veadk/utils/mcp_utils.py +44 -0
- veadk/utils/misc.py +184 -0
- veadk/utils/patches.py +101 -0
- veadk/utils/volcengine_sign.py +205 -0
- veadk/version.py +15 -0
- veadk_python-0.2.27.dist-info/METADATA +373 -0
- veadk_python-0.2.27.dist-info/RECORD +218 -0
- veadk_python-0.2.27.dist-info/WHEEL +5 -0
- veadk_python-0.2.27.dist-info/entry_points.txt +2 -0
- veadk_python-0.2.27.dist-info/licenses/LICENSE +201 -0
- veadk_python-0.2.27.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,642 @@
|
|
|
1
|
+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import time
|
|
18
|
+
import uuid
|
|
19
|
+
from abc import abstractmethod
|
|
20
|
+
from typing import Any, Optional
|
|
21
|
+
|
|
22
|
+
from google.adk import Runner
|
|
23
|
+
from google.adk.evaluation.eval_set import EvalSet
|
|
24
|
+
from google.adk.sessions import InMemorySessionService
|
|
25
|
+
from google.genai import types
|
|
26
|
+
from pydantic import BaseModel
|
|
27
|
+
|
|
28
|
+
from veadk.utils.misc import formatted_timestamp
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ToolInvocation(BaseModel):
    """A single tool call captured during agent execution.

    Records which tool was invoked, the arguments it received, and the
    value it produced; used to track tool usage while evaluating.

    Attributes:
        tool_name (str): Name of the invoked tool.
        tool_args (dict[str, Any]): Arguments supplied to the tool.
            Defaults to an empty dict.
        tool_result (Any): Value returned by the tool. Defaults to None.

    Note:
        Intentionally loose typing so any tool and result shape fits.
    """

    tool_name: str
    tool_args: dict[str, Any] = {}
    tool_result: Any = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class Invocation(BaseModel):
    """One evaluation turn: input plus expected and actual results.

    Holds the user prompt, the agent's actual response, the reference
    response, the tools seen on both sides, and the measured latency,
    so agent behavior can be compared against expectations.

    Attributes:
        invocation_id (str): Unique identifier for the turn. Defaults to "".
        input (str): User input prompt.
        actual_output (str): Response the agent actually produced.
        expected_output (str): Reference response.
        actual_tool (list[dict]): Tools actually called, with details.
        expected_tool (list[dict]): Tools expected to be called.
        latency (str): Execution time in milliseconds. Defaults to "".

    Note:
        Each tool entry is a dict carrying 'name' and 'args'.
    """

    invocation_id: str = ""
    input: str
    actual_output: str
    expected_output: str
    actual_tool: list[dict] = []
    expected_tool: list[dict] = []
    latency: str = ""  # ms
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class EvalTestCase(BaseModel):
    """Container for the invocations of one evaluation scenario.

    A test case corresponds to a single session or conversation and
    simply groups its ordered invocations.

    Attributes:
        invocations (list[Invocation]): Invocation objects of this case.
    """

    invocations: list[Invocation]
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class MetricResult(BaseModel):
    """Outcome of applying a single metric to an evaluation.

    Captures whether the metric passed, its numerical score, and the
    explanation behind that score.

    Attributes:
        metric_type (str): Type or name of the metric.
        success (bool): Whether the metric passed.
        score (float): Numerical score, typically between 0 and 1.
        reason (str): Explanation for the score.
    """

    metric_type: str
    success: bool
    score: float
    reason: str
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class EvalResultData(BaseModel):
    """Aggregated metric results for one evaluation case.

    Collects individual MetricResult entries and derives an average
    score plus a combined reason string from them.

    Attributes:
        metric_results (list[MetricResult]): Individual metric outcomes.
        average_score (float): Mean of the metric scores. Defaults to 0.0.
        total_reason (str): Newline-joined reasons. Defaults to "".

    Note:
        Call call_before_append to populate the derived fields before
        storing the instance.
    """

    metric_results: list[MetricResult]
    average_score: float = 0.0
    total_reason: str = ""

    def calculate_average_score(self):
        """Computes the mean score over all metric results.

        Updates the average_score attribute in place. An empty
        metric_results list yields 0.0 rather than raising — the
        division is guarded by the conditional expression.

        Returns:
            None: Updates internal state.
        """
        total_score = sum(result.score for result in self.metric_results)
        self.average_score = (
            total_score / len(self.metric_results) if self.metric_results else 0.0
        )

    def generate_total_reason(self):
        """Joins every metric's reason into one newline-separated string.

        Updates the total_reason attribute in place.

        Returns:
            None: Updates internal state.

        Note:
            Each line has the form 'metric_type:reason' (no space). The
            original f-string carried a stray empty format spec
            ("{result.metric_type:}"); removing it leaves the output
            byte-identical.
        """
        self.total_reason = "\n".join(
            f"{result.metric_type}:{result.reason}" for result in self.metric_results
        )

    def call_before_append(self):
        """Computes average score and total reason before adding to a list.

        Calls calculate_average_score and generate_total_reason so the
        instance carries its aggregates when stored.

        Returns:
            None: Updates internal state.
        """
        self.calculate_average_score()
        self.generate_total_reason()
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class BaseEvaluator:
|
|
183
|
+
"""Base class for all evaluators in the system.
|
|
184
|
+
|
|
185
|
+
This abstract class provides common functionality for evaluation.
|
|
186
|
+
Handles building eval sets, generating outputs, and abstract evaluate.
|
|
187
|
+
|
|
188
|
+
Attributes:
|
|
189
|
+
name (str): Name of the evaluator.
|
|
190
|
+
agent: The agent being evaluated.
|
|
191
|
+
invocation_list (list[EvalTestCase]): List of test cases.
|
|
192
|
+
result_list (list[EvalResultData]): List of evaluation results.
|
|
193
|
+
agent_information_list (list[dict]): List of agent config info.
|
|
194
|
+
|
|
195
|
+
Note:
|
|
196
|
+
Subclasses must implement evaluate method.
|
|
197
|
+
Supports JSON and tracing formats for input.
|
|
198
|
+
"""
|
|
199
|
+
|
|
200
|
+
def __init__(
    self,
    agent,
    name: str,
):
    """Initializes the base evaluator with agent and name.

    Args:
        agent: Agent instance to evaluate.
        name (str): Identifier for the evaluator.

    Note:
        No validation is performed here; the previous docstring
        advertised a ValueError that was never raised.
    """
    self.name = name
    self.agent = agent
    # Parsed test cases, populated by build_eval_set().
    self.invocation_list: list[EvalTestCase] = []
    # Metric outcomes, populated by subclasses' evaluate().
    self.result_list: list[EvalResultData] = []
    # Per-case app/user/session metadata, aligned with invocation_list.
    self.agent_information_list: list[dict] = []
|
|
219
|
+
|
|
220
|
+
def _build_eval_set_from_eval_json(self, eval_json_path: str) -> EvalSet:
    """Load an evaluation set from a standard eval JSON file.

    Thin wrapper that delegates entirely to the eval-set file loader.

    Args:
        eval_json_path (str): Path to the JSON file.

    Returns:
        EvalSet: The loaded evaluation set.

    Raises:
        ValueError: If loading fails.
    """
    # Imported lazily to avoid a module-level import cycle.
    from veadk.evaluation.eval_set_file_loader import (
        load_eval_set_from_file as _load,
    )

    return _load(eval_json_path)
|
|
237
|
+
|
|
238
|
+
def _build_eval_set_from_tracing_json(self, tracing_json_path: str) -> EvalSet:
    """Builds eval set from tracing JSON spans.

    Parses exported spans, groups them by trace id, extracts tool calls
    and the user/model conversation, and assembles an EvalSet.

    Args:
        tracing_json_path (str): Path to tracing JSON.

    Returns:
        EvalSet: Constructed set from traces.

    Raises:
        ValueError: If the JSON is invalid or the file cannot be read.

    Note:
        Assumes spans carry gen_ai.* attributes for tools and content.
    """
    try:
        with open(tracing_json_path, "r") as f:
            tracing_data = json.load(f)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format in file {tracing_json_path}: {e}")
    except Exception as e:
        raise ValueError(f"Error reading file {tracing_json_path}: {e}")

    # Group spans by trace_id
    trace_groups = {}
    for span in tracing_data:
        trace_id = span["trace_id"]
        if trace_id not in trace_groups:
            trace_groups[trace_id] = []
        trace_groups[trace_id].append(span)

    # Convert to evalset format
    # NOTE(review): `conversation` is created once and shared by every
    # eval_case appended below, so each case's "conversation" grows as
    # later traces are processed — confirm whether it should be reset
    # per trace instead.
    eval_cases, conversation = [], []
    app_name, user_id = "", ""
    creation_timestamp = 0
    for trace_id, spans in trace_groups.items():
        tool_uses = []

        # Extract tool_uses from spans with name starting with "execute_tool"
        for span in spans:
            if span["name"].startswith("execute_tool"):
                # Extract tool parameters from gen_ai.tool.input
                tool_input_str = span["attributes"].get("gen_ai.tool.input", "{}")
                try:
                    tool_input = json.loads(tool_input_str)
                    tool_args = tool_input.get("parameters", {})
                except json.JSONDecodeError:
                    # Unparseable input payload: fall back to no args.
                    tool_args = {}

                # Extract the tool call ID from gen_ai.tool.output
                tool_output_str = span["attributes"].get("gen_ai.tool.output", "{}")
                tool_call_id = None
                try:
                    tool_output = json.loads(tool_output_str)
                    tool_call_id = tool_output.get("id", None)
                except json.JSONDecodeError:
                    tool_call_id = None

                tool_uses.append(
                    {
                        "id": tool_call_id,
                        "args": tool_args,
                        "name": span["attributes"].get("gen_ai.tool.name", None),
                    }
                )

        # Extract conversation data from call_llm spans
        user_input = ""
        final_output = ""

        # Find the first call_llm span for user input and the last one for final output
        call_llm_spans = [span for span in spans if span["name"] == "call_llm"]

        if call_llm_spans:
            # Get user input from the first call_llm span
            first_span = call_llm_spans[0]
            user_input = first_span["attributes"].get("gen_ai.prompt.0.content", "")

            # Get final output from the last call_llm span
            last_span = call_llm_spans[-1]
            final_output = last_span["attributes"].get(
                "gen_ai.completion.0.content", ""
            )

            # Get metadata from any span
            app_name = first_span["attributes"].get("gen_ai.app.name", "")
            user_id = first_span["attributes"].get("gen_ai.user.id", "")
            # start_time is in nanoseconds; convert to seconds.
            creation_timestamp = first_span["start_time"] / 1e9

        if user_input and final_output:
            # Create user_content and final_response in the expected format
            user_content = {"role": "user", "parts": [{"text": user_input}]}

            final_response = {"role": "model", "parts": [{"text": final_output}]}

            conversation.append(
                {
                    "invocation_id": str(uuid.uuid4()),
                    "user_content": user_content,
                    "final_response": final_response,
                    "intermediate_data": {
                        "tool_uses": tool_uses,
                        "intermediate_responses": [],
                    },
                    "creation_timestamp": creation_timestamp,
                }
            )

        eval_cases.append(
            {
                "eval_id": f"veadk_eval_{formatted_timestamp()}",
                "conversation": conversation,
                "session_input": {
                    "app_name": app_name,
                    "user_id": user_id,
                    "state": {},
                },
                "creation_timestamp": creation_timestamp,
            }
        )

    evalset = EvalSet(
        eval_set_id="default",
        name="default",
        description=None,
        eval_cases=eval_cases,
        creation_timestamp=creation_timestamp,
    )

    return evalset
|
|
371
|
+
|
|
372
|
+
def build_eval_set(
    self, eval_set: Optional[EvalSet] = None, file_path: Optional[str] = None
):
    """Builds invocation list from eval set or file.

    Parses the input and extracts invocations together with their
    expected outputs and expected tool calls. Supports both the
    standard eval JSON format and tracing (span list) JSON.

    Args:
        eval_set (Optional[EvalSet]): Direct eval set object.
        file_path (Optional[str]): Path to file for loading.

    Raises:
        ValueError: If neither argument is provided, the file cannot
            be read/parsed, or its format is unsupported.

    Note:
        Generates a random session ID per case for isolation.
        NOTE(review): agent_information_list only gains an entry when a
        case has session_input, while invocation_list gains one per case
        unconditionally — if a case lacks session_input the two lists go
        out of alignment for generate_actual_outputs; confirm intent.
    """

    if eval_set is None and file_path is None:
        raise ValueError("eval_set or file_path is required")
    if eval_set:
        eval_cases = eval_set.eval_cases
    else:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                file_content = json.load(f)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON format in file {file_path}: {e}")
        except Exception as e:
            raise ValueError(f"Error reading file {file_path}: {e}")

        # A dict with "eval_cases" is the standard eval JSON format.
        if isinstance(file_content, dict) and "eval_cases" in file_content:
            eval_cases = self._build_eval_set_from_eval_json(file_path).eval_cases
        # A non-empty list of span dicts (each with a trace_id) is
        # treated as tracing output.
        elif (
            isinstance(file_content, list)
            and len(file_content) > 0
            and all(
                isinstance(span, dict) and "trace_id" in span
                for span in file_content
            )
        ):
            eval_cases = self._build_eval_set_from_tracing_json(
                file_path
            ).eval_cases
        else:
            raise ValueError(
                f"Unsupported file format in {file_path}. Please provide a valid file."
            )

    eval_case_data_list: list[EvalTestCase] = []
    for eval_case in eval_cases:
        eval_case_data = EvalTestCase(invocations=[])
        if eval_case.session_input:
            self.agent_information_list.append(
                {
                    "app_name": eval_case.session_input.app_name,
                    "user_id": eval_case.session_input.user_id,
                    "session_id": str(
                        uuid.uuid4()
                    ),  # random session id for evaluation,
                }
            )

        for invocation in eval_case.conversation:
            _input: str = ""
            _expected_output: str = ""
            _expected_tool: list[dict] = []

            # First text part is taken as the prompt / reference answer.
            user_content = invocation.user_content
            _input = user_content.parts[0].text
            _expected_output = invocation.final_response.parts[0].text

            # Expected tools come either from intermediate_data.tool_uses...
            if (
                hasattr(invocation.intermediate_data, "tool_uses")
                and invocation.intermediate_data.tool_uses
            ):
                for expected_tool_use in invocation.intermediate_data.tool_uses:
                    _expected_tool.append(
                        {
                            "name": expected_tool_use.name,
                            "args": expected_tool_use.args,
                        }
                    )

            # ...or from function_call parts inside invocation_events.
            elif (
                hasattr(invocation.intermediate_data, "invocation_events")
                and invocation.intermediate_data.invocation_events
            ):
                for event in invocation.intermediate_data.invocation_events:
                    if hasattr(event, "content") and hasattr(
                        event.content, "parts"
                    ):
                        for part in event.content.parts:
                            if (
                                hasattr(part, "function_call")
                                and part.function_call is not None
                            ):
                                _expected_tool.append(
                                    {
                                        "name": part.function_call.name,
                                        "args": part.function_call.args,
                                    }
                                )

            # Actual fields stay empty here; generate_actual_outputs()
            # fills them in later.
            eval_case_data.invocations.append(
                Invocation(
                    invocation_id=invocation.invocation_id,
                    input=_input,
                    actual_output="",
                    actual_tool=[],
                    expected_output=_expected_output,
                    expected_tool=_expected_tool,
                    latency="",
                )
            )

        eval_case_data_list.append(eval_case_data)
    self.invocation_list = eval_case_data_list
|
|
491
|
+
|
|
492
|
+
async def generate_actual_outputs(self):
    """Generates actual outputs by running the agent on each input.

    Uses a Runner to execute the agent for every invocation, capturing
    the final response, the tool calls made, and wall-clock latency.

    Returns:
        None: Updates each invocation's actual_output, actual_tool,
        and latency fields in place.

    Raises:
        Exception: If runner construction or agent execution fails.

    Note:
        Uses InMemorySessionService for isolation; wires in the agent's
        long-term memory service when present.
        NOTE(review): zip() silently truncates if invocation_list and
        agent_information_list differ in length (possible when a case
        had no session_input in build_eval_set) — confirm alignment.
    """
    for eval_case_data, agent_information in zip(
        self.invocation_list, self.agent_information_list
    ):
        # Fresh in-memory session per eval case.
        session_service = InMemorySessionService()
        _ = await session_service.create_session(
            app_name=agent_information["app_name"],
            user_id=agent_information["user_id"],
            state={},
            session_id=agent_information["session_id"],
        )

        # Attach the agent's long-term memory service when available.
        if getattr(self.agent, "long_term_memory", None):
            runner = Runner(
                app_name=agent_information["app_name"],
                agent=self.agent,
                session_service=session_service,
                memory_service=self.agent.long_term_memory,
            )
        else:
            runner = Runner(
                app_name=agent_information["app_name"],
                agent=self.agent,
                session_service=session_service,
            )

        for invocation in eval_case_data.invocations:
            _actual_output: str = ""
            _actual_tool: list[dict] = []
            _latency: str = ""
            final_response = None
            tool_uses = []
            invocation_id = ""

            user_content = types.Content(
                role="user", parts=[types.Part(text=invocation.input)]
            )
            # Wall-clock timing around the full event stream.
            tik = time.time()
            async for event in runner.run_async(
                user_id=agent_information["user_id"],
                session_id=agent_information["session_id"],
                new_message=user_content,
            ):
                # Keep the first invocation_id seen.
                invocation_id = (
                    event.invocation_id if not invocation_id else invocation_id
                )
                if (
                    event.is_final_response()
                    and event.content
                    and event.content.parts
                ):
                    final_response = event.content
                elif event.get_function_calls():
                    for call in event.get_function_calls():
                        tool_uses.append(call)
            tok = time.time()
            _latency = str((tok - tik) * 1000)

            if final_response and final_response.parts:
                # NOTE(review): parts[0].text may be None for non-text
                # parts — confirm the first part is always text here.
                _actual_output = final_response.parts[0].text
            for tool_use in tool_uses:
                _actual_tool.append(
                    {
                        "name": tool_use.name,
                        "args": tool_use.args,
                    }
                )

            invocation.actual_output = _actual_output
            invocation.actual_tool = _actual_tool
            invocation.latency = _latency
|
|
578
|
+
|
|
579
|
+
def get_eval_set_information(self) -> list[list[dict[str, Any]]]:
    """Assemble per-invocation report rows for every evaluation case.

    Merges each invocation with the score and reason from the matching
    entry in ``self.result_list``; cases that have not been evaluated yet
    fall back to an empty result.

    Returns:
        list[list[dict[str, Any]]]: One inner list per eval case, each
        holding one dict per invocation.
    """
    evaluated_count = len(self.result_list)
    report: list[list[dict[str, Any]]] = []
    for case_index, eval_case in enumerate(self.invocation_list):
        # Use the matching eval result, or an empty default when this
        # case has not been evaluated yet.
        if case_index < evaluated_count:
            eval_result = self.result_list[case_index]
        else:
            eval_result = EvalResultData(metric_results=[])
        rows = [
            {
                "input": invocation.input,
                "expected_output": invocation.expected_output,
                "actual_output": invocation.actual_output,
                "expected_tool": invocation.expected_tool,
                "actual_tool": invocation.actual_tool,
                "score": eval_result.average_score,
                "reason": eval_result.total_reason,
                "latency": invocation.latency,
            }
            for invocation in eval_case.invocations
        ]
        report.append(rows)
    return report
|
|
614
|
+
|
|
615
|
+
@abstractmethod
async def evaluate(
    self,
    metrics: list[Any],
    eval_set: Optional[EvalSet],
    eval_set_file_path: Optional[str],
    eval_id: str,
):
    """Run the evaluation for the given metrics and eval set.

    Concrete evaluators implement the metric-specific scoring logic here
    and are expected to populate ``self.result_list`` when finished.

    Args:
        metrics (list[Any]): Metric objects to apply.
        eval_set (Optional[EvalSet]): In-memory eval set to evaluate.
        eval_set_file_path (Optional[str]): Path to an eval set file.
        eval_id (str): Identifier for this evaluation run.

    Returns:
        Any: Evaluation results specific to the subclass.

    Raises:
        NotImplementedError: If the subclass does not override this.
    """
    ...
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from .deepeval_evaluator import DeepevalEvaluator
|
|
16
|
+
|
|
17
|
+
__all__ = ["DeepevalEvaluator"]
|