veadk-python 0.2.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veadk/__init__.py +37 -0
- veadk/a2a/__init__.py +13 -0
- veadk/a2a/agent_card.py +45 -0
- veadk/a2a/remote_ve_agent.py +390 -0
- veadk/a2a/utils/__init__.py +13 -0
- veadk/a2a/utils/agent_to_a2a.py +170 -0
- veadk/a2a/ve_a2a_server.py +93 -0
- veadk/a2a/ve_agent_executor.py +78 -0
- veadk/a2a/ve_middlewares.py +313 -0
- veadk/a2a/ve_task_store.py +37 -0
- veadk/agent.py +402 -0
- veadk/agent_builder.py +93 -0
- veadk/agents/loop_agent.py +68 -0
- veadk/agents/parallel_agent.py +72 -0
- veadk/agents/sequential_agent.py +64 -0
- veadk/auth/__init__.py +13 -0
- veadk/auth/base_auth.py +22 -0
- veadk/auth/ve_credential_service.py +203 -0
- veadk/auth/veauth/__init__.py +13 -0
- veadk/auth/veauth/apmplus_veauth.py +58 -0
- veadk/auth/veauth/ark_veauth.py +75 -0
- veadk/auth/veauth/base_veauth.py +50 -0
- veadk/auth/veauth/cozeloop_veauth.py +13 -0
- veadk/auth/veauth/opensearch_veauth.py +75 -0
- veadk/auth/veauth/postgresql_veauth.py +75 -0
- veadk/auth/veauth/prompt_pilot_veauth.py +60 -0
- veadk/auth/veauth/speech_veauth.py +54 -0
- veadk/auth/veauth/utils.py +69 -0
- veadk/auth/veauth/vesearch_veauth.py +62 -0
- veadk/auth/veauth/viking_mem0_veauth.py +91 -0
- veadk/cli/__init__.py +13 -0
- veadk/cli/cli.py +58 -0
- veadk/cli/cli_clean.py +87 -0
- veadk/cli/cli_create.py +163 -0
- veadk/cli/cli_deploy.py +233 -0
- veadk/cli/cli_eval.py +215 -0
- veadk/cli/cli_init.py +214 -0
- veadk/cli/cli_kb.py +110 -0
- veadk/cli/cli_pipeline.py +285 -0
- veadk/cli/cli_prompt.py +86 -0
- veadk/cli/cli_update.py +106 -0
- veadk/cli/cli_uploadevalset.py +139 -0
- veadk/cli/cli_web.py +143 -0
- veadk/cloud/__init__.py +13 -0
- veadk/cloud/cloud_agent_engine.py +485 -0
- veadk/cloud/cloud_app.py +475 -0
- veadk/config.py +115 -0
- veadk/configs/__init__.py +13 -0
- veadk/configs/auth_configs.py +133 -0
- veadk/configs/database_configs.py +132 -0
- veadk/configs/model_configs.py +78 -0
- veadk/configs/tool_configs.py +54 -0
- veadk/configs/tracing_configs.py +110 -0
- veadk/consts.py +74 -0
- veadk/evaluation/__init__.py +17 -0
- veadk/evaluation/adk_evaluator/__init__.py +17 -0
- veadk/evaluation/adk_evaluator/adk_evaluator.py +302 -0
- veadk/evaluation/base_evaluator.py +642 -0
- veadk/evaluation/deepeval_evaluator/__init__.py +17 -0
- veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +339 -0
- veadk/evaluation/eval_set_file_loader.py +48 -0
- veadk/evaluation/eval_set_recorder.py +146 -0
- veadk/evaluation/types.py +65 -0
- veadk/evaluation/utils/prometheus.py +196 -0
- veadk/integrations/__init__.py +13 -0
- veadk/integrations/ve_apig/__init__.py +13 -0
- veadk/integrations/ve_apig/ve_apig.py +349 -0
- veadk/integrations/ve_apig/ve_apig_utils.py +332 -0
- veadk/integrations/ve_code_pipeline/__init__.py +13 -0
- veadk/integrations/ve_code_pipeline/ve_code_pipeline.py +431 -0
- veadk/integrations/ve_cozeloop/__init__.py +13 -0
- veadk/integrations/ve_cozeloop/ve_cozeloop.py +96 -0
- veadk/integrations/ve_cr/__init__.py +13 -0
- veadk/integrations/ve_cr/ve_cr.py +220 -0
- veadk/integrations/ve_faas/__init__.py +13 -0
- veadk/integrations/ve_faas/template/cookiecutter.json +15 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/config.yaml.example +6 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/deploy.py +106 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__init__.py +13 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/agent.py +25 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/app.py +202 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/requirements.txt +3 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/run.sh +49 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/__init__.py +14 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/agent.py +27 -0
- veadk/integrations/ve_faas/ve_faas.py +754 -0
- veadk/integrations/ve_faas/ve_faas_utils.py +408 -0
- veadk/integrations/ve_faas/web_template/cookiecutter.json +20 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/config.yaml.example +2 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/deploy.py +44 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/Dockerfile +23 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/app.py +123 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/init_db.py +46 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/models.py +36 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/requirements.txt +4 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/run.sh +21 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/css/style.css +368 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/js/admin.js +0 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/dashboard.html +21 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/edit_post.html +24 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/login.html +21 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/posts.html +53 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/base.html +45 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/index.html +29 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/post.html +14 -0
- veadk/integrations/ve_identity/__init__.py +110 -0
- veadk/integrations/ve_identity/auth_config.py +261 -0
- veadk/integrations/ve_identity/auth_mixins.py +650 -0
- veadk/integrations/ve_identity/auth_processor.py +385 -0
- veadk/integrations/ve_identity/function_tool.py +158 -0
- veadk/integrations/ve_identity/identity_client.py +864 -0
- veadk/integrations/ve_identity/mcp_tool.py +181 -0
- veadk/integrations/ve_identity/mcp_toolset.py +431 -0
- veadk/integrations/ve_identity/models.py +228 -0
- veadk/integrations/ve_identity/token_manager.py +188 -0
- veadk/integrations/ve_identity/utils.py +151 -0
- veadk/integrations/ve_prompt_pilot/__init__.py +13 -0
- veadk/integrations/ve_prompt_pilot/ve_prompt_pilot.py +85 -0
- veadk/integrations/ve_tls/__init__.py +13 -0
- veadk/integrations/ve_tls/utils.py +116 -0
- veadk/integrations/ve_tls/ve_tls.py +212 -0
- veadk/integrations/ve_tos/ve_tos.py +710 -0
- veadk/integrations/ve_viking_db_memory/__init__.py +13 -0
- veadk/integrations/ve_viking_db_memory/ve_viking_db_memory.py +308 -0
- veadk/knowledgebase/__init__.py +17 -0
- veadk/knowledgebase/backends/__init__.py +13 -0
- veadk/knowledgebase/backends/base_backend.py +72 -0
- veadk/knowledgebase/backends/in_memory_backend.py +91 -0
- veadk/knowledgebase/backends/opensearch_backend.py +162 -0
- veadk/knowledgebase/backends/redis_backend.py +172 -0
- veadk/knowledgebase/backends/utils.py +92 -0
- veadk/knowledgebase/backends/vikingdb_knowledge_backend.py +608 -0
- veadk/knowledgebase/entry.py +25 -0
- veadk/knowledgebase/knowledgebase.py +307 -0
- veadk/memory/__init__.py +35 -0
- veadk/memory/long_term_memory.py +365 -0
- veadk/memory/long_term_memory_backends/__init__.py +13 -0
- veadk/memory/long_term_memory_backends/base_backend.py +35 -0
- veadk/memory/long_term_memory_backends/in_memory_backend.py +67 -0
- veadk/memory/long_term_memory_backends/mem0_backend.py +155 -0
- veadk/memory/long_term_memory_backends/opensearch_backend.py +124 -0
- veadk/memory/long_term_memory_backends/redis_backend.py +140 -0
- veadk/memory/long_term_memory_backends/vikingdb_memory_backend.py +189 -0
- veadk/memory/short_term_memory.py +252 -0
- veadk/memory/short_term_memory_backends/__init__.py +13 -0
- veadk/memory/short_term_memory_backends/base_backend.py +31 -0
- veadk/memory/short_term_memory_backends/mysql_backend.py +49 -0
- veadk/memory/short_term_memory_backends/postgresql_backend.py +49 -0
- veadk/memory/short_term_memory_backends/sqlite_backend.py +55 -0
- veadk/memory/short_term_memory_processor.py +100 -0
- veadk/processors/__init__.py +26 -0
- veadk/processors/base_run_processor.py +120 -0
- veadk/prompts/__init__.py +13 -0
- veadk/prompts/agent_default_prompt.py +30 -0
- veadk/prompts/prompt_evaluator.py +20 -0
- veadk/prompts/prompt_memory_processor.py +55 -0
- veadk/prompts/prompt_optimization.py +150 -0
- veadk/runner.py +732 -0
- veadk/tools/__init__.py +13 -0
- veadk/tools/builtin_tools/__init__.py +13 -0
- veadk/tools/builtin_tools/agent_authorization.py +94 -0
- veadk/tools/builtin_tools/generate_image.py +23 -0
- veadk/tools/builtin_tools/image_edit.py +300 -0
- veadk/tools/builtin_tools/image_generate.py +446 -0
- veadk/tools/builtin_tools/lark.py +67 -0
- veadk/tools/builtin_tools/las.py +24 -0
- veadk/tools/builtin_tools/link_reader.py +66 -0
- veadk/tools/builtin_tools/llm_shield.py +381 -0
- veadk/tools/builtin_tools/load_knowledgebase.py +97 -0
- veadk/tools/builtin_tools/mcp_router.py +29 -0
- veadk/tools/builtin_tools/run_code.py +113 -0
- veadk/tools/builtin_tools/tts.py +253 -0
- veadk/tools/builtin_tools/vesearch.py +49 -0
- veadk/tools/builtin_tools/video_generate.py +363 -0
- veadk/tools/builtin_tools/web_scraper.py +76 -0
- veadk/tools/builtin_tools/web_search.py +83 -0
- veadk/tools/demo_tools.py +58 -0
- veadk/tools/load_knowledgebase_tool.py +149 -0
- veadk/tools/sandbox/__init__.py +13 -0
- veadk/tools/sandbox/browser_sandbox.py +37 -0
- veadk/tools/sandbox/code_sandbox.py +40 -0
- veadk/tools/sandbox/computer_sandbox.py +34 -0
- veadk/tracing/__init__.py +13 -0
- veadk/tracing/base_tracer.py +58 -0
- veadk/tracing/telemetry/__init__.py +13 -0
- veadk/tracing/telemetry/attributes/attributes.py +29 -0
- veadk/tracing/telemetry/attributes/extractors/common_attributes_extractors.py +180 -0
- veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +858 -0
- veadk/tracing/telemetry/attributes/extractors/tool_attributes_extractors.py +152 -0
- veadk/tracing/telemetry/attributes/extractors/types.py +164 -0
- veadk/tracing/telemetry/exporters/__init__.py +13 -0
- veadk/tracing/telemetry/exporters/apmplus_exporter.py +558 -0
- veadk/tracing/telemetry/exporters/base_exporter.py +39 -0
- veadk/tracing/telemetry/exporters/cozeloop_exporter.py +129 -0
- veadk/tracing/telemetry/exporters/inmemory_exporter.py +248 -0
- veadk/tracing/telemetry/exporters/tls_exporter.py +139 -0
- veadk/tracing/telemetry/opentelemetry_tracer.py +320 -0
- veadk/tracing/telemetry/telemetry.py +411 -0
- veadk/types.py +47 -0
- veadk/utils/__init__.py +13 -0
- veadk/utils/audio_manager.py +95 -0
- veadk/utils/auth.py +294 -0
- veadk/utils/logger.py +59 -0
- veadk/utils/mcp_utils.py +44 -0
- veadk/utils/misc.py +184 -0
- veadk/utils/patches.py +101 -0
- veadk/utils/volcengine_sign.py +205 -0
- veadk/version.py +15 -0
- veadk_python-0.2.27.dist-info/METADATA +373 -0
- veadk_python-0.2.27.dist-info/RECORD +218 -0
- veadk_python-0.2.27.dist-info/WHEEL +5 -0
- veadk_python-0.2.27.dist-info/entry_points.txt +2 -0
- veadk_python-0.2.27.dist-info/licenses/LICENSE +201 -0
- veadk_python-0.2.27.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import time
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
from deepeval import evaluate
|
|
19
|
+
from deepeval.evaluate import CacheConfig
|
|
20
|
+
from deepeval.evaluate.types import EvaluationResult
|
|
21
|
+
from deepeval.metrics import BaseMetric
|
|
22
|
+
from deepeval.models import LocalModel
|
|
23
|
+
from deepeval.test_case import LLMTestCase
|
|
24
|
+
from deepeval.test_case.llm_test_case import ToolCall
|
|
25
|
+
from google.adk.evaluation.eval_set import EvalSet
|
|
26
|
+
from typing_extensions import override
|
|
27
|
+
|
|
28
|
+
from veadk.config import getenv
|
|
29
|
+
from veadk.evaluation.base_evaluator import BaseEvaluator, EvalResultData, MetricResult
|
|
30
|
+
from veadk.evaluation.types import EvalResultCaseData, EvalResultMetadata
|
|
31
|
+
from veadk.evaluation.utils.prometheus import (
|
|
32
|
+
PrometheusPushgatewayConfig,
|
|
33
|
+
push_to_prometheus,
|
|
34
|
+
)
|
|
35
|
+
from veadk.utils.logger import get_logger
|
|
36
|
+
|
|
37
|
+
logger = get_logger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def formatted_timestamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string.

    Returns:
        str: Timestamp such as ``'20251028123045'``.
    """
    # time.strftime() uses the current local time when no struct_time
    # argument is supplied, so this is equivalent to passing
    # time.localtime() explicitly.
    return time.strftime("%Y%m%d%H%M%S")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class DeepevalEvaluator(BaseEvaluator):
    """Evaluates agents using DeepEval metrics with Prometheus export.

    This class uses DeepEval to test agent performance.
    It runs agents on test cases and scores them.
    Results can be sent to Prometheus for monitoring.

    Attributes:
        judge_model_name (str): Name of the model that judges the agent.
        judge_model (LocalModel): The judge model instance.
        prometheus_config (PrometheusPushgatewayConfig | None): Settings for
            Prometheus export. If None, no export happens.

    Note:
        Needs judge model credentials from environment if not given.
        Turns off cache to get fresh results each time.

    Examples:
        ```python
        agent = Agent(tools=[get_city_weather])
        evaluator = DeepevalEvaluator(agent=agent)
        metrics = [GEval(threshold=0.8)]
        results = await evaluator.evaluate(metrics, eval_set_file_path="test.json")
        ```
    """

    def __init__(
        self,
        agent,
        judge_model_api_key: str = "",
        judge_model_name: str = "",
        judge_model_api_base: str = "",
        name: str = "veadk_deepeval_evaluator",
        prometheus_config: PrometheusPushgatewayConfig | None = None,
    ):
        """Sets up the DeepEval evaluator with agent and judge model.

        Args:
            agent: The agent to test.
            judge_model_api_key: API key for the judge model. If empty, falls
                back to the MODEL_JUDGE_API_KEY environment variable, then to
                MODEL_AGENT_API_KEY.
            judge_model_name: Name of the judge model. If empty, falls back to
                the MODEL_JUDGE_NAME environment variable (default
                'doubao-seed-1-6-250615').
            judge_model_api_base: Base URL for judge model API. If empty,
                falls back to the MODEL_JUDGE_API_BASE environment variable.
            name: Name for this evaluator. Defaults to 'veadk_deepeval_evaluator'.
            prometheus_config: Settings for Prometheus export. If None,
                no export happens.

        Raises:
            ValueError: If model settings are wrong.
            EnvironmentError: If environment variables are missing.

        Examples:
            ```python
            evaluator = DeepevalEvaluator(
                agent=my_agent,
                judge_model_api_key="sk-...",
                prometheus_config=prometheus_config)
            ```
        """
        super().__init__(agent=agent, name=name)

        # Resolve any judge-model setting that was not passed explicitly
        # from the environment.
        if not judge_model_api_key:
            judge_model_api_key = getenv("MODEL_JUDGE_API_KEY") or getenv(
                "MODEL_AGENT_API_KEY"
            )
        if not judge_model_name:
            judge_model_name = getenv(
                "MODEL_JUDGE_NAME",
                "doubao-seed-1-6-250615",
            )
        if not judge_model_api_base:
            judge_model_api_base = getenv(
                "MODEL_JUDGE_API_BASE",
                "https://ark.cn-beijing.volces.com/api/v3/",
            )

        self.judge_model_name = judge_model_name
        self.judge_model = LocalModel(
            model=judge_model_name,
            base_url=judge_model_api_base,
            api_key=judge_model_api_key,
        )

        self.prometheus_config = prometheus_config

    @override
    async def evaluate(
        self,
        metrics: list[BaseMetric],
        eval_set: Optional[EvalSet] = None,
        eval_set_file_path: Optional[str] = None,
        eval_id: Optional[str] = None,
    ):
        """Tests agent using DeepEval on given test cases.

        This method does these steps:
        1. Loads test cases from memory or file
        2. Runs agent to get actual responses
        3. Converts to DeepEval test format
        4. Runs metrics evaluation
        5. Sends results to Prometheus if needed

        Args:
            metrics: List of DeepEval metrics to use for scoring.
            eval_set: Test cases in memory. If given, used first.
            eval_set_file_path: Path to test case file. Used if no eval_set.
            eval_id: Unique name for this test run. Used for tracking.
                Defaults to a fresh 'test_<timestamp>' computed per call.

        Returns:
            EvaluationResult: Results from DeepEval with scores and details.

        Raises:
            ValueError: If no test cases found.
            FileNotFoundError: If test file not found.
            EvaluationError: If agent fails or metrics fail.

        Examples:
            ```python
            metrics = [GEval(threshold=0.8), ToolCorrectnessMetric(threshold=0.5)]
            results = await evaluator.evaluate(
                metrics=metrics,
                eval_set_file_path="test_cases.json")
            print(f"Test cases run: {len(results.test_results)}")
            ```
        """
        # BUGFIX: the previous signature used
        # `eval_id: str = f"test_{formatted_timestamp()}"`, a default that is
        # evaluated once at import time, so every run relying on the default
        # shared the same stale timestamp. Compute a fresh one per call.
        if eval_id is None:
            eval_id = f"test_{formatted_timestamp()}"

        # Get evaluation data by parsing eval set file
        self.build_eval_set(eval_set, eval_set_file_path)

        # Get actual data by running agent
        logger.info("Start to run agent for actual data.")
        await self.generate_actual_outputs()
        eval_case_data_list = self.invocation_list

        # Build test cases in Deepeval format
        logger.info("Start to build test cases in Deepeval format.")
        test_cases = []
        for eval_case_data in eval_case_data_list:
            # Conversation histories accumulate across the invocations of a
            # single eval case, one {"role": ..., "content": ...} line per turn.
            # BUGFIX: these were previously re-initialized inside the inner
            # loop, so the history appended after building each test case was
            # discarded and every context was reported as "Empty".
            invocations_context_actual: str = ""
            invocations_context_expect: str = ""
            for invocation in eval_case_data.invocations:
                # The context holds the conversation *before* this invocation;
                # for the first turn it is empty, rendered as "Empty".
                test_case = LLMTestCase(
                    input=invocation.input,
                    actual_output=invocation.actual_output,
                    expected_output=invocation.expected_output,
                    tools_called=[
                        ToolCall(name=tool["name"], input_parameters=tool["args"])
                        for tool in invocation.actual_tool
                    ],
                    expected_tools=[
                        ToolCall(name=tool["name"], input_parameters=tool["args"])
                        for tool in invocation.expected_tool
                    ],
                    additional_metadata={"latency": invocation.latency},
                    context=[
                        "actual_conversation_history: "
                        + (invocations_context_actual or "Empty"),
                        "expect_conversation_history: "
                        + (invocations_context_expect or "Empty"),
                    ],
                )
                # Record this turn so later invocations in the same eval case
                # see it in their context.
                invocations_context_actual += (
                    f'{{"role": "user", "content": "{invocation.input}"}}\n'
                )
                invocations_context_actual += f'{{"role": "assistant", "content": "{invocation.actual_output}"}}\n'
                invocations_context_expect += (
                    f'{{"role": "user", "content": "{invocation.input}"}}\n'
                )
                invocations_context_expect += f'{{"role": "assistant", "content": "{invocation.expected_output}"}}\n'

                test_cases.append(test_case)

        # Run Deepeval evaluation according to metrics
        logger.info("Start to run Deepeval evaluation according to metrics.")
        test_results = evaluate(
            test_cases=test_cases,
            metrics=metrics,
            # Disable the result cache so every run produces fresh scores.
            cache_config=CacheConfig(write_cache=False),
        )
        for test_result in test_results.test_results:
            eval_result_data = EvalResultData(metric_results=[])
            for metrics_data_item in test_result.metrics_data:
                metric_result = MetricResult(
                    metric_type=metrics_data_item.name,
                    success=metrics_data_item.success,
                    score=metrics_data_item.score,
                    reason=metrics_data_item.reason,
                )
                eval_result_data.metric_results.append(metric_result)

            eval_result_data.call_before_append()  # calculate average score and generate total reason
            self.result_list.append(eval_result_data)
        self.result_list.reverse()  # deepeval test_results is in reverse order

        # export to Prometheus if needed
        if self.prometheus_config is not None:
            self.export_results(eval_id, test_results)

        return test_results

    def export_results(self, eval_id: str, test_results: EvaluationResult):
        """Sends evaluation results to Prometheus for monitoring.

        This method takes test results, counts passes and failures,
        and sends metrics to Prometheus.

        Args:
            eval_id: Unique name for this test. Used as label in Prometheus.
            test_results: Results from DeepEval evaluation.

        Returns:
            None: Results are sent directly to Prometheus.

        Raises:
            PrometheusConnectionError: If cannot connect to Prometheus.
            PrometheusPushError: If sending data fails.

        Note:
            Uses fixed thresholds for now: case_threshold=0.5, diff_threshold=0.2.
            These may change later.

        Examples:
            ```python
            evaluator.export_results("test_20240101", test_results)
            ```
        """
        # fixed attributions
        test_name = eval_id
        test_cases_total = len(test_results.test_results)
        eval_data = EvalResultMetadata(
            tested_model=self.agent.model_name,
            judge_model=self.judge_model_name,
        )
        # parsed attributions
        test_cases_failure = 0
        test_cases_pass = 0
        test_data_list = []
        # NOTE: we hard-coding the following two attributions for development
        case_threshold = 0.5
        diff_threshold = 0.2

        for idx, test_result in enumerate(test_results.test_results):
            pass_flag = "PASSED"
            if test_result.success:
                test_cases_pass += 1
            else:
                pass_flag = "FAILURE"
                test_cases_failure += 1

            test_data_list.append(
                EvalResultCaseData(
                    id=str(idx),
                    input=test_result.input,
                    actual_output=test_result.actual_output,
                    expected_output=test_result.expected_output,
                    # [temporary] score: This method is not generally applicable now and is currently only available in the GEval mode.
                    score=str(test_result.metrics_data[0].score),
                    reason=test_result.metrics_data[0].reason,
                    status=pass_flag,
                    latency=test_result.additional_metadata["latency"],
                )
            )

        exported_data = {
            "test_name": test_name,
            "test_cases_total": test_cases_total,
            "test_cases_failure": test_cases_failure,
            "test_cases_pass": test_cases_pass,
            "test_data_list": test_data_list,
            "eval_data": eval_data,
            "case_threshold": case_threshold,
            "diff_threshold": diff_threshold,
        }

        push_to_prometheus(
            **exported_data,
            url=self.prometheus_config.url,
            username=self.prometheus_config.username,
            password=self.prometheus_config.password,
        )
        logger.info(
            f"Upload to Prometheus Pushgateway ({self.prometheus_config.url}) successfully! Test name: {eval_id}"
        )
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from google.adk.evaluation.eval_set import EvalSet
|
|
16
|
+
from google.adk.evaluation.local_eval_sets_manager import (
|
|
17
|
+
load_eval_set_from_file as adk_load_eval_set_from_file,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_eval_set_from_file(eval_set_file_path: str) -> EvalSet:
|
|
22
|
+
"""Loads an evaluation set from a JSON file.
|
|
23
|
+
|
|
24
|
+
This function uses ADK's loader to parse the file into an EvalSet object.
|
|
25
|
+
It handles errors in file reading or parsing.
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
eval_set_file_path (str): Path to the JSON eval set file.
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
EvalSet: Loaded evaluation set object.
|
|
32
|
+
|
|
33
|
+
Raises:
|
|
34
|
+
Exception: If file loading or parsing fails, with details.
|
|
35
|
+
|
|
36
|
+
Examples:
|
|
37
|
+
```python
|
|
38
|
+
eval_set = load_eval_set_from_file("my_eval.json")
|
|
39
|
+
print(len(eval_set.eval_cases))
|
|
40
|
+
```
|
|
41
|
+
"""
|
|
42
|
+
try:
|
|
43
|
+
eval_set = adk_load_eval_set_from_file(eval_set_file_path, eval_set_file_path)
|
|
44
|
+
except Exception as e:
|
|
45
|
+
raise Exception(
|
|
46
|
+
f"Failed to load eval set from file {eval_set_file_path}"
|
|
47
|
+
) from e
|
|
48
|
+
return eval_set
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import time
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
import os
|
|
18
|
+
from google.adk.cli.utils import evals
|
|
19
|
+
from google.adk.evaluation.eval_case import EvalCase, SessionInput
|
|
20
|
+
from google.adk.evaluation.local_eval_sets_manager import LocalEvalSetsManager
|
|
21
|
+
from google.adk.sessions import BaseSessionService
|
|
22
|
+
|
|
23
|
+
from veadk.utils.logger import get_logger
|
|
24
|
+
from veadk.utils.misc import formatted_timestamp, get_agents_dir
|
|
25
|
+
|
|
26
|
+
logger = get_logger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class EvalSetRecorder(LocalEvalSetsManager):
|
|
30
|
+
"""Records evaluation sets from sessions for later use in testing.
|
|
31
|
+
|
|
32
|
+
This class extends LocalEvalSetsManager to add sessions to eval sets.
|
|
33
|
+
It handles dumping eval sets to files from session data.
|
|
34
|
+
|
|
35
|
+
Attributes:
|
|
36
|
+
eval_set_id (str): ID of the eval set. Defaults to 'default'.
|
|
37
|
+
session_service (BaseSessionService): Service for session management.
|
|
38
|
+
|
|
39
|
+
Note:
|
|
40
|
+
Uses temporary directory for storing eval sets.
|
|
41
|
+
Creates eval cases from session invocations.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
def __init__(
    self, session_service: BaseSessionService, eval_set_id: str = "default"
):
    """Initialize the recorder with a session service and an eval set ID.

    Args:
        session_service (BaseSessionService): Service to retrieve sessions.
        eval_set_id (str): ID for the eval set; an empty string is
            replaced by 'default'.

    Raises:
        ValueError: If eval_set_id is invalid.
    """
    super().__init__(agents_dir=get_agents_dir())
    # An empty string is normalized to the default eval set id.
    self.eval_set_id = eval_set_id or "default"
    self.session_service: BaseSessionService = session_service
|
|
59
|
+
|
|
60
|
+
# adapted from google.adk.cli.fast_api
|
|
61
|
+
async def add_session_to_eval_set(
|
|
62
|
+
self,
|
|
63
|
+
app_name: str,
|
|
64
|
+
eval_set_id: str,
|
|
65
|
+
session_id: str,
|
|
66
|
+
user_id: str,
|
|
67
|
+
):
|
|
68
|
+
"""Adds a session to the evaluation set as an eval case.
|
|
69
|
+
|
|
70
|
+
This method retrieves a session and converts it to eval invocations.
|
|
71
|
+
It creates a new eval case with timestamp.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
app_name (str): Name of the app for the session.
|
|
75
|
+
eval_set_id (str): ID of the eval set to add to.
|
|
76
|
+
session_id (str): ID of the session to add.
|
|
77
|
+
user_id (str): ID of the user owning the session.
|
|
78
|
+
|
|
79
|
+
Raises:
|
|
80
|
+
AssertionError: If session not found.
|
|
81
|
+
ValueError: If adding eval case fails.
|
|
82
|
+
"""
|
|
83
|
+
eval_id = f"veadk_eval_{formatted_timestamp()}"
|
|
84
|
+
|
|
85
|
+
# Get the session
|
|
86
|
+
session = await self.session_service.get_session(
|
|
87
|
+
app_name=app_name, user_id=user_id, session_id=session_id
|
|
88
|
+
)
|
|
89
|
+
assert session, "Session not found."
|
|
90
|
+
|
|
91
|
+
# Convert the session data to eval invocations
|
|
92
|
+
invocations = evals.convert_session_to_eval_invocations(session)
|
|
93
|
+
|
|
94
|
+
# Populate the session with initial session state.
|
|
95
|
+
# initial_session_state = create_empty_state(agent_loader.load_agent(app_name))
|
|
96
|
+
|
|
97
|
+
new_eval_case = EvalCase(
|
|
98
|
+
eval_id=eval_id,
|
|
99
|
+
conversation=invocations,
|
|
100
|
+
session_input=SessionInput(app_name=app_name, user_id=user_id),
|
|
101
|
+
creation_timestamp=time.time(),
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
try:
|
|
105
|
+
self.add_eval_case(app_name, eval_set_id, new_eval_case)
|
|
106
|
+
except ValueError as ve:
|
|
107
|
+
raise ValueError(f"Add eval case to eval set error: {ve}")
|
|
108
|
+
|
|
109
|
+
async def dump(
|
|
110
|
+
self,
|
|
111
|
+
app_name: str,
|
|
112
|
+
user_id: str,
|
|
113
|
+
session_id: str,
|
|
114
|
+
) -> str:
|
|
115
|
+
"""Dumps the current eval set to a file path.
|
|
116
|
+
|
|
117
|
+
This method creates the eval set if needed and adds the session.
|
|
118
|
+
It ensures directory exists and logs the dump path.
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
app_name (str): Name of the app.
|
|
122
|
+
user_id (str): ID of the user.
|
|
123
|
+
session_id (str): ID of the session to dump.
|
|
124
|
+
|
|
125
|
+
Returns:
|
|
126
|
+
str: Path where the eval set was dumped.
|
|
127
|
+
|
|
128
|
+
Raises:
|
|
129
|
+
ValueError: If dump operation fails.
|
|
130
|
+
"""
|
|
131
|
+
dump_path = self._get_eval_set_file_path(app_name, self.eval_set_id)
|
|
132
|
+
Path(dump_path).parent.mkdir(parents=True, exist_ok=True)
|
|
133
|
+
|
|
134
|
+
if not os.path.exists(dump_path):
|
|
135
|
+
self.create_eval_set(app_name=app_name, eval_set_id=self.eval_set_id)
|
|
136
|
+
|
|
137
|
+
await self.add_session_to_eval_set(
|
|
138
|
+
app_name=app_name,
|
|
139
|
+
eval_set_id=self.eval_set_id,
|
|
140
|
+
session_id=session_id,
|
|
141
|
+
user_id=user_id,
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
logger.info(f"Dump eval set to {dump_path}")
|
|
145
|
+
|
|
146
|
+
return dump_path
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class EvalResultCaseData:
    """Result data for one evaluation test case.

    Bundles the case's input, the actual and expected agent outputs, and
    the judged score, reason, status, and latency. Consumed by evaluation
    reporting and metrics export.

    Attributes:
        id (str): Unique ID of the case.
        input (str): User input for the case.
        actual_output (str): Actual agent response.
        expected_output (str): Expected agent response.
        score (str): Evaluation score, serialized as a string.
        reason (str): Explanation for the score.
        status (str): Case status, either 'PASSED' or 'FAILURE'.
        latency (str): Latency in milliseconds, serialized as a string.

    Note:
        Score and latency are kept as strings for compatibility with
        external systems.
    """

    id: str
    input: str
    actual_output: str
    expected_output: str
    score: str
    reason: str
    status: str  # `PASSED` or `FAILURE`
    latency: str
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class EvalResultMetadata:
    """Metadata describing one evaluation run.

    Records which model was evaluated and which model judged it; used by
    reporting and tracing.

    Attributes:
        tested_model (str): Name of the model being tested.
        judge_model (str): Name of the judge model used.
    """

    tested_model: str
    judge_model: str
|