veadk-python 0.2.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veadk/__init__.py +37 -0
- veadk/a2a/__init__.py +13 -0
- veadk/a2a/agent_card.py +45 -0
- veadk/a2a/remote_ve_agent.py +390 -0
- veadk/a2a/utils/__init__.py +13 -0
- veadk/a2a/utils/agent_to_a2a.py +170 -0
- veadk/a2a/ve_a2a_server.py +93 -0
- veadk/a2a/ve_agent_executor.py +78 -0
- veadk/a2a/ve_middlewares.py +313 -0
- veadk/a2a/ve_task_store.py +37 -0
- veadk/agent.py +402 -0
- veadk/agent_builder.py +93 -0
- veadk/agents/loop_agent.py +68 -0
- veadk/agents/parallel_agent.py +72 -0
- veadk/agents/sequential_agent.py +64 -0
- veadk/auth/__init__.py +13 -0
- veadk/auth/base_auth.py +22 -0
- veadk/auth/ve_credential_service.py +203 -0
- veadk/auth/veauth/__init__.py +13 -0
- veadk/auth/veauth/apmplus_veauth.py +58 -0
- veadk/auth/veauth/ark_veauth.py +75 -0
- veadk/auth/veauth/base_veauth.py +50 -0
- veadk/auth/veauth/cozeloop_veauth.py +13 -0
- veadk/auth/veauth/opensearch_veauth.py +75 -0
- veadk/auth/veauth/postgresql_veauth.py +75 -0
- veadk/auth/veauth/prompt_pilot_veauth.py +60 -0
- veadk/auth/veauth/speech_veauth.py +54 -0
- veadk/auth/veauth/utils.py +69 -0
- veadk/auth/veauth/vesearch_veauth.py +62 -0
- veadk/auth/veauth/viking_mem0_veauth.py +91 -0
- veadk/cli/__init__.py +13 -0
- veadk/cli/cli.py +58 -0
- veadk/cli/cli_clean.py +87 -0
- veadk/cli/cli_create.py +163 -0
- veadk/cli/cli_deploy.py +233 -0
- veadk/cli/cli_eval.py +215 -0
- veadk/cli/cli_init.py +214 -0
- veadk/cli/cli_kb.py +110 -0
- veadk/cli/cli_pipeline.py +285 -0
- veadk/cli/cli_prompt.py +86 -0
- veadk/cli/cli_update.py +106 -0
- veadk/cli/cli_uploadevalset.py +139 -0
- veadk/cli/cli_web.py +143 -0
- veadk/cloud/__init__.py +13 -0
- veadk/cloud/cloud_agent_engine.py +485 -0
- veadk/cloud/cloud_app.py +475 -0
- veadk/config.py +115 -0
- veadk/configs/__init__.py +13 -0
- veadk/configs/auth_configs.py +133 -0
- veadk/configs/database_configs.py +132 -0
- veadk/configs/model_configs.py +78 -0
- veadk/configs/tool_configs.py +54 -0
- veadk/configs/tracing_configs.py +110 -0
- veadk/consts.py +74 -0
- veadk/evaluation/__init__.py +17 -0
- veadk/evaluation/adk_evaluator/__init__.py +17 -0
- veadk/evaluation/adk_evaluator/adk_evaluator.py +302 -0
- veadk/evaluation/base_evaluator.py +642 -0
- veadk/evaluation/deepeval_evaluator/__init__.py +17 -0
- veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +339 -0
- veadk/evaluation/eval_set_file_loader.py +48 -0
- veadk/evaluation/eval_set_recorder.py +146 -0
- veadk/evaluation/types.py +65 -0
- veadk/evaluation/utils/prometheus.py +196 -0
- veadk/integrations/__init__.py +13 -0
- veadk/integrations/ve_apig/__init__.py +13 -0
- veadk/integrations/ve_apig/ve_apig.py +349 -0
- veadk/integrations/ve_apig/ve_apig_utils.py +332 -0
- veadk/integrations/ve_code_pipeline/__init__.py +13 -0
- veadk/integrations/ve_code_pipeline/ve_code_pipeline.py +431 -0
- veadk/integrations/ve_cozeloop/__init__.py +13 -0
- veadk/integrations/ve_cozeloop/ve_cozeloop.py +96 -0
- veadk/integrations/ve_cr/__init__.py +13 -0
- veadk/integrations/ve_cr/ve_cr.py +220 -0
- veadk/integrations/ve_faas/__init__.py +13 -0
- veadk/integrations/ve_faas/template/cookiecutter.json +15 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/config.yaml.example +6 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/deploy.py +106 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/__init__.py +13 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/agent.py +25 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/app.py +202 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/requirements.txt +3 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/run.sh +49 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/__init__.py +14 -0
- veadk/integrations/ve_faas/template/{{cookiecutter.local_dir_name}}/src/{{ cookiecutter.app_name }}/agent.py +27 -0
- veadk/integrations/ve_faas/ve_faas.py +754 -0
- veadk/integrations/ve_faas/ve_faas_utils.py +408 -0
- veadk/integrations/ve_faas/web_template/cookiecutter.json +20 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/__init__.py +13 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/clean.py +23 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/config.yaml.example +2 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/deploy.py +44 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/Dockerfile +23 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/app.py +123 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/init_db.py +46 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/models.py +36 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/requirements.txt +4 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/run.sh +21 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/css/style.css +368 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/static/js/admin.js +0 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/dashboard.html +21 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/edit_post.html +24 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/login.html +21 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/admin/posts.html +53 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/base.html +45 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/index.html +29 -0
- veadk/integrations/ve_faas/web_template/{{cookiecutter.local_dir_name}}/src/templates/post.html +14 -0
- veadk/integrations/ve_identity/__init__.py +110 -0
- veadk/integrations/ve_identity/auth_config.py +261 -0
- veadk/integrations/ve_identity/auth_mixins.py +650 -0
- veadk/integrations/ve_identity/auth_processor.py +385 -0
- veadk/integrations/ve_identity/function_tool.py +158 -0
- veadk/integrations/ve_identity/identity_client.py +864 -0
- veadk/integrations/ve_identity/mcp_tool.py +181 -0
- veadk/integrations/ve_identity/mcp_toolset.py +431 -0
- veadk/integrations/ve_identity/models.py +228 -0
- veadk/integrations/ve_identity/token_manager.py +188 -0
- veadk/integrations/ve_identity/utils.py +151 -0
- veadk/integrations/ve_prompt_pilot/__init__.py +13 -0
- veadk/integrations/ve_prompt_pilot/ve_prompt_pilot.py +85 -0
- veadk/integrations/ve_tls/__init__.py +13 -0
- veadk/integrations/ve_tls/utils.py +116 -0
- veadk/integrations/ve_tls/ve_tls.py +212 -0
- veadk/integrations/ve_tos/ve_tos.py +710 -0
- veadk/integrations/ve_viking_db_memory/__init__.py +13 -0
- veadk/integrations/ve_viking_db_memory/ve_viking_db_memory.py +308 -0
- veadk/knowledgebase/__init__.py +17 -0
- veadk/knowledgebase/backends/__init__.py +13 -0
- veadk/knowledgebase/backends/base_backend.py +72 -0
- veadk/knowledgebase/backends/in_memory_backend.py +91 -0
- veadk/knowledgebase/backends/opensearch_backend.py +162 -0
- veadk/knowledgebase/backends/redis_backend.py +172 -0
- veadk/knowledgebase/backends/utils.py +92 -0
- veadk/knowledgebase/backends/vikingdb_knowledge_backend.py +608 -0
- veadk/knowledgebase/entry.py +25 -0
- veadk/knowledgebase/knowledgebase.py +307 -0
- veadk/memory/__init__.py +35 -0
- veadk/memory/long_term_memory.py +365 -0
- veadk/memory/long_term_memory_backends/__init__.py +13 -0
- veadk/memory/long_term_memory_backends/base_backend.py +35 -0
- veadk/memory/long_term_memory_backends/in_memory_backend.py +67 -0
- veadk/memory/long_term_memory_backends/mem0_backend.py +155 -0
- veadk/memory/long_term_memory_backends/opensearch_backend.py +124 -0
- veadk/memory/long_term_memory_backends/redis_backend.py +140 -0
- veadk/memory/long_term_memory_backends/vikingdb_memory_backend.py +189 -0
- veadk/memory/short_term_memory.py +252 -0
- veadk/memory/short_term_memory_backends/__init__.py +13 -0
- veadk/memory/short_term_memory_backends/base_backend.py +31 -0
- veadk/memory/short_term_memory_backends/mysql_backend.py +49 -0
- veadk/memory/short_term_memory_backends/postgresql_backend.py +49 -0
- veadk/memory/short_term_memory_backends/sqlite_backend.py +55 -0
- veadk/memory/short_term_memory_processor.py +100 -0
- veadk/processors/__init__.py +26 -0
- veadk/processors/base_run_processor.py +120 -0
- veadk/prompts/__init__.py +13 -0
- veadk/prompts/agent_default_prompt.py +30 -0
- veadk/prompts/prompt_evaluator.py +20 -0
- veadk/prompts/prompt_memory_processor.py +55 -0
- veadk/prompts/prompt_optimization.py +150 -0
- veadk/runner.py +732 -0
- veadk/tools/__init__.py +13 -0
- veadk/tools/builtin_tools/__init__.py +13 -0
- veadk/tools/builtin_tools/agent_authorization.py +94 -0
- veadk/tools/builtin_tools/generate_image.py +23 -0
- veadk/tools/builtin_tools/image_edit.py +300 -0
- veadk/tools/builtin_tools/image_generate.py +446 -0
- veadk/tools/builtin_tools/lark.py +67 -0
- veadk/tools/builtin_tools/las.py +24 -0
- veadk/tools/builtin_tools/link_reader.py +66 -0
- veadk/tools/builtin_tools/llm_shield.py +381 -0
- veadk/tools/builtin_tools/load_knowledgebase.py +97 -0
- veadk/tools/builtin_tools/mcp_router.py +29 -0
- veadk/tools/builtin_tools/run_code.py +113 -0
- veadk/tools/builtin_tools/tts.py +253 -0
- veadk/tools/builtin_tools/vesearch.py +49 -0
- veadk/tools/builtin_tools/video_generate.py +363 -0
- veadk/tools/builtin_tools/web_scraper.py +76 -0
- veadk/tools/builtin_tools/web_search.py +83 -0
- veadk/tools/demo_tools.py +58 -0
- veadk/tools/load_knowledgebase_tool.py +149 -0
- veadk/tools/sandbox/__init__.py +13 -0
- veadk/tools/sandbox/browser_sandbox.py +37 -0
- veadk/tools/sandbox/code_sandbox.py +40 -0
- veadk/tools/sandbox/computer_sandbox.py +34 -0
- veadk/tracing/__init__.py +13 -0
- veadk/tracing/base_tracer.py +58 -0
- veadk/tracing/telemetry/__init__.py +13 -0
- veadk/tracing/telemetry/attributes/attributes.py +29 -0
- veadk/tracing/telemetry/attributes/extractors/common_attributes_extractors.py +180 -0
- veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +858 -0
- veadk/tracing/telemetry/attributes/extractors/tool_attributes_extractors.py +152 -0
- veadk/tracing/telemetry/attributes/extractors/types.py +164 -0
- veadk/tracing/telemetry/exporters/__init__.py +13 -0
- veadk/tracing/telemetry/exporters/apmplus_exporter.py +558 -0
- veadk/tracing/telemetry/exporters/base_exporter.py +39 -0
- veadk/tracing/telemetry/exporters/cozeloop_exporter.py +129 -0
- veadk/tracing/telemetry/exporters/inmemory_exporter.py +248 -0
- veadk/tracing/telemetry/exporters/tls_exporter.py +139 -0
- veadk/tracing/telemetry/opentelemetry_tracer.py +320 -0
- veadk/tracing/telemetry/telemetry.py +411 -0
- veadk/types.py +47 -0
- veadk/utils/__init__.py +13 -0
- veadk/utils/audio_manager.py +95 -0
- veadk/utils/auth.py +294 -0
- veadk/utils/logger.py +59 -0
- veadk/utils/mcp_utils.py +44 -0
- veadk/utils/misc.py +184 -0
- veadk/utils/patches.py +101 -0
- veadk/utils/volcengine_sign.py +205 -0
- veadk/version.py +15 -0
- veadk_python-0.2.27.dist-info/METADATA +373 -0
- veadk_python-0.2.27.dist-info/RECORD +218 -0
- veadk_python-0.2.27.dist-info/WHEEL +5 -0
- veadk_python-0.2.27.dist-info/entry_points.txt +2 -0
- veadk_python-0.2.27.dist-info/licenses/LICENSE +201 -0
- veadk_python-0.2.27.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
import time
|
|
17
|
+
import uuid
|
|
18
|
+
from os import path
|
|
19
|
+
|
|
20
|
+
from google.adk.evaluation.agent_evaluator import (
|
|
21
|
+
RESPONSE_MATCH_SCORE_KEY,
|
|
22
|
+
TOOL_TRAJECTORY_SCORE_KEY,
|
|
23
|
+
AgentEvaluator,
|
|
24
|
+
)
|
|
25
|
+
from google.adk.evaluation.eval_case import IntermediateData, Invocation
|
|
26
|
+
from google.adk.evaluation.evaluator import EvalStatus
|
|
27
|
+
from google.adk.evaluation.eval_set import EvalSet
|
|
28
|
+
from typing import Optional
|
|
29
|
+
from typing_extensions import override
|
|
30
|
+
from veadk.evaluation.base_evaluator import BaseEvaluator
|
|
31
|
+
from types import SimpleNamespace
|
|
32
|
+
from google.genai import types as genai_types
|
|
33
|
+
|
|
34
|
+
from google.adk.evaluation.eval_metrics import EvalMetric
|
|
35
|
+
from google.adk.evaluation.metric_evaluator_registry import (
|
|
36
|
+
DEFAULT_METRIC_EVALUATOR_REGISTRY,
|
|
37
|
+
)
|
|
38
|
+
import inspect
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def formatted_timestamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string.

    The timestamp is taken from the local clock at call time.

    Returns:
        str: A 14-character timestamp such as ``'20251028123045'``.
    """
    # struct_time fields [:6] are (year, month, day, hour, minute, second);
    # zero-pad each into a fixed-width 14-character stamp.
    now = time.localtime()
    return "{:04d}{:02d}{:02d}{:02d}{:02d}{:02d}".format(*now[:6])
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ADKEvaluator(BaseEvaluator):
    """Evaluates agents using Google ADK metrics.

    This class uses Google's Agent Development Kit (ADK) to test agents.
    It checks tool usage (tool trajectory) and response quality (Rouge-1
    text match), running each test multiple times for more reliable scores.

    Attributes:
        name (str): Name of this evaluator. Defaults to 'veadk_adk_evaluator'.

    Note:
        Works with .test.json files and folders of such files.
        Default thresholds: tool=1.0, response=0.8.
        Runs each test multiple times (default 2) for average scores.

    Examples:
        ```python
        agent = Agent(tools=[get_city_weather])
        evaluator = ADKEvaluator(agent=agent)
        results, failures = await evaluator.evaluate(eval_set_file_path="test_folder")
        ```
    """

    def __init__(
        self,
        agent,
        name: str = "veadk_adk_evaluator",
    ):
        """Initializes the ADK evaluator with agent and name.

        Args:
            agent: The agent to evaluate.
            name (str): Name of the evaluator. Defaults to 'veadk_adk_evaluator'.

        Raises:
            ValueError: If agent is invalid.
        """
        super().__init__(agent=agent, name=name)

    @override
    async def evaluate(
        self,
        eval_set: Optional[EvalSet] = None,
        eval_set_file_path: Optional[str] = None,
        eval_id: Optional[str] = None,
        tool_score_threshold: float = 1.0,
        response_match_score_threshold: float = 0.8,
        num_runs: int = 2,
        print_detailed_results: bool = True,
    ):
        """Tests agent using ADK metrics on test cases.

        This method does these steps:
        1. Finds test files in folder or single file
        2. Sets up scoring rules with thresholds
        3. Runs agent multiple times for each test
        4. Converts data to ADK format
        5. Scores tool usage and response quality
        6. Collects results and failures

        Args:
            eval_set: Test cases in memory. If given, used first.
            eval_set_file_path: Path to test file or folder. Used if no eval_set.
            eval_id: Unique name for this test run. Defaults to a fresh
                ``test_<timestamp>`` generated at call time.
            tool_score_threshold: Minimum score for tool usage. 1.0 means perfect.
            response_match_score_threshold: Minimum score for response match.
                Uses text similarity. 0.8 is default.
            num_runs: How many times to run each test. More runs = more reliable.
            print_detailed_results: If True, shows detailed scores for each test.

        Returns:
            tuple[list, list]: Two lists:
                - List of evaluation results with scores
                - List of failure messages if tests failed

        Raises:
            ValueError: If no test cases found or thresholds wrong.
            FileNotFoundError: If test file not found.
            EvaluationError: If agent fails or scoring fails.
        """
        # BUG FIX: the original signature used
        # `eval_id: str = f"test_{formatted_timestamp()}"`, which evaluates the
        # default ONCE at import time, so every call without an explicit
        # eval_id shared the same frozen timestamp. Compute a fresh id per
        # call instead; explicit callers are unaffected.
        if eval_id is None:
            eval_id = f"test_{formatted_timestamp()}"

        # Resolve eval files: accept a directory (scan *.test.json) or a single file
        test_files = []
        eval_dataset_file_path_or_dir = eval_set_file_path
        if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir(
            eval_dataset_file_path_or_dir
        ):
            for root, _, files in os.walk(eval_dataset_file_path_or_dir):
                for file in files:
                    if file.endswith(".test.json"):
                        test_files.append(path.join(root, file))
        else:
            # NOTE: when eval_set_file_path is None this yields [None];
            # build_eval_set is expected to prefer the in-memory eval_set then.
            test_files = [eval_dataset_file_path_or_dir]

        # Build metric criteria (metric_name -> threshold)
        criteria = {
            TOOL_TRAJECTORY_SCORE_KEY: tool_score_threshold,  # 1-point scale; 1.0 means perfect tool call trajectory
            RESPONSE_MATCH_SCORE_KEY: response_match_score_threshold,  # Rouge-1 text match; 0.8 default threshold
        }

        # Aggregate all evaluation results and failures across files
        result = []
        failures = []

        # Iterate each test file and evaluate per-case, per-metric
        for test_file in test_files:
            # Build in-memory evaluation cases via BaseEvaluator from the provided file
            self.build_eval_set(eval_set, test_file)

            evaluation_result_list = []

            # For each eval case, generate actual outputs num_runs times using BaseEvaluator
            for eval_case_data in self.invocation_list:
                # Convert BaseEvaluator's expected data into ADK Invocation list
                expected_invocations: list[Invocation] = []
                for inv in eval_case_data.invocations:
                    user_content = genai_types.Content(
                        role="user",
                        parts=[genai_types.Part(text=inv.input or "")],
                    )
                    expected_final = genai_types.Content(
                        role=None,
                        parts=[genai_types.Part(text=inv.expected_output or "")],
                    )
                    expected_tool_calls = [
                        SimpleNamespace(name=t.get("name"), args=t.get("args", {}))
                        for t in (inv.expected_tool or [])
                    ]
                    # Pack a full expected Invocation for ADK metrics
                    expected_invocations.append(
                        Invocation(
                            invocation_id=inv.invocation_id,
                            user_content=user_content,
                            final_response=expected_final,
                            intermediate_data=IntermediateData(
                                tool_uses=expected_tool_calls
                            ),
                        )
                    )

                # Collect actual invocations across runs
                actual_invocations_all_runs: list[Invocation] = []
                for _ in range(num_runs):
                    # Fresh session per run so runs don't share conversation state
                    for agent_information in self.agent_information_list:
                        agent_information["session_id"] = str(uuid.uuid4())

                    # Generate actual outputs for all cases in this run via BaseEvaluator
                    await self.generate_actual_outputs()

                    # Convert BaseEvaluator's actual data into ADK Invocation list
                    for inv in eval_case_data.invocations:
                        user_content = genai_types.Content(
                            role="user",
                            parts=[genai_types.Part(text=inv.input or "")],
                        )
                        actual_final = genai_types.Content(
                            role=None,
                            parts=[genai_types.Part(text=inv.actual_output or "")],
                        )
                        # Collect the tool calls observed during actual execution
                        actual_tool_calls = [
                            SimpleNamespace(name=t.get("name"), args=t.get("args", {}))
                            for t in (inv.actual_tool or [])
                        ]
                        # Pack a full actual Invocation for ADK metrics
                        actual_invocations_all_runs.append(
                            Invocation(
                                invocation_id=inv.invocation_id,
                                user_content=user_content,
                                final_response=actual_final,
                                intermediate_data=IntermediateData(
                                    tool_uses=actual_tool_calls
                                ),
                            )
                        )

                # Repeat expected invocations to align with num_runs
                expected_invocations_repeated = expected_invocations * num_runs

                # Evaluate per metric via ADK metric evaluators obtained from the registry
                for metric_name, threshold in criteria.items():
                    eval_metric = EvalMetric(
                        metric_name=metric_name, threshold=threshold
                    )
                    metric_evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(
                        eval_metric=eval_metric
                    )

                    # Some registry evaluators are async, others sync — dispatch accordingly
                    if inspect.iscoroutinefunction(
                        metric_evaluator.evaluate_invocations
                    ):
                        evaluation_result = await metric_evaluator.evaluate_invocations(
                            actual_invocations=actual_invocations_all_runs,
                            expected_invocations=expected_invocations_repeated,
                        )
                    else:
                        evaluation_result = metric_evaluator.evaluate_invocations(
                            actual_invocations=actual_invocations_all_runs,
                            expected_invocations=expected_invocations_repeated,
                        )

                    if print_detailed_results:
                        # Re-shape per-invocation results into the structure
                        # AgentEvaluator._print_details expects
                        per_items = []
                        for i, per in enumerate(
                            getattr(evaluation_result, "per_invocation_results", [])
                            or []
                        ):
                            per_items.append(
                                SimpleNamespace(
                                    actual_invocation=actual_invocations_all_runs[i],
                                    expected_invocation=expected_invocations_repeated[
                                        i
                                    ],
                                    eval_metric_result=SimpleNamespace(
                                        eval_status=per.eval_status,
                                        score=per.score,
                                        threshold=threshold,
                                    ),
                                )
                            )

                        AgentEvaluator._print_details(
                            eval_metric_result_with_invocations=per_items,
                            overall_eval_status=evaluation_result.overall_eval_status,
                            overall_score=evaluation_result.overall_score,
                            metric_name=metric_name,
                            threshold=threshold,
                        )

                    if evaluation_result.overall_eval_status != EvalStatus.PASSED:
                        failures.append(
                            f"{metric_name} for {self.agent.name} Failed. Expected {threshold},"
                            f" but got {evaluation_result.overall_score}."
                        )

                    evaluation_result_list.append(evaluation_result)

            result.append(evaluation_result_list)

        return result, failures
|