google-adk 0.5.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- google/adk/agents/base_agent.py +76 -30
- google/adk/agents/callback_context.py +2 -6
- google/adk/agents/llm_agent.py +122 -30
- google/adk/agents/loop_agent.py +1 -1
- google/adk/agents/parallel_agent.py +7 -0
- google/adk/agents/readonly_context.py +8 -0
- google/adk/agents/run_config.py +1 -1
- google/adk/agents/sequential_agent.py +31 -0
- google/adk/agents/transcription_entry.py +4 -2
- google/adk/artifacts/gcs_artifact_service.py +1 -1
- google/adk/artifacts/in_memory_artifact_service.py +1 -1
- google/adk/auth/auth_credential.py +10 -2
- google/adk/auth/auth_preprocessor.py +7 -1
- google/adk/auth/auth_tool.py +3 -4
- google/adk/cli/agent_graph.py +5 -5
- google/adk/cli/browser/index.html +4 -4
- google/adk/cli/browser/{main-ULN5R5I5.js → main-PKDNKWJE.js} +59 -60
- google/adk/cli/browser/polyfills-B6TNHZQ6.js +17 -0
- google/adk/cli/cli.py +10 -9
- google/adk/cli/cli_deploy.py +7 -2
- google/adk/cli/cli_eval.py +109 -115
- google/adk/cli/cli_tools_click.py +179 -67
- google/adk/cli/fast_api.py +248 -197
- google/adk/cli/utils/agent_loader.py +137 -0
- google/adk/cli/utils/cleanup.py +40 -0
- google/adk/cli/utils/common.py +23 -0
- google/adk/cli/utils/evals.py +83 -0
- google/adk/cli/utils/logs.py +8 -5
- google/adk/code_executors/__init__.py +3 -1
- google/adk/code_executors/built_in_code_executor.py +52 -0
- google/adk/code_executors/code_execution_utils.py +2 -1
- google/adk/code_executors/container_code_executor.py +0 -1
- google/adk/code_executors/vertex_ai_code_executor.py +6 -8
- google/adk/evaluation/__init__.py +1 -1
- google/adk/evaluation/agent_evaluator.py +168 -128
- google/adk/evaluation/eval_case.py +104 -0
- google/adk/evaluation/eval_metrics.py +74 -0
- google/adk/evaluation/eval_result.py +86 -0
- google/adk/evaluation/eval_set.py +39 -0
- google/adk/evaluation/eval_set_results_manager.py +47 -0
- google/adk/evaluation/eval_sets_manager.py +43 -0
- google/adk/evaluation/evaluation_generator.py +88 -113
- google/adk/evaluation/evaluator.py +58 -0
- google/adk/evaluation/local_eval_set_results_manager.py +113 -0
- google/adk/evaluation/local_eval_sets_manager.py +264 -0
- google/adk/evaluation/response_evaluator.py +106 -1
- google/adk/evaluation/trajectory_evaluator.py +84 -2
- google/adk/events/event.py +6 -1
- google/adk/events/event_actions.py +6 -1
- google/adk/examples/base_example_provider.py +1 -0
- google/adk/examples/example_util.py +3 -2
- google/adk/flows/llm_flows/_code_execution.py +9 -1
- google/adk/flows/llm_flows/audio_transcriber.py +4 -3
- google/adk/flows/llm_flows/base_llm_flow.py +58 -21
- google/adk/flows/llm_flows/contents.py +3 -1
- google/adk/flows/llm_flows/functions.py +9 -8
- google/adk/flows/llm_flows/instructions.py +18 -80
- google/adk/flows/llm_flows/single_flow.py +2 -2
- google/adk/memory/__init__.py +1 -1
- google/adk/memory/_utils.py +23 -0
- google/adk/memory/base_memory_service.py +23 -21
- google/adk/memory/in_memory_memory_service.py +57 -25
- google/adk/memory/memory_entry.py +37 -0
- google/adk/memory/vertex_ai_rag_memory_service.py +38 -15
- google/adk/models/anthropic_llm.py +16 -9
- google/adk/models/base_llm.py +2 -1
- google/adk/models/base_llm_connection.py +2 -0
- google/adk/models/gemini_llm_connection.py +11 -11
- google/adk/models/google_llm.py +12 -2
- google/adk/models/lite_llm.py +80 -23
- google/adk/models/llm_response.py +16 -3
- google/adk/models/registry.py +1 -1
- google/adk/runners.py +98 -42
- google/adk/sessions/__init__.py +1 -1
- google/adk/sessions/_session_util.py +2 -1
- google/adk/sessions/base_session_service.py +6 -33
- google/adk/sessions/database_session_service.py +57 -67
- google/adk/sessions/in_memory_session_service.py +106 -24
- google/adk/sessions/session.py +3 -0
- google/adk/sessions/vertex_ai_session_service.py +44 -51
- google/adk/telemetry.py +7 -2
- google/adk/tools/__init__.py +4 -7
- google/adk/tools/_memory_entry_utils.py +30 -0
- google/adk/tools/agent_tool.py +10 -10
- google/adk/tools/apihub_tool/apihub_toolset.py +55 -74
- google/adk/tools/apihub_tool/clients/apihub_client.py +10 -3
- google/adk/tools/apihub_tool/clients/secret_client.py +1 -0
- google/adk/tools/application_integration_tool/application_integration_toolset.py +111 -85
- google/adk/tools/application_integration_tool/clients/connections_client.py +28 -1
- google/adk/tools/application_integration_tool/clients/integration_client.py +7 -5
- google/adk/tools/application_integration_tool/integration_connector_tool.py +69 -26
- google/adk/tools/base_toolset.py +96 -0
- google/adk/tools/bigquery/__init__.py +28 -0
- google/adk/tools/bigquery/bigquery_credentials.py +216 -0
- google/adk/tools/bigquery/bigquery_tool.py +116 -0
- google/adk/tools/{built_in_code_execution_tool.py → enterprise_search_tool.py} +17 -11
- google/adk/tools/function_parameter_parse_util.py +9 -2
- google/adk/tools/function_tool.py +33 -3
- google/adk/tools/get_user_choice_tool.py +1 -0
- google/adk/tools/google_api_tool/__init__.py +24 -70
- google/adk/tools/google_api_tool/google_api_tool.py +12 -6
- google/adk/tools/google_api_tool/{google_api_tool_set.py → google_api_toolset.py} +57 -55
- google/adk/tools/google_api_tool/google_api_toolsets.py +108 -0
- google/adk/tools/google_api_tool/googleapi_to_openapi_converter.py +40 -42
- google/adk/tools/google_search_tool.py +2 -2
- google/adk/tools/langchain_tool.py +96 -49
- google/adk/tools/load_memory_tool.py +14 -5
- google/adk/tools/mcp_tool/__init__.py +3 -2
- google/adk/tools/mcp_tool/conversion_utils.py +6 -2
- google/adk/tools/mcp_tool/mcp_session_manager.py +80 -69
- google/adk/tools/mcp_tool/mcp_tool.py +35 -32
- google/adk/tools/mcp_tool/mcp_toolset.py +99 -194
- google/adk/tools/openapi_tool/auth/credential_exchangers/base_credential_exchanger.py +1 -3
- google/adk/tools/openapi_tool/auth/credential_exchangers/service_account_exchanger.py +6 -7
- google/adk/tools/openapi_tool/common/common.py +5 -1
- google/adk/tools/openapi_tool/openapi_spec_parser/__init__.py +7 -2
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +27 -7
- google/adk/tools/openapi_tool/openapi_spec_parser/operation_parser.py +36 -32
- google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +11 -1
- google/adk/tools/openapi_tool/openapi_spec_parser/tool_auth_handler.py +1 -1
- google/adk/tools/preload_memory_tool.py +27 -18
- google/adk/tools/retrieval/__init__.py +1 -1
- google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +1 -1
- google/adk/tools/toolbox_toolset.py +107 -0
- google/adk/tools/transfer_to_agent_tool.py +0 -1
- google/adk/utils/__init__.py +13 -0
- google/adk/utils/instructions_utils.py +131 -0
- google/adk/version.py +1 -1
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/METADATA +18 -19
- google_adk-1.1.0.dist-info/RECORD +200 -0
- google/adk/agents/remote_agent.py +0 -50
- google/adk/cli/browser/polyfills-FFHMD2TL.js +0 -18
- google/adk/cli/fast_api.py.orig +0 -728
- google/adk/tools/google_api_tool/google_api_tool_sets.py +0 -112
- google/adk/tools/toolbox_tool.py +0 -46
- google_adk-0.5.0.dist-info/RECORD +0 -180
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/WHEEL +0 -0
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/entry_points.txt +0 -0
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/licenses/LICENSE +0 -0
google/adk/evaluation/local_eval_sets_manager.py
ADDED
@@ -0,0 +1,264 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os
+import re
+import time
+from typing import Any
+import uuid
+
+from google.genai import types as genai_types
+from pydantic import ValidationError
+from typing_extensions import override
+
+from .eval_case import EvalCase
+from .eval_case import IntermediateData
+from .eval_case import Invocation
+from .eval_case import SessionInput
+from .eval_set import EvalSet
+from .eval_sets_manager import EvalSetsManager
+
+logger = logging.getLogger("google_adk." + __name__)
+
+_EVAL_SET_FILE_EXTENSION = ".evalset.json"
+
+
+def _convert_invocation_to_pydantic_schema(
+    invocation_in_json_format: dict[str, Any],
+) -> Invocation:
+  """Converts an invocation from old json format to new Pydantic Schema"""
+  query = invocation_in_json_format["query"]
+  reference = invocation_in_json_format["reference"]
+  expected_tool_use = []
+  expected_intermediate_agent_responses = []
+
+  for old_tool_use in invocation_in_json_format.get("expected_tool_use", []):
+    expected_tool_use.append(
+        genai_types.FunctionCall(
+            name=old_tool_use["tool_name"], args=old_tool_use["tool_input"]
+        )
+    )
+
+  for old_intermediate_response in invocation_in_json_format.get(
+      "expected_intermediate_agent_responses", []
+  ):
+    expected_intermediate_agent_responses.append((
+        old_intermediate_response["author"],
+        [genai_types.Part.from_text(text=old_intermediate_response["text"])],
+    ))
+
+  return Invocation(
+      invocation_id=str(uuid.uuid4()),
+      user_content=genai_types.Content(
+          parts=[genai_types.Part.from_text(text=query)], role="user"
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part.from_text(text=reference)], role="model"
+      ),
+      intermediate_data=IntermediateData(
+          tool_uses=expected_tool_use,
+          intermediate_responses=expected_intermediate_agent_responses,
+      ),
+      creation_timestamp=time.time(),
+  )
+
+
+def convert_eval_set_to_pydanctic_schema(
+    eval_set_id: str,
+    eval_set_in_json_format: list[dict[str, Any]],
+) -> EvalSet:
+  r"""Returns an pydantic EvalSet generated from the json representation.
+
+  Args:
+    eval_set_id: Eval set id.
+    eval_set_in_json_format: Eval set specified in JSON format.
+
+  Here is a sample eval set in JSON format:
+  [
+    {
+      "name": "roll_17_sided_dice_twice",
+      "data": [
+        {
+          "query": "What can you do?",
+          "expected_tool_use": [],
+          "expected_intermediate_agent_responses": [],
+          "reference": "I can roll dice of different sizes and check if a number
+          is prime. I can also use multiple tools in parallel.\n"
+        },
+        {
+          "query": "Roll a 17 sided dice twice for me",
+          "expected_tool_use": [
+            {
+              "tool_name": "roll_die",
+              "tool_input": {
+                "sides": 17
+              }
+            },
+            {
+              "tool_name": "roll_die",
+              "tool_input": {
+                "sides": 17
+              }
+            }
+          ],
+          "expected_intermediate_agent_responses": [],
+          "reference": "I have rolled a 17 sided die twice. The first roll was
+          13 and the second roll was 4.\n"
+        }
+      ],
+      "initial_session": {
+        "state": {},
+        "app_name": "hello_world",
+        "user_id": "user"
+      }
+    }
+  ]
+  """
+  eval_cases = []
+  for old_eval_case in eval_set_in_json_format:
+    new_invocations = []
+
+    for old_invocation in old_eval_case["data"]:
+      new_invocations.append(
+          _convert_invocation_to_pydantic_schema(old_invocation)
+      )
+
+    session_input = None
+    if (
+        "initial_session" in old_eval_case
+        and len(old_eval_case["initial_session"]) > 0
+    ):
+      session_input = SessionInput(
+          app_name=old_eval_case["initial_session"].get("app_name", ""),
+          user_id=old_eval_case["initial_session"].get("user_id", ""),
+          state=old_eval_case["initial_session"].get("state", {}),
+      )
+
+    new_eval_case = EvalCase(
+        eval_id=old_eval_case["name"],
+        conversation=new_invocations,
+        session_input=session_input,
+        creation_timestamp=time.time(),
+    )
+    eval_cases.append(new_eval_case)
+
+  return EvalSet(
+      eval_set_id=eval_set_id,
+      name=eval_set_id,
+      creation_timestamp=time.time(),
+      eval_cases=eval_cases,
+  )
+
+
+def load_eval_set_from_file(
+    eval_set_file_path: str, eval_set_id: str
+) -> EvalSet:
+  """Returns an EvalSet that is read from the given file."""
+  with open(eval_set_file_path, "r", encoding="utf-8") as f:
+    content = f.read()
+    try:
+      return EvalSet.model_validate_json(content)
+    except ValidationError:
+      # We assume that the eval data was specified in the old format and try
+      # to convert it to the new format.
+      return convert_eval_set_to_pydanctic_schema(
+          eval_set_id, json.loads(content)
+      )
+
+
+class LocalEvalSetsManager(EvalSetsManager):
+  """An EvalSets manager that stores eval sets locally on disk."""
+
+  def __init__(self, agents_dir: str):
+    self._agents_dir = agents_dir
+
+  @override
+  def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
+    """Returns an EvalSet identified by an app_name and eval_set_id."""
+    # Load the eval set file data
+    eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
+    return load_eval_set_from_file(eval_set_file_path, eval_set_id)
+
+  @override
+  def create_eval_set(self, app_name: str, eval_set_id: str):
+    """Creates an empty EvalSet given the app_name and eval_set_id."""
+    self._validate_id(id_name="Eval Set Id", id_value=eval_set_id)
+
+    # Define the file path
+    new_eval_set_path = self._get_eval_set_file_path(app_name, eval_set_id)
+
+    logger.info("Creating eval set file `%s`", new_eval_set_path)
+
+    if not os.path.exists(new_eval_set_path):
+      # Write the JSON string to the file
+      logger.info("Eval set file doesn't exist, we will create a new one.")
+      new_eval_set = EvalSet(
+          eval_set_id=eval_set_id,
+          name=eval_set_id,
+          eval_cases=[],
+          creation_timestamp=time.time(),
+      )
+      self._write_eval_set(new_eval_set_path, new_eval_set)
+
+  @override
+  def list_eval_sets(self, app_name: str) -> list[str]:
+    """Returns a list of EvalSets that belong to the given app_name."""
+    eval_set_file_path = os.path.join(self._agents_dir, app_name)
+    eval_sets = []
+    for file in os.listdir(eval_set_file_path):
+      if file.endswith(_EVAL_SET_FILE_EXTENSION):
+        eval_sets.append(
+            os.path.basename(file).removesuffix(_EVAL_SET_FILE_EXTENSION)
+        )
+
+    return sorted(eval_sets)
+
+  @override
+  def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
+    """Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id."""
+    eval_case_id = eval_case.eval_id
+    self._validate_id(id_name="Eval Case Id", id_value=eval_case_id)
+
+    eval_set = self.get_eval_set(app_name, eval_set_id)
+
+    if [x for x in eval_set.eval_cases if x.eval_id == eval_case_id]:
+      raise ValueError(
+          f"Eval id `{eval_case_id}` already exists in `{eval_set_id}`"
+          " eval set.",
+      )
+
+    eval_set.eval_cases.append(eval_case)
+
+    eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
+    self._write_eval_set(eval_set_file_path, eval_set)
+
+  def _get_eval_set_file_path(self, app_name: str, eval_set_id: str) -> str:
+    return os.path.join(
+        self._agents_dir,
+        app_name,
+        eval_set_id + _EVAL_SET_FILE_EXTENSION,
+    )
+
+  def _validate_id(self, id_name: str, id_value: str):
+    pattern = r"^[a-zA-Z0-9_]+$"
+    if not bool(re.fullmatch(pattern, id_value)):
+      raise ValueError(
+          f"Invalid {id_name}. {id_name} should have the `{pattern}` format",
+      )
+
+  def _write_eval_set(self, eval_set_path: str, eval_set: EvalSet):
+    with open(eval_set_path, "w") as f:
+      f.write(eval_set.model_dump_json(indent=2))
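For orientation, a minimal usage sketch of the new manager follows. The directory and ids (`my_agents`, `weather_agent`, `smoke_test`, `greeting`) are illustrative, and the `EvalCase`/`Invocation` fields mirror the constructors used in the conversion code above; `eval_case.py` itself is not part of this excerpt, so treat the exact field set as an assumption:

```python
import os
import time

from google.adk.evaluation.eval_case import EvalCase
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.local_eval_sets_manager import LocalEvalSetsManager
from google.genai import types as genai_types

manager = LocalEvalSetsManager(agents_dir="my_agents")
os.makedirs("my_agents/weather_agent", exist_ok=True)  # app dir must exist

# Writes my_agents/weather_agent/smoke_test.evalset.json to disk.
manager.create_eval_set(app_name="weather_agent", eval_set_id="smoke_test")

# Ids must match ^[a-zA-Z0-9_]+$, so e.g. "smoke-test" would raise ValueError.
manager.add_eval_case(
    app_name="weather_agent",
    eval_set_id="smoke_test",
    eval_case=EvalCase(
        eval_id="greeting",
        conversation=[
            Invocation(
                invocation_id="inv_1",
                user_content=genai_types.Content(
                    parts=[genai_types.Part.from_text(text="Hello")],
                    role="user",
                ),
                creation_timestamp=time.time(),
            )
        ],
        session_input=None,
        creation_timestamp=time.time(),
    ),
)

print(manager.list_eval_sets("weather_agent"))  # ['smoke_test']
```

Because `load_eval_set_from_file` falls back to `convert_eval_set_to_pydanctic_schema` on a `ValidationError`, pre-1.0 `.evalset.json` files in the old list-of-dicts format are converted transparently on read.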
google/adk/evaluation/response_evaluator.py
CHANGED
@@ -13,17 +13,122 @@
 # limitations under the License.
 
 from typing import Any
+from typing import Optional
 
+from deprecated import deprecated
+from google.genai import types as genai_types
 import pandas as pd
 from tabulate import tabulate
+from typing_extensions import override
 from vertexai.preview.evaluation import EvalTask
 from vertexai.preview.evaluation import MetricPromptTemplateExamples
 
+from .eval_case import IntermediateData
+from .eval_case import Invocation
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .evaluator import Evaluator
+from .evaluator import PerInvocationResult
 
-class ResponseEvaluator:
+
+class ResponseEvaluator(Evaluator):
   """Runs response evaluation for agents."""
 
+  def __init__(self, threshold: float, metric_name: str):
+    if "response_evaluation_score" == metric_name:
+      self._metric_name = MetricPromptTemplateExamples.Pointwise.COHERENCE
+    elif "response_match_score" == metric_name:
+      self._metric_name = "rouge_1"
+    else:
+      raise ValueError(f"`{metric_name}` is not supported.")
+
+    self._threshold = threshold
+
+  @override
+  def evaluate_invocations(
+      self,
+      actual_invocations: list[Invocation],
+      expected_invocations: list[Invocation],
+  ) -> EvaluationResult:
+    total_score = 0.0
+    num_invocations = 0
+    per_invocation_results = []
+    for actual, expected in zip(actual_invocations, expected_invocations):
+      prompt = self._get_text(expected.user_content)
+      reference = self._get_text(expected.final_response)
+      response = self._get_text(actual.final_response)
+      actual_tool_use = self._get_tool_use_trajectory(actual.intermediate_data)
+      reference_trajectory = self._get_tool_use_trajectory(
+          expected.intermediate_data
+      )
+
+      eval_case = {
+          "prompt": prompt,
+          "reference": reference,
+          "response": response,
+          "actual_tool_user": actual_tool_use,
+          "reference_trajectory": reference_trajectory,
+      }
+
+      eval_case_result = ResponseEvaluator._perform_eval(
+          pd.DataFrame([eval_case]), [self._metric_name]
+      )
+      score = self._get_score(eval_case_result)
+      per_invocation_results.append(
+          PerInvocationResult(
+              actual_invocation=actual,
+              expected_invocation=expected,
+              score=score,
+              eval_status=self._get_eval_status(score),
+          )
+      )
+      total_score += score
+      num_invocations += 1
+
+    if per_invocation_results:
+      overall_score = total_score / num_invocations
+      return EvaluationResult(
+          overall_score=overall_score,
+          overall_eval_status=self._get_eval_status(overall_score),
+          per_invocation_results=per_invocation_results,
+      )
+
+    return EvaluationResult()
+
+  def _get_text(self, content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  def _get_tool_use_trajectory(
+      self, intermediate_data: Optional[IntermediateData]
+  ) -> list[dict[str, Any]]:
+    tool_use_trajectory = []
+    if not intermediate_data:
+      return tool_use_trajectory
+
+    for function_call in intermediate_data.tool_uses:
+      tool_use_trajectory.append({
+          "tool_name": function_call.name,
+          "tool_input": function_call.args or {},
+      })
+
+    return tool_use_trajectory
+
+  def _get_score(self, eval_result) -> float:
+    return eval_result.summary_metrics[f"{self._metric_name}/mean"].item()
+
+  def _get_eval_status(self, score: float):
+    return EvalStatus.PASSED if score >= self._threshold else EvalStatus.FAILED
+
   @staticmethod
+  @deprecated(
+      reason=(
+          "This method has been deprecated and will be removed soon. Please use"
+          " evaluate_invocations instead."
+      )
+  )
   def evaluate(
       raw_eval_dataset: list[list[dict[str, Any]]],
       evaluation_criteria: list[str],
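In short, the new constructor maps the two public metric names onto concrete metrics, and `evaluate_invocations` averages per-invocation scores against the threshold. A small sketch of the thresholding behaviour (the scores are made up; actually computing `rouge_1` or coherence goes through `vertexai.preview.evaluation.EvalTask` and needs a configured Google Cloud project):

```python
from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.response_evaluator import ResponseEvaluator

# "response_match_score"      -> "rouge_1" text overlap against the reference
# "response_evaluation_score" -> Vertex AI pointwise COHERENCE rubric
evaluator = ResponseEvaluator(threshold=0.8, metric_name="response_match_score")

# Any other metric name is rejected up front:
# ResponseEvaluator(threshold=0.8, metric_name="bleu")  -> ValueError

# Per-invocation scores are averaged; the mean decides the overall status
# (poking a private helper here purely to illustrate the cutoff).
assert evaluator._get_eval_status(0.9) == EvalStatus.PASSED
assert evaluator._get_eval_status(0.5) == EvalStatus.FAILED
```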
google/adk/evaluation/trajectory_evaluator.py
CHANGED
@@ -13,17 +13,98 @@
 # limitations under the License.
 
 from typing import Any
+from typing import cast
 
+from deprecated import deprecated
+from google.genai import types as genai_types
 import pandas as pd
 from tabulate import tabulate
+from typing_extensions import override
 
+from .eval_case import Invocation
 from .evaluation_constants import EvalConstants
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .evaluator import Evaluator
+from .evaluator import PerInvocationResult
 
 
-class TrajectoryEvaluator:
+class TrajectoryEvaluator(Evaluator):
   """Evaluates tool use trajectories for accuracy."""
 
+  def __init__(self, threshold: float):
+    self._threshold = threshold
+
+  @override
+  def evaluate_invocations(
+      self,
+      actual_invocations: list[Invocation],
+      expected_invocations: list[Invocation],
+  ) -> EvaluationResult:
+    """Returns EvaluationResult after performing evaluations using actual and expected invocations."""
+    total_tool_use_accuracy = 0.0
+    num_invocations = 0
+    per_invocation_results = []
+
+    for actual, expected in zip(actual_invocations, expected_invocations):
+      actual_tool_uses = (
+          actual.intermediate_data.tool_uses if actual.intermediate_data else []
+      )
+      expected_tool_uses = (
+          expected.intermediate_data.tool_uses
+          if expected.intermediate_data
+          else []
+      )
+      tool_use_accuracy = (
+          1.0
+          if self._are_tool_calls_equal(actual_tool_uses, expected_tool_uses)
+          else 0.0
+      )
+      per_invocation_results.append(
+          PerInvocationResult(
+              actual_invocation=actual,
+              expected_invocation=expected,
+              score=tool_use_accuracy,
+              eval_status=self._get_eval_status(tool_use_accuracy),
+          )
+      )
+      total_tool_use_accuracy += tool_use_accuracy
+      num_invocations += 1
+
+    if per_invocation_results:
+      overall_score = total_tool_use_accuracy / num_invocations
+      return EvaluationResult(
+          overall_score=overall_score,
+          overall_eval_status=self._get_eval_status(overall_score),
+          per_invocation_results=per_invocation_results,
+      )
+
+    return EvaluationResult()
+
+  def _are_tool_calls_equal(
+      self,
+      actual_tool_calls: list[genai_types.FunctionCall],
+      expected_tool_calls: list[genai_types.FunctionCall],
+  ) -> bool:
+    if len(actual_tool_calls) != len(expected_tool_calls):
+      return False
+
+    for actual, expected in zip(actual_tool_calls, expected_tool_calls):
+      if actual.name != expected.name or actual.args != expected.args:
+        return False
+
+    return True
+
+  def _get_eval_status(self, score: float):
+    return EvalStatus.PASSED if score >= self._threshold else EvalStatus.FAILED
+
   @staticmethod
+  @deprecated(
+      reason=(
+          "This method has been deprecated and will be removed soon. Please use"
+          " evaluate_invocations instead."
+      )
+  )
   def evaluate(
       eval_dataset: list[list[dict[str, Any]]],
       *,
@@ -35,7 +116,7 @@ class TrajectoryEvaluator:
   tool use trajectories. An exact match scores a 1, 0 otherwise. The final
   number is an average of these individual scores.
 
-  Value range: [0, 1], where 0
+  Value range: [0, 1], where 0 means none of the tool use entries aligned,
   and 1 would mean all of them aligned. Higher value is good.
 
   Args:
@@ -137,6 +218,7 @@ class TrajectoryEvaluator:
     return new_row, failure
 
   @staticmethod
+  @deprecated()
   def are_tools_equal(list_a_original, list_b_original):
     # Remove other entries that we don't want to evaluate
     list_a = [
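The new scoring rule is all-or-nothing per invocation: a trajectory scores 1.0 only when the actual tool calls match the expected ones exactly (same order, names, and args), and the overall score is the mean across invocations. A sketch, constructing `Invocation` objects with the same fields the conversion code above uses (the full `eval_case.py` schema is not shown in this diff):

```python
import time

from google.genai import types as genai_types

from google.adk.evaluation.eval_case import IntermediateData
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.trajectory_evaluator import TrajectoryEvaluator


def make_invocation(*calls: genai_types.FunctionCall) -> Invocation:
  return Invocation(
      invocation_id="inv",
      user_content=genai_types.Content(
          parts=[genai_types.Part.from_text(text="Roll a die")], role="user"
      ),
      intermediate_data=IntermediateData(
          tool_uses=list(calls), intermediate_responses=[]
      ),
      creation_timestamp=time.time(),
  )


expected = make_invocation(
    genai_types.FunctionCall(name="roll_die", args={"sides": 17})
)
actual = make_invocation(
    genai_types.FunctionCall(name="roll_die", args={"sides": 6})
)

result = TrajectoryEvaluator(threshold=1.0).evaluate_invocations(
    [actual], [expected]
)
# args differ (6 vs. 17), so this invocation scores 0.0 and the overall
# status comes back as EvalStatus.FAILED.
```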
google/adk/events/event.py
CHANGED
@@ -19,6 +19,7 @@ import string
 from typing import Optional
 
 from google.genai import types
+from pydantic import alias_generators
 from pydantic import ConfigDict
 from pydantic import Field
 
@@ -46,7 +47,11 @@ class Event(LlmResponse):
   """
 
   model_config = ConfigDict(
-      extra='forbid',
+      extra='forbid',
+      ser_json_bytes='base64',
+      val_json_bytes='base64',
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
   )
   """The pydantic model config."""
 
google/adk/events/event_actions.py
CHANGED
@@ -16,6 +16,7 @@ from __future__ import annotations
 
 from typing import Optional
 
+from pydantic import alias_generators
 from pydantic import BaseModel
 from pydantic import ConfigDict
 from pydantic import Field
@@ -26,7 +27,11 @@ from ..auth.auth_tool import AuthConfig
 class EventActions(BaseModel):
   """Represents the actions attached to an event."""
 
-  model_config = ConfigDict(
+  model_config = ConfigDict(
+      extra='forbid',
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
   """The pydantic model config."""
 
   skip_summarization: Optional[bool] = None
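Both `Event` and `EventActions` now carry the same camelCase alias configuration, so their JSON form uses camelCase keys while Python code keeps snake_case attributes (and `Event` additionally round-trips `bytes` as base64). A standalone pydantic sketch of the same config; `Demo` is a made-up model, not the ADK classes:

```python
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import alias_generators


class Demo(BaseModel):
  model_config = ConfigDict(
      extra='forbid',
      alias_generator=alias_generators.to_camel,
      populate_by_name=True,
  )

  invocation_id: str


# camelCase keys validate thanks to the alias generator...
d = Demo.model_validate({'invocationId': 'abc'})
# ...and populate_by_name keeps the snake_case spelling working too.
d = Demo(invocation_id='abc')

print(d.model_dump_json(by_alias=True))  # {"invocationId":"abc"}
```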
google/adk/examples/example_util.py
CHANGED
@@ -15,8 +15,9 @@
 """Utility functions for converting examples to a string that can be used in system instructions in the prompt."""
 
 import logging
-from typing import Optional
+from typing import Optional
 from typing import TYPE_CHECKING
+from typing import Union
 
 from .base_example_provider import BaseExampleProvider
 from .example import Example
@@ -24,7 +25,7 @@ from .example import Example
 if TYPE_CHECKING:
   from ..sessions.session import Session
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger("google_adk." + __name__)
 
 # Constant parts of the example string
 _EXAMPLES_INTRO = (
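A side effect worth noting: ADK loggers now hang off a common `google_adk` root (here and in the new modules above), so an application can tune them all in one place:

```python
import logging

# One setting now covers every ADK module logger, e.g.
# "google_adk.google.adk.examples.example_util".
logging.getLogger('google_adk').setLevel(logging.DEBUG)
```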
google/adk/flows/llm_flows/_code_execution.py
CHANGED
@@ -22,7 +22,6 @@ import dataclasses
 import os
 import re
 from typing import AsyncGenerator
-from typing import Generator
 from typing import Optional
 from typing import TYPE_CHECKING
 
@@ -31,6 +30,7 @@ from typing_extensions import override
 
 from ...agents.invocation_context import InvocationContext
 from ...code_executors.base_code_executor import BaseCodeExecutor
+from ...code_executors.built_in_code_executor import BuiltInCodeExecutor
 from ...code_executors.code_execution_utils import CodeExecutionInput
 from ...code_executors.code_execution_utils import CodeExecutionResult
 from ...code_executors.code_execution_utils import CodeExecutionUtils
@@ -174,6 +174,11 @@ async def _run_pre_processor(
 
   if not code_executor or not isinstance(code_executor, BaseCodeExecutor):
     return
+
+  if isinstance(code_executor, BuiltInCodeExecutor):
+    code_executor.process_llm_request(llm_request)
+    return
+
   if not code_executor.optimize_data_file:
     return
 
@@ -262,6 +267,9 @@ async def _run_post_processor(
   if not llm_response or not llm_response.content:
     return
 
+  if isinstance(code_executor, BuiltInCodeExecutor):
+    return
+
   code_executor_context = CodeExecutorContext(invocation_context.session.state)
   # Skip if the error count exceeds the max retry attempts.
   if (
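The early returns mean that with a `BuiltInCodeExecutor` the flow only tags the outgoing request (so Gemini executes code server-side) and skips all local execution and retry bookkeeping. A wiring sketch, assuming `BuiltInCodeExecutor` is re-exported from `google.adk.code_executors` as the updated `__init__.py` in the file list suggests:

```python
from google.adk.agents import LlmAgent
from google.adk.code_executors import BuiltInCodeExecutor

# Code runs inside the model itself; the local pre/post-processors above
# short-circuit as soon as they see this executor type.
agent = LlmAgent(
    name='coder',
    model='gemini-2.0-flash',
    code_executor=BuiltInCodeExecutor(),
)
```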
google/adk/flows/llm_flows/audio_transcriber.py
CHANGED
@@ -25,8 +25,9 @@ if TYPE_CHECKING:
 class AudioTranscriber:
   """Transcribes audio using Google Cloud Speech-to-Text."""
 
-  def __init__(self):
-    self.client = speech.SpeechClient()
+  def __init__(self, init_client=False):
+    if init_client:
+      self.client = speech.SpeechClient()
 
   def transcribe_file(
       self, invocation_context: InvocationContext
@@ -84,7 +85,7 @@ class AudioTranscriber:
 
     # Step2: transcription
     for speaker, data in bundled_audio:
-      if
+      if isinstance(data, genai_types.Blob):
        audio = speech.RecognitionAudio(content=data)
 
        config = speech.RecognitionConfig(
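The Speech-to-Text client is now created lazily, so constructing an `AudioTranscriber` no longer requires GCP credentials unless transcription is actually used:

```python
from google.adk.flows.llm_flows.audio_transcriber import AudioTranscriber

# Default: no SpeechClient is built, so this works without GCP credentials.
transcriber = AudioTranscriber()

# Opt in when live audio actually needs transcribing (requires
# google-cloud-speech and application default credentials).
transcriber = AudioTranscriber(init_client=True)
```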