google-adk 0.4.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google/adk/agents/active_streaming_tool.py +1 -0
- google/adk/agents/base_agent.py +91 -47
- google/adk/agents/base_agent.py.orig +330 -0
- google/adk/agents/callback_context.py +4 -9
- google/adk/agents/invocation_context.py +1 -0
- google/adk/agents/langgraph_agent.py +1 -0
- google/adk/agents/live_request_queue.py +1 -0
- google/adk/agents/llm_agent.py +172 -35
- google/adk/agents/loop_agent.py +1 -1
- google/adk/agents/parallel_agent.py +7 -0
- google/adk/agents/readonly_context.py +7 -1
- google/adk/agents/run_config.py +5 -1
- google/adk/agents/sequential_agent.py +31 -0
- google/adk/agents/transcription_entry.py +5 -2
- google/adk/artifacts/base_artifact_service.py +5 -10
- google/adk/artifacts/gcs_artifact_service.py +9 -9
- google/adk/artifacts/in_memory_artifact_service.py +6 -6
- google/adk/auth/auth_credential.py +9 -5
- google/adk/auth/auth_preprocessor.py +7 -1
- google/adk/auth/auth_tool.py +3 -4
- google/adk/cli/agent_graph.py +5 -5
- google/adk/cli/browser/index.html +2 -2
- google/adk/cli/browser/{main-HWIBUY2R.js → main-QOEMUXM4.js} +58 -58
- google/adk/cli/cli.py +7 -7
- google/adk/cli/cli_deploy.py +7 -2
- google/adk/cli/cli_eval.py +181 -106
- google/adk/cli/cli_tools_click.py +147 -62
- google/adk/cli/fast_api.py +340 -158
- google/adk/cli/fast_api.py.orig +822 -0
- google/adk/cli/utils/common.py +23 -0
- google/adk/cli/utils/evals.py +83 -1
- google/adk/cli/utils/logs.py +13 -5
- google/adk/code_executors/__init__.py +3 -1
- google/adk/code_executors/built_in_code_executor.py +52 -0
- google/adk/evaluation/__init__.py +1 -1
- google/adk/evaluation/agent_evaluator.py +168 -128
- google/adk/evaluation/eval_case.py +102 -0
- google/adk/evaluation/eval_set.py +37 -0
- google/adk/evaluation/eval_sets_manager.py +42 -0
- google/adk/evaluation/evaluation_constants.py +1 -0
- google/adk/evaluation/evaluation_generator.py +89 -114
- google/adk/evaluation/evaluator.py +56 -0
- google/adk/evaluation/local_eval_sets_manager.py +264 -0
- google/adk/evaluation/response_evaluator.py +107 -3
- google/adk/evaluation/trajectory_evaluator.py +83 -2
- google/adk/events/event.py +7 -1
- google/adk/events/event_actions.py +7 -1
- google/adk/examples/example.py +1 -0
- google/adk/examples/example_util.py +3 -2
- google/adk/flows/__init__.py +0 -1
- google/adk/flows/llm_flows/_code_execution.py +19 -11
- google/adk/flows/llm_flows/audio_transcriber.py +4 -3
- google/adk/flows/llm_flows/base_llm_flow.py +86 -22
- google/adk/flows/llm_flows/basic.py +3 -0
- google/adk/flows/llm_flows/functions.py +10 -9
- google/adk/flows/llm_flows/instructions.py +28 -9
- google/adk/flows/llm_flows/single_flow.py +1 -1
- google/adk/memory/__init__.py +1 -1
- google/adk/memory/_utils.py +23 -0
- google/adk/memory/base_memory_service.py +25 -21
- google/adk/memory/base_memory_service.py.orig +76 -0
- google/adk/memory/in_memory_memory_service.py +59 -27
- google/adk/memory/memory_entry.py +37 -0
- google/adk/memory/vertex_ai_rag_memory_service.py +40 -17
- google/adk/models/anthropic_llm.py +36 -11
- google/adk/models/base_llm.py +45 -4
- google/adk/models/gemini_llm_connection.py +15 -2
- google/adk/models/google_llm.py +9 -44
- google/adk/models/google_llm.py.orig +305 -0
- google/adk/models/lite_llm.py +94 -38
- google/adk/models/llm_request.py +1 -1
- google/adk/models/llm_response.py +15 -3
- google/adk/models/registry.py +1 -1
- google/adk/runners.py +68 -44
- google/adk/sessions/__init__.py +1 -1
- google/adk/sessions/_session_util.py +14 -0
- google/adk/sessions/base_session_service.py +8 -32
- google/adk/sessions/database_session_service.py +58 -61
- google/adk/sessions/in_memory_session_service.py +108 -26
- google/adk/sessions/session.py +4 -0
- google/adk/sessions/vertex_ai_session_service.py +23 -45
- google/adk/telemetry.py +3 -0
- google/adk/tools/__init__.py +4 -7
- google/adk/tools/{built_in_code_execution_tool.py → _built_in_code_execution_tool.py} +11 -0
- google/adk/tools/_memory_entry_utils.py +30 -0
- google/adk/tools/agent_tool.py +16 -13
- google/adk/tools/apihub_tool/apihub_toolset.py +55 -74
- google/adk/tools/application_integration_tool/application_integration_toolset.py +107 -85
- google/adk/tools/application_integration_tool/clients/connections_client.py +29 -25
- google/adk/tools/application_integration_tool/clients/integration_client.py +6 -6
- google/adk/tools/application_integration_tool/integration_connector_tool.py +69 -26
- google/adk/tools/base_toolset.py +58 -0
- google/adk/tools/enterprise_search_tool.py +65 -0
- google/adk/tools/function_parameter_parse_util.py +2 -2
- google/adk/tools/google_api_tool/__init__.py +18 -70
- google/adk/tools/google_api_tool/google_api_tool.py +11 -5
- google/adk/tools/google_api_tool/google_api_toolset.py +126 -0
- google/adk/tools/google_api_tool/google_api_toolsets.py +102 -0
- google/adk/tools/google_api_tool/googleapi_to_openapi_converter.py +40 -42
- google/adk/tools/langchain_tool.py +96 -49
- google/adk/tools/load_artifacts_tool.py +4 -4
- google/adk/tools/load_memory_tool.py +16 -5
- google/adk/tools/mcp_tool/__init__.py +3 -2
- google/adk/tools/mcp_tool/conversion_utils.py +1 -1
- google/adk/tools/mcp_tool/mcp_session_manager.py +167 -16
- google/adk/tools/mcp_tool/mcp_session_manager.py.orig +322 -0
- google/adk/tools/mcp_tool/mcp_tool.py +12 -12
- google/adk/tools/mcp_tool/mcp_toolset.py +155 -195
- google/adk/tools/openapi_tool/common/common.py +2 -5
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +32 -7
- google/adk/tools/openapi_tool/openapi_spec_parser/operation_parser.py +43 -33
- google/adk/tools/openapi_tool/openapi_spec_parser/tool_auth_handler.py +1 -1
- google/adk/tools/preload_memory_tool.py +27 -18
- google/adk/tools/retrieval/__init__.py +1 -1
- google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +1 -1
- google/adk/tools/tool_context.py +4 -4
- google/adk/tools/toolbox_toolset.py +79 -0
- google/adk/tools/transfer_to_agent_tool.py +0 -1
- google/adk/version.py +1 -1
- {google_adk-0.4.0.dist-info → google_adk-1.0.0.dist-info}/METADATA +7 -5
- google_adk-1.0.0.dist-info/RECORD +195 -0
- google/adk/agents/remote_agent.py +0 -50
- google/adk/tools/google_api_tool/google_api_tool_set.py +0 -110
- google/adk/tools/google_api_tool/google_api_tool_sets.py +0 -112
- google/adk/tools/toolbox_tool.py +0 -46
- google_adk-0.4.0.dist-info/RECORD +0 -179
- {google_adk-0.4.0.dist-info → google_adk-1.0.0.dist-info}/WHEEL +0 -0
- {google_adk-0.4.0.dist-info → google_adk-1.0.0.dist-info}/entry_points.txt +0 -0
- {google_adk-0.4.0.dist-info → google_adk-1.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,37 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from typing import Optional
|
16
|
+
from pydantic import BaseModel
|
17
|
+
from .eval_case import EvalCase
|
18
|
+
|
19
|
+
|
20
|
+
class EvalSet(BaseModel):
  """A named, identifiable collection of eval cases."""

  eval_set_id: str
  """Identifier that uniquely distinguishes this eval set."""

  name: Optional[str] = None
  """Optional name of the dataset."""

  description: Optional[str] = None
  """Optional description of the dataset."""

  eval_cases: list[EvalCase]
  """Eval cases that make up the dataset; each case represents a single
  interaction to be evaluated."""

  creation_timestamp: float = 0.0
  """Time at which this eval set was created (0.0 when not provided)."""
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from abc import ABC, abstractmethod
|
16
|
+
|
17
|
+
from .eval_case import EvalCase
|
18
|
+
from .eval_set import EvalSet
|
19
|
+
|
20
|
+
|
21
|
+
class EvalSetsManager(ABC):
  """Abstract interface for managing eval sets.

  Every method is abstract; the bodies here only raise `NotImplementedError`,
  so concrete managers must override all of them.
  """

  @abstractmethod
  def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
    """Returns the EvalSet identified by `app_name` and `eval_set_id`."""
    raise NotImplementedError

  @abstractmethod
  def create_eval_set(self, app_name: str, eval_set_id: str):
    """Creates an empty EvalSet for the given `app_name` and `eval_set_id`."""
    raise NotImplementedError

  @abstractmethod
  def list_eval_sets(self, app_name: str) -> list[str]:
    """Returns the eval sets that belong to `app_name`, as a list of strings."""
    raise NotImplementedError

  @abstractmethod
  def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
    """Adds `eval_case` to the existing EvalSet identified by `app_name` and `eval_set_id`."""
    raise NotImplementedError
|
@@ -13,32 +13,46 @@
|
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
15
|
import importlib
|
16
|
+
from typing import Any
|
17
|
+
from typing import Optional
|
16
18
|
import uuid
|
17
19
|
|
18
|
-
from
|
20
|
+
from pydantic import BaseModel
|
19
21
|
|
20
|
-
from ..agents.base_agent import BaseAgent
|
21
22
|
from ..agents.llm_agent import Agent
|
22
|
-
from ..
|
23
|
-
from ..agents.llm_agent import LlmAgent
|
23
|
+
from ..artifacts.base_artifact_service import BaseArtifactService
|
24
24
|
from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
|
25
25
|
from ..runners import Runner
|
26
|
+
from ..sessions.base_session_service import BaseSessionService
|
26
27
|
from ..sessions.in_memory_session_service import InMemorySessionService
|
27
28
|
from ..sessions.session import Session
|
28
|
-
from .
|
29
|
+
from .eval_case import EvalCase
|
30
|
+
from .eval_case import IntermediateData
|
31
|
+
from .eval_case import Invocation
|
32
|
+
from .eval_case import SessionInput
|
33
|
+
from .eval_set import EvalSet
|
34
|
+
|
35
|
+
|
36
|
+
class EvalCaseResponses(BaseModel):
  """Contains multiple responses associated with an EvalCase.

  Multiple responses are a result of repeated requests to generate inferences.
  """

  # The eval case the responses below were generated for.
  eval_case: EvalCase
  # One inner list of Invocations per repeated run of the eval case.
  responses: list[list[Invocation]]
|
29
44
|
|
30
45
|
|
31
46
|
class EvaluationGenerator:
|
32
47
|
"""Generates evaluation responses for agents."""
|
33
48
|
|
34
49
|
@staticmethod
|
35
|
-
def generate_responses(
|
36
|
-
|
37
|
-
agent_module_path,
|
38
|
-
repeat_num=3,
|
39
|
-
agent_name=None,
|
40
|
-
|
41
|
-
):
|
50
|
+
async def generate_responses(
|
51
|
+
eval_set: EvalSet,
|
52
|
+
agent_module_path: str,
|
53
|
+
repeat_num: int = 3,
|
54
|
+
agent_name: str = None,
|
55
|
+
) -> list[EvalCaseResponses]:
|
42
56
|
"""Returns evaluation responses for the given dataset and agent.
|
43
57
|
|
44
58
|
Args:
|
@@ -48,17 +62,23 @@ class EvaluationGenerator:
|
|
48
62
|
usually done to remove uncertainty that a single run may bring.
|
49
63
|
agent_name: The name of the agent that should be evaluated. This is
|
50
64
|
usually the sub-agent.
|
51
|
-
initial_session: Initial session for the eval data.
|
52
65
|
"""
|
53
66
|
results = []
|
54
67
|
|
55
|
-
for
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
68
|
+
for eval_case in eval_set.eval_cases:
|
69
|
+
responses = []
|
70
|
+
for _ in range(repeat_num):
|
71
|
+
response_invocations = await EvaluationGenerator._process_query(
|
72
|
+
eval_case.conversation,
|
73
|
+
agent_module_path,
|
74
|
+
agent_name,
|
75
|
+
eval_case.session_input,
|
61
76
|
)
|
77
|
+
responses.append(response_invocations)
|
78
|
+
|
79
|
+
results.append(
|
80
|
+
EvalCaseResponses(eval_case=eval_case, responses=responses)
|
81
|
+
)
|
62
82
|
|
63
83
|
return results
|
64
84
|
|
@@ -89,7 +109,12 @@ class EvaluationGenerator:
|
|
89
109
|
return results
|
90
110
|
|
91
111
|
@staticmethod
|
92
|
-
def _process_query(
|
112
|
+
async def _process_query(
|
113
|
+
invocations: list[Invocation],
|
114
|
+
module_name: str,
|
115
|
+
agent_name: Optional[str] = None,
|
116
|
+
initial_session: Optional[SessionInput] = None,
|
117
|
+
) -> list[Invocation]:
|
93
118
|
"""Process a query using the agent and evaluation dataset."""
|
94
119
|
module_path = f"{module_name}"
|
95
120
|
agent_module = importlib.import_module(module_path)
|
@@ -102,56 +127,40 @@ class EvaluationGenerator:
|
|
102
127
|
agent_to_evaluate = root_agent.find_agent(agent_name)
|
103
128
|
assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."
|
104
129
|
|
105
|
-
return EvaluationGenerator.
|
106
|
-
|
130
|
+
return await EvaluationGenerator._generate_inferences_from_root_agent(
|
131
|
+
invocations, agent_to_evaluate, reset_func, initial_session
|
107
132
|
)
|
108
133
|
|
109
134
|
@staticmethod
|
110
|
-
def
|
111
|
-
|
112
|
-
root_agent,
|
113
|
-
reset_func,
|
114
|
-
initial_session=
|
115
|
-
session_id=None,
|
116
|
-
session_service=None,
|
117
|
-
artifact_service=None,
|
118
|
-
):
|
119
|
-
"""
|
120
|
-
|
121
|
-
# we don't know which tools belong to which agent
|
122
|
-
# so we just apply to any agents that has certain tool outputs
|
123
|
-
all_mock_tools = set()
|
124
|
-
for eval_entry in data:
|
125
|
-
expected_tool_use = eval_entry.get(EvalConstants.EXPECTED_TOOL_USE, [])
|
126
|
-
for expected in expected_tool_use:
|
127
|
-
if EvalConstants.MOCK_TOOL_OUTPUT in expected:
|
128
|
-
all_mock_tools.add(expected[EvalConstants.TOOL_NAME])
|
129
|
-
|
130
|
-
eval_data_copy = data.copy()
|
131
|
-
EvaluationGenerator.apply_before_tool_callback(
|
132
|
-
root_agent,
|
133
|
-
lambda *args: EvaluationGenerator.before_tool_callback(
|
134
|
-
*args, eval_dataset=eval_data_copy
|
135
|
-
),
|
136
|
-
all_mock_tools,
|
137
|
-
)
|
138
|
-
|
135
|
+
async def _generate_inferences_from_root_agent(
|
136
|
+
invocations: list[Invocation],
|
137
|
+
root_agent: Agent,
|
138
|
+
reset_func: Any,
|
139
|
+
initial_session: Optional[SessionInput] = None,
|
140
|
+
session_id: Optional[str] = None,
|
141
|
+
session_service: Optional[BaseSessionService] = None,
|
142
|
+
artifact_service: Optional[BaseArtifactService] = None,
|
143
|
+
) -> list[Invocation]:
|
144
|
+
"""Scrapes the root agent given the list of Invocations."""
|
139
145
|
if not session_service:
|
140
146
|
session_service = InMemorySessionService()
|
141
147
|
|
142
|
-
app_name =
|
143
|
-
|
148
|
+
app_name = (
|
149
|
+
initial_session.app_name if initial_session else "EvaluationGenerator"
|
150
|
+
)
|
151
|
+
user_id = initial_session.user_id if initial_session else "test_user_id"
|
144
152
|
session_id = session_id if session_id else str(uuid.uuid4())
|
145
153
|
|
146
|
-
_ = session_service.create_session(
|
154
|
+
_ = await session_service.create_session(
|
147
155
|
app_name=app_name,
|
148
156
|
user_id=user_id,
|
149
|
-
state=initial_session.
|
157
|
+
state=initial_session.state if initial_session else {},
|
150
158
|
session_id=session_id,
|
151
159
|
)
|
152
160
|
|
153
161
|
if not artifact_service:
|
154
162
|
artifact_service = InMemoryArtifactService()
|
163
|
+
|
155
164
|
runner = Runner(
|
156
165
|
app_name=app_name,
|
157
166
|
agent=root_agent,
|
@@ -163,30 +172,37 @@ class EvaluationGenerator:
|
|
163
172
|
if callable(reset_func):
|
164
173
|
reset_func()
|
165
174
|
|
166
|
-
|
175
|
+
response_invocations = []
|
167
176
|
|
168
|
-
for
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
177
|
+
for invocation in invocations:
|
178
|
+
final_response = None
|
179
|
+
user_content = invocation.user_content
|
180
|
+
tool_uses = []
|
181
|
+
invocation_id = ""
|
173
182
|
|
174
183
|
for event in runner.run(
|
175
|
-
user_id=user_id, session_id=session_id, new_message=
|
184
|
+
user_id=user_id, session_id=session_id, new_message=user_content
|
176
185
|
):
|
186
|
+
invocation_id = (
|
187
|
+
event.invocation_id if not invocation_id else invocation_id
|
188
|
+
)
|
189
|
+
|
177
190
|
if event.is_final_response() and event.content and event.content.parts:
|
178
|
-
|
191
|
+
final_response = event.content
|
179
192
|
elif event.get_function_calls():
|
180
193
|
for call in event.get_function_calls():
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
194
|
+
tool_uses.append(call)
|
195
|
+
|
196
|
+
response_invocations.append(
|
197
|
+
Invocation(
|
198
|
+
invocation_id=invocation_id,
|
199
|
+
user_content=user_content,
|
200
|
+
final_response=final_response,
|
201
|
+
intermediate_data=IntermediateData(tool_uses=tool_uses),
|
202
|
+
)
|
203
|
+
)
|
185
204
|
|
186
|
-
|
187
|
-
responses[index]["response"] = response
|
188
|
-
|
189
|
-
return responses
|
205
|
+
return response_invocations
|
190
206
|
|
191
207
|
@staticmethod
|
192
208
|
def _process_query_with_session(session_data, data):
|
@@ -225,46 +241,5 @@ class EvaluationGenerator:
|
|
225
241
|
responses[index]["actual_tool_use"] = actual_tool_uses
|
226
242
|
responses[index]["response"] = response
|
227
243
|
return responses
|
228
|
-
|
229
|
-
|
230
|
-
def before_tool_callback(tool, args, tool_context, eval_dataset):
|
231
|
-
"""Intercept specific tool calls and return predefined outputs
|
232
|
-
|
233
|
-
from eval_dataset.
|
234
|
-
"""
|
235
|
-
for index, eval_entry in enumerate(eval_dataset):
|
236
|
-
expected_tool_use = eval_entry.get("expected_tool_use", [])
|
237
|
-
for expected in expected_tool_use:
|
238
|
-
if (
|
239
|
-
EvalConstants.MOCK_TOOL_OUTPUT in expected
|
240
|
-
and tool.name == expected[EvalConstants.TOOL_NAME]
|
241
|
-
and args == expected.get(EvalConstants.TOOL_INPUT, {})
|
242
|
-
):
|
243
|
-
# pop the matched entry so we don't rematch again
|
244
|
-
eval_dataset.pop(index)
|
245
|
-
return {"result": expected[EvalConstants.MOCK_TOOL_OUTPUT]}
|
246
|
-
|
247
|
-
return None
|
248
|
-
|
249
|
-
@staticmethod
|
250
|
-
def apply_before_tool_callback(
|
251
|
-
agent: BaseAgent,
|
252
|
-
callback: BeforeToolCallback,
|
253
|
-
all_mock_tools: set[str],
|
254
|
-
):
|
255
|
-
"""Recursively apply the before_tool_callback to the root agent and all its subagents."""
|
256
|
-
# Check if the agent has tools that are defined by evalset.
|
257
|
-
# We use function names to check if tools match
|
258
|
-
if not isinstance(agent, Agent) and not isinstance(agent, LlmAgent):
|
259
|
-
return
|
260
|
-
|
261
|
-
for tool in agent.canonical_tools:
|
262
|
-
tool_name = tool.name
|
263
|
-
if tool_name in all_mock_tools:
|
264
|
-
agent.before_tool_callback = callback
|
265
|
-
|
266
|
-
# Apply recursively to subagents if they exist
|
267
|
-
for sub_agent in agent.sub_agents:
|
268
|
-
EvaluationGenerator.apply_before_tool_callback(
|
269
|
-
sub_agent, callback, all_mock_tools
|
270
|
-
)
|
244
|
+
return responses
|
245
|
+
return responses
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from abc import ABC
|
16
|
+
from enum import Enum
|
17
|
+
from typing import Optional
|
18
|
+
from pydantic import BaseModel
|
19
|
+
from .eval_case import Invocation
|
20
|
+
|
21
|
+
|
22
|
+
class EvalStatus(Enum):
  """Status of an evaluation: passed, failed, or not evaluated."""

  PASSED = 1
  FAILED = 2
  NOT_EVALUATED = 3
|
26
|
+
|
27
|
+
|
28
|
+
class PerInvocationResult(BaseModel):
  """Metric evaluation result for a single actual/expected invocation pair."""

  # The two invocations this result refers to.
  actual_invocation: Invocation
  expected_invocation: Invocation

  # Both default to "nothing evaluated yet": no score, NOT_EVALUATED status.
  score: Optional[float] = None
  eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
|
35
|
+
|
36
|
+
|
37
|
+
class EvaluationResult(BaseModel):
  """Aggregate outcome of an evaluation plus its per-invocation results."""

  overall_score: Optional[float] = None
  """Overall score, based on each invocation; None when not set."""

  overall_eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
  """Overall status, based on each invocation."""

  # Defaults to an empty list when no per-invocation results are supplied.
  per_invocation_results: list[PerInvocationResult] = []
|
45
|
+
|
46
|
+
|
47
|
+
class Evaluator(ABC):
  """A metrics evaluator interface."""

  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: list[Invocation],
  ) -> EvaluationResult:
    """Returns an EvaluationResult computed from actual and expected invocations.

    The base implementation always raises `NotImplementedError`; subclasses
    provide the actual metric.
    """
    raise NotImplementedError
|
@@ -0,0 +1,264 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
import json
|
16
|
+
import logging
|
17
|
+
import os
|
18
|
+
import re
|
19
|
+
import time
|
20
|
+
from typing import Any
|
21
|
+
import uuid
|
22
|
+
|
23
|
+
from google.genai import types as genai_types
|
24
|
+
from pydantic import ValidationError
|
25
|
+
from typing_extensions import override
|
26
|
+
|
27
|
+
from .eval_case import EvalCase
|
28
|
+
from .eval_case import IntermediateData
|
29
|
+
from .eval_case import Invocation
|
30
|
+
from .eval_case import SessionInput
|
31
|
+
from .eval_set import EvalSet
|
32
|
+
from .eval_sets_manager import EvalSetsManager
|
33
|
+
|
34
|
+
# Module-level logger, named under the "google_adk." prefix.
logger = logging.getLogger("google_adk." + __name__)

# File-name suffix used to build and recognise eval set files in this module.
_EVAL_SET_FILE_EXTENSION = ".evalset.json"
|
37
|
+
|
38
|
+
|
39
|
+
def _convert_invocation_to_pydantic_schema(
    invocation_in_json_format: dict[str, Any],
) -> Invocation:
  """Converts one invocation from the old JSON format to an `Invocation`.

  Args:
    invocation_in_json_format: Invocation dict in the old format. `query` and
      `reference` are required. `expected_tool_use` and
      `expected_intermediate_agent_responses` are optional lists. Every tool
      use needs a `tool_name`; a missing `tool_input` is treated as `{}`.

  Returns:
    An `Invocation` with a newly generated id, the query as user content, the
    reference as the final model response, and the expected tool uses and
    intermediate responses as its intermediate data.

  Raises:
    KeyError: If a required key is missing.
  """
  query = invocation_in_json_format["query"]
  reference = invocation_in_json_format["reference"]

  # `tool_input` may be omitted in old eval data; treat that as "no
  # arguments" instead of failing the whole conversion with a KeyError.
  expected_tool_use = [
      genai_types.FunctionCall(
          name=old_tool_use["tool_name"],
          args=old_tool_use.get("tool_input", {}),
      )
      for old_tool_use in invocation_in_json_format.get("expected_tool_use", [])
  ]

  # Each intermediate response becomes an (author, [text part]) tuple.
  expected_intermediate_agent_responses = [
      (
          old_intermediate_response["author"],
          [genai_types.Part.from_text(text=old_intermediate_response["text"])],
      )
      for old_intermediate_response in invocation_in_json_format.get(
          "expected_intermediate_agent_responses", []
      )
  ]

  return Invocation(
      invocation_id=str(uuid.uuid4()),
      user_content=genai_types.Content(
          parts=[genai_types.Part.from_text(text=query)], role="user"
      ),
      final_response=genai_types.Content(
          parts=[genai_types.Part.from_text(text=reference)], role="model"
      ),
      intermediate_data=IntermediateData(
          tool_uses=expected_tool_use,
          intermediate_responses=expected_intermediate_agent_responses,
      ),
      creation_timestamp=time.time(),
  )
|
77
|
+
|
78
|
+
|
79
|
+
def convert_eval_set_to_pydanctic_schema(
    eval_set_id: str,
    eval_set_in_json_format: list[dict[str, Any]],
) -> EvalSet:
  r"""Returns a pydantic EvalSet generated from the old JSON representation.

  Args:
    eval_set_id: Eval set id. Also used as the name of the returned EvalSet.
    eval_set_in_json_format: Eval set specified in the old JSON format: a list
      of eval cases, each with a `name`, a `data` list of invocations and an
      optional `initial_session`.

  Returns:
    An EvalSet with one EvalCase per entry of `eval_set_in_json_format`.

  Raises:
    KeyError: If an eval case lacks `name` or `data`.

  Here is a sample eval set in JSON format:
  [
    {
      "name": "roll_17_sided_dice_twice",
      "data": [
        {
          "query": "What can you do?",
          "expected_tool_use": [],
          "expected_intermediate_agent_responses": [],
          "reference": "I can roll dice of different sizes and check if a number
            is prime. I can also use multiple tools in parallel.\n"
        },
        {
          "query": "Roll a 17 sided dice twice for me",
          "expected_tool_use": [
            {
              "tool_name": "roll_die",
              "tool_input": {
                "sides": 17
              }
            },
            {
              "tool_name": "roll_die",
              "tool_input": {
                "sides": 17
              }
            }
          ],
          "expected_intermediate_agent_responses": [],
          "reference": "I have rolled a 17 sided die twice. The first roll was
            13 and the second roll was 4.\n"
        }
      ],
      "initial_session": {
        "state": {},
        "app_name": "hello_world",
        "user_id": "user"
      }
    }
  ]
  """
  eval_cases = []
  for old_eval_case in eval_set_in_json_format:
    new_invocations = [
        _convert_invocation_to_pydantic_schema(old_invocation)
        for old_invocation in old_eval_case["data"]
    ]

    # A missing, null or empty `initial_session` all mean "no session input".
    # The truthiness check also avoids calling len() on a null value.
    old_session = old_eval_case.get("initial_session")
    session_input = None
    if old_session:
      session_input = SessionInput(
          app_name=old_session.get("app_name", ""),
          user_id=old_session.get("user_id", ""),
          state=old_session.get("state", {}),
      )

    eval_cases.append(
        EvalCase(
            eval_id=old_eval_case["name"],
            conversation=new_invocations,
            session_input=session_input,
            creation_timestamp=time.time(),
        )
    )

  return EvalSet(
      eval_set_id=eval_set_id,
      name=eval_set_id,
      creation_timestamp=time.time(),
      eval_cases=eval_cases,
  )
|
164
|
+
|
165
|
+
|
166
|
+
def load_eval_set_from_file(
    eval_set_file_path: str, eval_set_id: str
) -> EvalSet:
  """Reads the given file and returns its contents as an EvalSet.

  The file is first parsed as a serialized EvalSet. If that validation fails,
  the content is assumed to be in the old JSON format and is converted.

  Args:
    eval_set_file_path: Path of the UTF-8 encoded eval set file.
    eval_set_id: Id given to the eval set when the old format is converted.

  Returns:
    The EvalSet stored in the file.
  """
  with open(eval_set_file_path, "r", encoding="utf-8") as eval_set_file:
    raw_json = eval_set_file.read()

  try:
    eval_set = EvalSet.model_validate_json(raw_json)
  except ValidationError:
    # Not a valid serialized EvalSet: assume the old format and convert it.
    eval_set = convert_eval_set_to_pydanctic_schema(
        eval_set_id, json.loads(raw_json)
    )
  return eval_set
|
180
|
+
|
181
|
+
|
182
|
+
class LocalEvalSetsManager(EvalSetsManager):
  """An EvalSets manager that stores eval sets locally on disk.

  Each eval set is kept in its own JSON file at
  `<agent_dir>/<app_name>/<eval_set_id>.evalset.json`.
  """

  def __init__(self, agent_dir: str):
    """Initializes the manager.

    Args:
      agent_dir: Directory that contains one sub-directory per app.
    """
    self._agent_dir = agent_dir

  @override
  def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
    """Returns an EvalSet identified by an app_name and eval_set_id."""
    eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
    return load_eval_set_from_file(eval_set_file_path, eval_set_id)

  @override
  def create_eval_set(self, app_name: str, eval_set_id: str):
    """Creates an empty EvalSet given the app_name and eval_set_id.

    Nothing is written when the eval set file already exists.

    Raises:
      ValueError: If `eval_set_id` contains characters other than letters,
        digits and underscores.
    """
    self._validate_id(id_name="Eval Set Id", id_value=eval_set_id)

    new_eval_set_path = self._get_eval_set_file_path(app_name, eval_set_id)

    logger.info("Creating eval set file `%s`", new_eval_set_path)

    if not os.path.exists(new_eval_set_path):
      logger.info("Eval set file doesn't exist, we will create a new one.")
      new_eval_set = EvalSet(
          eval_set_id=eval_set_id,
          name=eval_set_id,
          eval_cases=[],
          creation_timestamp=time.time(),
      )
      self._write_eval_set(new_eval_set_path, new_eval_set)

  @override
  def list_eval_sets(self, app_name: str) -> list[str]:
    """Returns the sorted ids of the eval sets that belong to the given app_name."""
    app_dir = os.path.join(self._agent_dir, app_name)
    # os.listdir yields bare file names, so only the suffix has to be removed.
    return sorted(
        file_name.removesuffix(_EVAL_SET_FILE_EXTENSION)
        for file_name in os.listdir(app_dir)
        if file_name.endswith(_EVAL_SET_FILE_EXTENSION)
    )

  @override
  def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
    """Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id.

    Raises:
      ValueError: If the eval case id is malformed or already present in the
        eval set.
    """
    eval_case_id = eval_case.eval_id
    self._validate_id(id_name="Eval Case Id", id_value=eval_case_id)

    eval_set = self.get_eval_set(app_name, eval_set_id)

    if any(x.eval_id == eval_case_id for x in eval_set.eval_cases):
      raise ValueError(
          f"Eval id `{eval_case_id}` already exists in `{eval_set_id}`"
          " eval set.",
      )

    eval_set.eval_cases.append(eval_case)

    eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
    self._write_eval_set(eval_set_file_path, eval_set)

  def _get_eval_set_file_path(self, app_name: str, eval_set_id: str) -> str:
    """Returns the path of the file that stores the given eval set."""
    return os.path.join(
        self._agent_dir,
        app_name,
        eval_set_id + _EVAL_SET_FILE_EXTENSION,
    )

  def _validate_id(self, id_name: str, id_value: str):
    """Raises ValueError unless `id_value` is letters, digits and underscores only."""
    pattern = r"^[a-zA-Z0-9_]+$"
    if not re.fullmatch(pattern, id_value):
      raise ValueError(
          f"Invalid {id_name}. {id_name} should have the `{pattern}` format",
      )

  def _write_eval_set(self, eval_set_path: str, eval_set: EvalSet):
    """Serializes `eval_set` as indented JSON and writes it to `eval_set_path`."""
    # Write UTF-8 explicitly: load_eval_set_from_file reads these files as
    # UTF-8, while the platform default encoding may differ.
    with open(eval_set_path, "w", encoding="utf-8") as f:
      f.write(eval_set.model_dump_json(indent=2))
|