google-adk 0.5.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google/adk/agents/base_agent.py +76 -30
- google/adk/agents/callback_context.py +2 -6
- google/adk/agents/llm_agent.py +122 -30
- google/adk/agents/loop_agent.py +1 -1
- google/adk/agents/parallel_agent.py +7 -0
- google/adk/agents/readonly_context.py +8 -0
- google/adk/agents/run_config.py +1 -1
- google/adk/agents/sequential_agent.py +31 -0
- google/adk/agents/transcription_entry.py +4 -2
- google/adk/artifacts/gcs_artifact_service.py +1 -1
- google/adk/artifacts/in_memory_artifact_service.py +1 -1
- google/adk/auth/auth_credential.py +10 -2
- google/adk/auth/auth_preprocessor.py +7 -1
- google/adk/auth/auth_tool.py +3 -4
- google/adk/cli/agent_graph.py +5 -5
- google/adk/cli/browser/index.html +4 -4
- google/adk/cli/browser/{main-ULN5R5I5.js → main-PKDNKWJE.js} +59 -60
- google/adk/cli/browser/polyfills-B6TNHZQ6.js +17 -0
- google/adk/cli/cli.py +10 -9
- google/adk/cli/cli_deploy.py +7 -2
- google/adk/cli/cli_eval.py +109 -115
- google/adk/cli/cli_tools_click.py +179 -67
- google/adk/cli/fast_api.py +248 -197
- google/adk/cli/utils/agent_loader.py +137 -0
- google/adk/cli/utils/cleanup.py +40 -0
- google/adk/cli/utils/common.py +23 -0
- google/adk/cli/utils/evals.py +83 -0
- google/adk/cli/utils/logs.py +8 -5
- google/adk/code_executors/__init__.py +3 -1
- google/adk/code_executors/built_in_code_executor.py +52 -0
- google/adk/code_executors/code_execution_utils.py +2 -1
- google/adk/code_executors/container_code_executor.py +0 -1
- google/adk/code_executors/vertex_ai_code_executor.py +6 -8
- google/adk/evaluation/__init__.py +1 -1
- google/adk/evaluation/agent_evaluator.py +168 -128
- google/adk/evaluation/eval_case.py +104 -0
- google/adk/evaluation/eval_metrics.py +74 -0
- google/adk/evaluation/eval_result.py +86 -0
- google/adk/evaluation/eval_set.py +39 -0
- google/adk/evaluation/eval_set_results_manager.py +47 -0
- google/adk/evaluation/eval_sets_manager.py +43 -0
- google/adk/evaluation/evaluation_generator.py +88 -113
- google/adk/evaluation/evaluator.py +58 -0
- google/adk/evaluation/local_eval_set_results_manager.py +113 -0
- google/adk/evaluation/local_eval_sets_manager.py +264 -0
- google/adk/evaluation/response_evaluator.py +106 -1
- google/adk/evaluation/trajectory_evaluator.py +84 -2
- google/adk/events/event.py +6 -1
- google/adk/events/event_actions.py +6 -1
- google/adk/examples/base_example_provider.py +1 -0
- google/adk/examples/example_util.py +3 -2
- google/adk/flows/llm_flows/_code_execution.py +9 -1
- google/adk/flows/llm_flows/audio_transcriber.py +4 -3
- google/adk/flows/llm_flows/base_llm_flow.py +58 -21
- google/adk/flows/llm_flows/contents.py +3 -1
- google/adk/flows/llm_flows/functions.py +9 -8
- google/adk/flows/llm_flows/instructions.py +18 -80
- google/adk/flows/llm_flows/single_flow.py +2 -2
- google/adk/memory/__init__.py +1 -1
- google/adk/memory/_utils.py +23 -0
- google/adk/memory/base_memory_service.py +23 -21
- google/adk/memory/in_memory_memory_service.py +57 -25
- google/adk/memory/memory_entry.py +37 -0
- google/adk/memory/vertex_ai_rag_memory_service.py +38 -15
- google/adk/models/anthropic_llm.py +16 -9
- google/adk/models/base_llm.py +2 -1
- google/adk/models/base_llm_connection.py +2 -0
- google/adk/models/gemini_llm_connection.py +11 -11
- google/adk/models/google_llm.py +12 -2
- google/adk/models/lite_llm.py +80 -23
- google/adk/models/llm_response.py +16 -3
- google/adk/models/registry.py +1 -1
- google/adk/runners.py +98 -42
- google/adk/sessions/__init__.py +1 -1
- google/adk/sessions/_session_util.py +2 -1
- google/adk/sessions/base_session_service.py +6 -33
- google/adk/sessions/database_session_service.py +57 -67
- google/adk/sessions/in_memory_session_service.py +106 -24
- google/adk/sessions/session.py +3 -0
- google/adk/sessions/vertex_ai_session_service.py +44 -51
- google/adk/telemetry.py +7 -2
- google/adk/tools/__init__.py +4 -7
- google/adk/tools/_memory_entry_utils.py +30 -0
- google/adk/tools/agent_tool.py +10 -10
- google/adk/tools/apihub_tool/apihub_toolset.py +55 -74
- google/adk/tools/apihub_tool/clients/apihub_client.py +10 -3
- google/adk/tools/apihub_tool/clients/secret_client.py +1 -0
- google/adk/tools/application_integration_tool/application_integration_toolset.py +111 -85
- google/adk/tools/application_integration_tool/clients/connections_client.py +28 -1
- google/adk/tools/application_integration_tool/clients/integration_client.py +7 -5
- google/adk/tools/application_integration_tool/integration_connector_tool.py +69 -26
- google/adk/tools/base_toolset.py +96 -0
- google/adk/tools/bigquery/__init__.py +28 -0
- google/adk/tools/bigquery/bigquery_credentials.py +216 -0
- google/adk/tools/bigquery/bigquery_tool.py +116 -0
- google/adk/tools/{built_in_code_execution_tool.py → enterprise_search_tool.py} +17 -11
- google/adk/tools/function_parameter_parse_util.py +9 -2
- google/adk/tools/function_tool.py +33 -3
- google/adk/tools/get_user_choice_tool.py +1 -0
- google/adk/tools/google_api_tool/__init__.py +24 -70
- google/adk/tools/google_api_tool/google_api_tool.py +12 -6
- google/adk/tools/google_api_tool/{google_api_tool_set.py → google_api_toolset.py} +57 -55
- google/adk/tools/google_api_tool/google_api_toolsets.py +108 -0
- google/adk/tools/google_api_tool/googleapi_to_openapi_converter.py +40 -42
- google/adk/tools/google_search_tool.py +2 -2
- google/adk/tools/langchain_tool.py +96 -49
- google/adk/tools/load_memory_tool.py +14 -5
- google/adk/tools/mcp_tool/__init__.py +3 -2
- google/adk/tools/mcp_tool/conversion_utils.py +6 -2
- google/adk/tools/mcp_tool/mcp_session_manager.py +80 -69
- google/adk/tools/mcp_tool/mcp_tool.py +35 -32
- google/adk/tools/mcp_tool/mcp_toolset.py +99 -194
- google/adk/tools/openapi_tool/auth/credential_exchangers/base_credential_exchanger.py +1 -3
- google/adk/tools/openapi_tool/auth/credential_exchangers/service_account_exchanger.py +6 -7
- google/adk/tools/openapi_tool/common/common.py +5 -1
- google/adk/tools/openapi_tool/openapi_spec_parser/__init__.py +7 -2
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +27 -7
- google/adk/tools/openapi_tool/openapi_spec_parser/operation_parser.py +36 -32
- google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +11 -1
- google/adk/tools/openapi_tool/openapi_spec_parser/tool_auth_handler.py +1 -1
- google/adk/tools/preload_memory_tool.py +27 -18
- google/adk/tools/retrieval/__init__.py +1 -1
- google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +1 -1
- google/adk/tools/toolbox_toolset.py +107 -0
- google/adk/tools/transfer_to_agent_tool.py +0 -1
- google/adk/utils/__init__.py +13 -0
- google/adk/utils/instructions_utils.py +131 -0
- google/adk/version.py +1 -1
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/METADATA +18 -19
- google_adk-1.1.0.dist-info/RECORD +200 -0
- google/adk/agents/remote_agent.py +0 -50
- google/adk/cli/browser/polyfills-FFHMD2TL.js +0 -18
- google/adk/cli/fast_api.py.orig +0 -728
- google/adk/tools/google_api_tool/google_api_tool_sets.py +0 -112
- google/adk/tools/toolbox_tool.py +0 -46
- google_adk-0.5.0.dist-info/RECORD +0 -180
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/WHEEL +0 -0
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/entry_points.txt +0 -0
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,43 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from abc import ABC
|
16
|
+
from abc import abstractmethod
|
17
|
+
|
18
|
+
from .eval_case import EvalCase
|
19
|
+
from .eval_set import EvalSet
|
20
|
+
|
21
|
+
|
22
|
+
class EvalSetsManager(ABC):
|
23
|
+
"""An interface to manage an Eval Sets."""
|
24
|
+
|
25
|
+
@abstractmethod
|
26
|
+
def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
|
27
|
+
"""Returns an EvalSet identified by an app_name and eval_set_id."""
|
28
|
+
raise NotImplementedError()
|
29
|
+
|
30
|
+
@abstractmethod
|
31
|
+
def create_eval_set(self, app_name: str, eval_set_id: str):
|
32
|
+
"""Creates an empty EvalSet given the app_name and eval_set_id."""
|
33
|
+
raise NotImplementedError()
|
34
|
+
|
35
|
+
@abstractmethod
|
36
|
+
def list_eval_sets(self, app_name: str) -> list[str]:
|
37
|
+
"""Returns a list of EvalSets that belong to the given app_name."""
|
38
|
+
raise NotImplementedError()
|
39
|
+
|
40
|
+
@abstractmethod
|
41
|
+
def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
|
42
|
+
"""Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id."""
|
43
|
+
raise NotImplementedError()
|
@@ -13,19 +13,34 @@
|
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
15
|
import importlib
|
16
|
+
from typing import Any
|
17
|
+
from typing import Optional
|
16
18
|
import uuid
|
17
19
|
|
18
|
-
from
|
20
|
+
from pydantic import BaseModel
|
19
21
|
|
20
|
-
from ..agents.base_agent import BaseAgent
|
21
22
|
from ..agents.llm_agent import Agent
|
22
|
-
from ..
|
23
|
-
from ..agents.llm_agent import LlmAgent
|
23
|
+
from ..artifacts.base_artifact_service import BaseArtifactService
|
24
24
|
from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
|
25
25
|
from ..runners import Runner
|
26
|
+
from ..sessions.base_session_service import BaseSessionService
|
26
27
|
from ..sessions.in_memory_session_service import InMemorySessionService
|
27
28
|
from ..sessions.session import Session
|
28
|
-
from .
|
29
|
+
from .eval_case import EvalCase
|
30
|
+
from .eval_case import IntermediateData
|
31
|
+
from .eval_case import Invocation
|
32
|
+
from .eval_case import SessionInput
|
33
|
+
from .eval_set import EvalSet
|
34
|
+
|
35
|
+
|
36
|
+
class EvalCaseResponses(BaseModel):
|
37
|
+
"""Contains multiple responses associated with an EvalCase.
|
38
|
+
|
39
|
+
Multiple responses are a result of repeated requests to genereate inferences.
|
40
|
+
"""
|
41
|
+
|
42
|
+
eval_case: EvalCase
|
43
|
+
responses: list[list[Invocation]]
|
29
44
|
|
30
45
|
|
31
46
|
class EvaluationGenerator:
|
@@ -33,12 +48,11 @@ class EvaluationGenerator:
|
|
33
48
|
|
34
49
|
@staticmethod
|
35
50
|
async def generate_responses(
|
36
|
-
|
37
|
-
agent_module_path,
|
38
|
-
repeat_num=3,
|
39
|
-
agent_name=None,
|
40
|
-
|
41
|
-
):
|
51
|
+
eval_set: EvalSet,
|
52
|
+
agent_module_path: str,
|
53
|
+
repeat_num: int = 3,
|
54
|
+
agent_name: str = None,
|
55
|
+
) -> list[EvalCaseResponses]:
|
42
56
|
"""Returns evaluation responses for the given dataset and agent.
|
43
57
|
|
44
58
|
Args:
|
@@ -48,17 +62,23 @@ class EvaluationGenerator:
|
|
48
62
|
usually done to remove uncertainty that a single run may bring.
|
49
63
|
agent_name: The name of the agent that should be evaluated. This is
|
50
64
|
usually the sub-agent.
|
51
|
-
initial_session: Initial session for the eval data.
|
52
65
|
"""
|
53
66
|
results = []
|
54
67
|
|
55
|
-
for
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
68
|
+
for eval_case in eval_set.eval_cases:
|
69
|
+
responses = []
|
70
|
+
for _ in range(repeat_num):
|
71
|
+
response_invocations = await EvaluationGenerator._process_query(
|
72
|
+
eval_case.conversation,
|
73
|
+
agent_module_path,
|
74
|
+
agent_name,
|
75
|
+
eval_case.session_input,
|
61
76
|
)
|
77
|
+
responses.append(response_invocations)
|
78
|
+
|
79
|
+
results.append(
|
80
|
+
EvalCaseResponses(eval_case=eval_case, responses=responses)
|
81
|
+
)
|
62
82
|
|
63
83
|
return results
|
64
84
|
|
@@ -89,7 +109,12 @@ class EvaluationGenerator:
|
|
89
109
|
return results
|
90
110
|
|
91
111
|
@staticmethod
|
92
|
-
def _process_query(
|
112
|
+
async def _process_query(
|
113
|
+
invocations: list[Invocation],
|
114
|
+
module_name: str,
|
115
|
+
agent_name: Optional[str] = None,
|
116
|
+
initial_session: Optional[SessionInput] = None,
|
117
|
+
) -> list[Invocation]:
|
93
118
|
"""Process a query using the agent and evaluation dataset."""
|
94
119
|
module_path = f"{module_name}"
|
95
120
|
agent_module = importlib.import_module(module_path)
|
@@ -102,56 +127,40 @@ class EvaluationGenerator:
|
|
102
127
|
agent_to_evaluate = root_agent.find_agent(agent_name)
|
103
128
|
assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."
|
104
129
|
|
105
|
-
return EvaluationGenerator.
|
106
|
-
|
130
|
+
return await EvaluationGenerator._generate_inferences_from_root_agent(
|
131
|
+
invocations, agent_to_evaluate, reset_func, initial_session
|
107
132
|
)
|
108
133
|
|
109
134
|
@staticmethod
|
110
|
-
async def
|
111
|
-
|
112
|
-
root_agent,
|
113
|
-
reset_func,
|
114
|
-
initial_session=
|
115
|
-
session_id=None,
|
116
|
-
session_service=None,
|
117
|
-
artifact_service=None,
|
118
|
-
):
|
119
|
-
"""
|
120
|
-
|
121
|
-
# we don't know which tools belong to which agent
|
122
|
-
# so we just apply to any agents that has certain tool outputs
|
123
|
-
all_mock_tools = set()
|
124
|
-
for eval_entry in data:
|
125
|
-
expected_tool_use = eval_entry.get(EvalConstants.EXPECTED_TOOL_USE, [])
|
126
|
-
for expected in expected_tool_use:
|
127
|
-
if EvalConstants.MOCK_TOOL_OUTPUT in expected:
|
128
|
-
all_mock_tools.add(expected[EvalConstants.TOOL_NAME])
|
129
|
-
|
130
|
-
eval_data_copy = data.copy()
|
131
|
-
await EvaluationGenerator.apply_before_tool_callback(
|
132
|
-
root_agent,
|
133
|
-
lambda *args: EvaluationGenerator.before_tool_callback(
|
134
|
-
*args, eval_dataset=eval_data_copy
|
135
|
-
),
|
136
|
-
all_mock_tools,
|
137
|
-
)
|
138
|
-
|
135
|
+
async def _generate_inferences_from_root_agent(
|
136
|
+
invocations: list[Invocation],
|
137
|
+
root_agent: Agent,
|
138
|
+
reset_func: Any,
|
139
|
+
initial_session: Optional[SessionInput] = None,
|
140
|
+
session_id: Optional[str] = None,
|
141
|
+
session_service: Optional[BaseSessionService] = None,
|
142
|
+
artifact_service: Optional[BaseArtifactService] = None,
|
143
|
+
) -> list[Invocation]:
|
144
|
+
"""Scrapes the root agent given the list of Invocations."""
|
139
145
|
if not session_service:
|
140
146
|
session_service = InMemorySessionService()
|
141
147
|
|
142
|
-
app_name =
|
143
|
-
|
148
|
+
app_name = (
|
149
|
+
initial_session.app_name if initial_session else "EvaluationGenerator"
|
150
|
+
)
|
151
|
+
user_id = initial_session.user_id if initial_session else "test_user_id"
|
144
152
|
session_id = session_id if session_id else str(uuid.uuid4())
|
145
153
|
|
146
|
-
_ = session_service.create_session(
|
154
|
+
_ = await session_service.create_session(
|
147
155
|
app_name=app_name,
|
148
156
|
user_id=user_id,
|
149
|
-
state=initial_session.
|
157
|
+
state=initial_session.state if initial_session else {},
|
150
158
|
session_id=session_id,
|
151
159
|
)
|
152
160
|
|
153
161
|
if not artifact_service:
|
154
162
|
artifact_service = InMemoryArtifactService()
|
163
|
+
|
155
164
|
runner = Runner(
|
156
165
|
app_name=app_name,
|
157
166
|
agent=root_agent,
|
@@ -163,30 +172,37 @@ class EvaluationGenerator:
|
|
163
172
|
if callable(reset_func):
|
164
173
|
reset_func()
|
165
174
|
|
166
|
-
|
175
|
+
response_invocations = []
|
167
176
|
|
168
|
-
for
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
177
|
+
for invocation in invocations:
|
178
|
+
final_response = None
|
179
|
+
user_content = invocation.user_content
|
180
|
+
tool_uses = []
|
181
|
+
invocation_id = ""
|
173
182
|
|
174
183
|
for event in runner.run(
|
175
|
-
user_id=user_id, session_id=session_id, new_message=
|
184
|
+
user_id=user_id, session_id=session_id, new_message=user_content
|
176
185
|
):
|
186
|
+
invocation_id = (
|
187
|
+
event.invocation_id if not invocation_id else invocation_id
|
188
|
+
)
|
189
|
+
|
177
190
|
if event.is_final_response() and event.content and event.content.parts:
|
178
|
-
|
191
|
+
final_response = event.content
|
179
192
|
elif event.get_function_calls():
|
180
193
|
for call in event.get_function_calls():
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
194
|
+
tool_uses.append(call)
|
195
|
+
|
196
|
+
response_invocations.append(
|
197
|
+
Invocation(
|
198
|
+
invocation_id=invocation_id,
|
199
|
+
user_content=user_content,
|
200
|
+
final_response=final_response,
|
201
|
+
intermediate_data=IntermediateData(tool_uses=tool_uses),
|
202
|
+
)
|
203
|
+
)
|
185
204
|
|
186
|
-
|
187
|
-
responses[index]["response"] = response
|
188
|
-
|
189
|
-
return responses
|
205
|
+
return response_invocations
|
190
206
|
|
191
207
|
@staticmethod
|
192
208
|
def _process_query_with_session(session_data, data):
|
@@ -225,46 +241,5 @@ class EvaluationGenerator:
|
|
225
241
|
responses[index]["actual_tool_use"] = actual_tool_uses
|
226
242
|
responses[index]["response"] = response
|
227
243
|
return responses
|
228
|
-
|
229
|
-
|
230
|
-
def before_tool_callback(tool, args, tool_context, eval_dataset):
|
231
|
-
"""Intercept specific tool calls and return predefined outputs
|
232
|
-
|
233
|
-
from eval_dataset.
|
234
|
-
"""
|
235
|
-
for index, eval_entry in enumerate(eval_dataset):
|
236
|
-
expected_tool_use = eval_entry.get("expected_tool_use", [])
|
237
|
-
for expected in expected_tool_use:
|
238
|
-
if (
|
239
|
-
EvalConstants.MOCK_TOOL_OUTPUT in expected
|
240
|
-
and tool.name == expected[EvalConstants.TOOL_NAME]
|
241
|
-
and args == expected.get(EvalConstants.TOOL_INPUT, {})
|
242
|
-
):
|
243
|
-
# pop the matched entry so we don't rematch again
|
244
|
-
eval_dataset.pop(index)
|
245
|
-
return {"result": expected[EvalConstants.MOCK_TOOL_OUTPUT]}
|
246
|
-
|
247
|
-
return None
|
248
|
-
|
249
|
-
@staticmethod
|
250
|
-
async def apply_before_tool_callback(
|
251
|
-
agent: BaseAgent,
|
252
|
-
callback: BeforeToolCallback,
|
253
|
-
all_mock_tools: set[str],
|
254
|
-
):
|
255
|
-
"""Recursively apply the before_tool_callback to the root agent and all its subagents."""
|
256
|
-
# Check if the agent has tools that are defined by evalset.
|
257
|
-
# We use function names to check if tools match
|
258
|
-
if not isinstance(agent, Agent) and not isinstance(agent, LlmAgent):
|
259
|
-
return
|
260
|
-
|
261
|
-
for tool in agent.canonical_tools:
|
262
|
-
tool_name = tool.name
|
263
|
-
if tool_name in all_mock_tools:
|
264
|
-
agent.before_tool_callback = callback
|
265
|
-
|
266
|
-
# Apply recursively to subagents if they exist
|
267
|
-
for sub_agent in agent.sub_agents:
|
268
|
-
await EvaluationGenerator.apply_before_tool_callback(
|
269
|
-
sub_agent, callback, all_mock_tools
|
270
|
-
)
|
244
|
+
return responses
|
245
|
+
return responses
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from abc import ABC
|
16
|
+
from enum import Enum
|
17
|
+
from typing import Optional
|
18
|
+
|
19
|
+
from pydantic import BaseModel
|
20
|
+
|
21
|
+
from .eval_case import Invocation
|
22
|
+
|
23
|
+
|
24
|
+
class EvalStatus(Enum):
|
25
|
+
PASSED = 1
|
26
|
+
FAILED = 2
|
27
|
+
NOT_EVALUATED = 3
|
28
|
+
|
29
|
+
|
30
|
+
class PerInvocationResult(BaseModel):
|
31
|
+
"""Metric evaluation score per invocation."""
|
32
|
+
|
33
|
+
actual_invocation: Invocation
|
34
|
+
expected_invocation: Invocation
|
35
|
+
score: Optional[float] = None
|
36
|
+
eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
|
37
|
+
|
38
|
+
|
39
|
+
class EvaluationResult(BaseModel):
|
40
|
+
overall_score: Optional[float] = None
|
41
|
+
"""Overall score, based on each invocation."""
|
42
|
+
|
43
|
+
overall_eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
|
44
|
+
"""Overall status, based on each invocation."""
|
45
|
+
|
46
|
+
per_invocation_results: list[PerInvocationResult] = []
|
47
|
+
|
48
|
+
|
49
|
+
class Evaluator(ABC):
|
50
|
+
"""A merics evaluator interface."""
|
51
|
+
|
52
|
+
def evaluate_invocations(
|
53
|
+
self,
|
54
|
+
actual_invocations: list[Invocation],
|
55
|
+
expected_invocations: list[Invocation],
|
56
|
+
) -> EvaluationResult:
|
57
|
+
"""Returns EvaluationResult after performing evaluations using actual and expected invocations."""
|
58
|
+
raise NotImplementedError()
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from __future__ import annotations
|
16
|
+
|
17
|
+
import json
|
18
|
+
import logging
|
19
|
+
import os
|
20
|
+
import time
|
21
|
+
|
22
|
+
from typing_extensions import override
|
23
|
+
|
24
|
+
from .eval_result import EvalCaseResult
|
25
|
+
from .eval_result import EvalSetResult
|
26
|
+
from .eval_set_results_manager import EvalSetResultsManager
|
27
|
+
|
28
|
+
logger = logging.getLogger("google_adk." + __name__)
|
29
|
+
|
30
|
+
_ADK_EVAL_HISTORY_DIR = ".adk/eval_history"
|
31
|
+
_EVAL_SET_RESULT_FILE_EXTENSION = ".evalset_result.json"
|
32
|
+
|
33
|
+
|
34
|
+
def _sanitize_eval_set_result_name(eval_set_result_name: str) -> str:
|
35
|
+
return eval_set_result_name.replace("/", "_")
|
36
|
+
|
37
|
+
|
38
|
+
class LocalEvalSetResultsManager(EvalSetResultsManager):
|
39
|
+
"""An EvalSetResult manager that stores eval set results locally on disk."""
|
40
|
+
|
41
|
+
def __init__(self, agents_dir: str):
|
42
|
+
self._agents_dir = agents_dir
|
43
|
+
|
44
|
+
@override
|
45
|
+
def save_eval_set_result(
|
46
|
+
self,
|
47
|
+
app_name: str,
|
48
|
+
eval_set_id: str,
|
49
|
+
eval_case_results: list[EvalCaseResult],
|
50
|
+
) -> None:
|
51
|
+
"""Creates and saves a new EvalSetResult given eval_case_results."""
|
52
|
+
timestamp = time.time()
|
53
|
+
eval_set_result_id = app_name + "_" + eval_set_id + "_" + str(timestamp)
|
54
|
+
eval_set_result_name = _sanitize_eval_set_result_name(eval_set_result_id)
|
55
|
+
eval_set_result = EvalSetResult(
|
56
|
+
eval_set_result_id=eval_set_result_id,
|
57
|
+
eval_set_result_name=eval_set_result_name,
|
58
|
+
eval_set_id=eval_set_id,
|
59
|
+
eval_case_results=eval_case_results,
|
60
|
+
creation_timestamp=timestamp,
|
61
|
+
)
|
62
|
+
# Write eval result file, with eval_set_result_name.
|
63
|
+
app_eval_history_dir = self._get_eval_history_dir(app_name)
|
64
|
+
if not os.path.exists(app_eval_history_dir):
|
65
|
+
os.makedirs(app_eval_history_dir)
|
66
|
+
# Convert to json and write to file.
|
67
|
+
eval_set_result_json = eval_set_result.model_dump_json()
|
68
|
+
eval_set_result_file_path = os.path.join(
|
69
|
+
app_eval_history_dir,
|
70
|
+
eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION,
|
71
|
+
)
|
72
|
+
logger.info("Writing eval result to file: %s", eval_set_result_file_path)
|
73
|
+
with open(eval_set_result_file_path, "w") as f:
|
74
|
+
f.write(json.dumps(eval_set_result_json, indent=2))
|
75
|
+
|
76
|
+
@override
|
77
|
+
def get_eval_set_result(
|
78
|
+
self, app_name: str, eval_set_result_id: str
|
79
|
+
) -> EvalSetResult:
|
80
|
+
"""Returns an EvalSetResult identified by app_name and eval_set_result_id."""
|
81
|
+
# Load the eval set result file data.
|
82
|
+
maybe_eval_result_file_path = (
|
83
|
+
os.path.join(
|
84
|
+
self._get_eval_history_dir(app_name),
|
85
|
+
eval_set_result_id,
|
86
|
+
)
|
87
|
+
+ _EVAL_SET_RESULT_FILE_EXTENSION
|
88
|
+
)
|
89
|
+
if not os.path.exists(maybe_eval_result_file_path):
|
90
|
+
raise ValueError(
|
91
|
+
f"Eval set result `{eval_set_result_id}` does not exist."
|
92
|
+
)
|
93
|
+
with open(maybe_eval_result_file_path, "r") as file:
|
94
|
+
eval_result_data = json.load(file)
|
95
|
+
return EvalSetResult.model_validate_json(eval_result_data)
|
96
|
+
|
97
|
+
@override
|
98
|
+
def list_eval_set_results(self, app_name: str) -> list[str]:
|
99
|
+
"""Returns the eval result ids that belong to the given app_name."""
|
100
|
+
app_eval_history_directory = self._get_eval_history_dir(app_name)
|
101
|
+
|
102
|
+
if not os.path.exists(app_eval_history_directory):
|
103
|
+
return []
|
104
|
+
|
105
|
+
eval_result_files = [
|
106
|
+
file.removesuffix(_EVAL_SET_RESULT_FILE_EXTENSION)
|
107
|
+
for file in os.listdir(app_eval_history_directory)
|
108
|
+
if file.endswith(_EVAL_SET_RESULT_FILE_EXTENSION)
|
109
|
+
]
|
110
|
+
return eval_result_files
|
111
|
+
|
112
|
+
def _get_eval_history_dir(self, app_name: str) -> str:
|
113
|
+
return os.path.join(self._agents_dir, app_name, _ADK_EVAL_HISTORY_DIR)
|