google-adk 0.5.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. google/adk/agents/base_agent.py +76 -30
  2. google/adk/agents/callback_context.py +2 -6
  3. google/adk/agents/llm_agent.py +122 -30
  4. google/adk/agents/loop_agent.py +1 -1
  5. google/adk/agents/parallel_agent.py +7 -0
  6. google/adk/agents/readonly_context.py +8 -0
  7. google/adk/agents/run_config.py +1 -1
  8. google/adk/agents/sequential_agent.py +31 -0
  9. google/adk/agents/transcription_entry.py +4 -2
  10. google/adk/artifacts/gcs_artifact_service.py +1 -1
  11. google/adk/artifacts/in_memory_artifact_service.py +1 -1
  12. google/adk/auth/auth_credential.py +10 -2
  13. google/adk/auth/auth_preprocessor.py +7 -1
  14. google/adk/auth/auth_tool.py +3 -4
  15. google/adk/cli/agent_graph.py +5 -5
  16. google/adk/cli/browser/index.html +4 -4
  17. google/adk/cli/browser/{main-ULN5R5I5.js → main-PKDNKWJE.js} +59 -60
  18. google/adk/cli/browser/polyfills-B6TNHZQ6.js +17 -0
  19. google/adk/cli/cli.py +10 -9
  20. google/adk/cli/cli_deploy.py +7 -2
  21. google/adk/cli/cli_eval.py +109 -115
  22. google/adk/cli/cli_tools_click.py +179 -67
  23. google/adk/cli/fast_api.py +248 -197
  24. google/adk/cli/utils/agent_loader.py +137 -0
  25. google/adk/cli/utils/cleanup.py +40 -0
  26. google/adk/cli/utils/common.py +23 -0
  27. google/adk/cli/utils/evals.py +83 -0
  28. google/adk/cli/utils/logs.py +8 -5
  29. google/adk/code_executors/__init__.py +3 -1
  30. google/adk/code_executors/built_in_code_executor.py +52 -0
  31. google/adk/code_executors/code_execution_utils.py +2 -1
  32. google/adk/code_executors/container_code_executor.py +0 -1
  33. google/adk/code_executors/vertex_ai_code_executor.py +6 -8
  34. google/adk/evaluation/__init__.py +1 -1
  35. google/adk/evaluation/agent_evaluator.py +168 -128
  36. google/adk/evaluation/eval_case.py +104 -0
  37. google/adk/evaluation/eval_metrics.py +74 -0
  38. google/adk/evaluation/eval_result.py +86 -0
  39. google/adk/evaluation/eval_set.py +39 -0
  40. google/adk/evaluation/eval_set_results_manager.py +47 -0
  41. google/adk/evaluation/eval_sets_manager.py +43 -0
  42. google/adk/evaluation/evaluation_generator.py +88 -113
  43. google/adk/evaluation/evaluator.py +58 -0
  44. google/adk/evaluation/local_eval_set_results_manager.py +113 -0
  45. google/adk/evaluation/local_eval_sets_manager.py +264 -0
  46. google/adk/evaluation/response_evaluator.py +106 -1
  47. google/adk/evaluation/trajectory_evaluator.py +84 -2
  48. google/adk/events/event.py +6 -1
  49. google/adk/events/event_actions.py +6 -1
  50. google/adk/examples/base_example_provider.py +1 -0
  51. google/adk/examples/example_util.py +3 -2
  52. google/adk/flows/llm_flows/_code_execution.py +9 -1
  53. google/adk/flows/llm_flows/audio_transcriber.py +4 -3
  54. google/adk/flows/llm_flows/base_llm_flow.py +58 -21
  55. google/adk/flows/llm_flows/contents.py +3 -1
  56. google/adk/flows/llm_flows/functions.py +9 -8
  57. google/adk/flows/llm_flows/instructions.py +18 -80
  58. google/adk/flows/llm_flows/single_flow.py +2 -2
  59. google/adk/memory/__init__.py +1 -1
  60. google/adk/memory/_utils.py +23 -0
  61. google/adk/memory/base_memory_service.py +23 -21
  62. google/adk/memory/in_memory_memory_service.py +57 -25
  63. google/adk/memory/memory_entry.py +37 -0
  64. google/adk/memory/vertex_ai_rag_memory_service.py +38 -15
  65. google/adk/models/anthropic_llm.py +16 -9
  66. google/adk/models/base_llm.py +2 -1
  67. google/adk/models/base_llm_connection.py +2 -0
  68. google/adk/models/gemini_llm_connection.py +11 -11
  69. google/adk/models/google_llm.py +12 -2
  70. google/adk/models/lite_llm.py +80 -23
  71. google/adk/models/llm_response.py +16 -3
  72. google/adk/models/registry.py +1 -1
  73. google/adk/runners.py +98 -42
  74. google/adk/sessions/__init__.py +1 -1
  75. google/adk/sessions/_session_util.py +2 -1
  76. google/adk/sessions/base_session_service.py +6 -33
  77. google/adk/sessions/database_session_service.py +57 -67
  78. google/adk/sessions/in_memory_session_service.py +106 -24
  79. google/adk/sessions/session.py +3 -0
  80. google/adk/sessions/vertex_ai_session_service.py +44 -51
  81. google/adk/telemetry.py +7 -2
  82. google/adk/tools/__init__.py +4 -7
  83. google/adk/tools/_memory_entry_utils.py +30 -0
  84. google/adk/tools/agent_tool.py +10 -10
  85. google/adk/tools/apihub_tool/apihub_toolset.py +55 -74
  86. google/adk/tools/apihub_tool/clients/apihub_client.py +10 -3
  87. google/adk/tools/apihub_tool/clients/secret_client.py +1 -0
  88. google/adk/tools/application_integration_tool/application_integration_toolset.py +111 -85
  89. google/adk/tools/application_integration_tool/clients/connections_client.py +28 -1
  90. google/adk/tools/application_integration_tool/clients/integration_client.py +7 -5
  91. google/adk/tools/application_integration_tool/integration_connector_tool.py +69 -26
  92. google/adk/tools/base_toolset.py +96 -0
  93. google/adk/tools/bigquery/__init__.py +28 -0
  94. google/adk/tools/bigquery/bigquery_credentials.py +216 -0
  95. google/adk/tools/bigquery/bigquery_tool.py +116 -0
  96. google/adk/tools/{built_in_code_execution_tool.py → enterprise_search_tool.py} +17 -11
  97. google/adk/tools/function_parameter_parse_util.py +9 -2
  98. google/adk/tools/function_tool.py +33 -3
  99. google/adk/tools/get_user_choice_tool.py +1 -0
  100. google/adk/tools/google_api_tool/__init__.py +24 -70
  101. google/adk/tools/google_api_tool/google_api_tool.py +12 -6
  102. google/adk/tools/google_api_tool/{google_api_tool_set.py → google_api_toolset.py} +57 -55
  103. google/adk/tools/google_api_tool/google_api_toolsets.py +108 -0
  104. google/adk/tools/google_api_tool/googleapi_to_openapi_converter.py +40 -42
  105. google/adk/tools/google_search_tool.py +2 -2
  106. google/adk/tools/langchain_tool.py +96 -49
  107. google/adk/tools/load_memory_tool.py +14 -5
  108. google/adk/tools/mcp_tool/__init__.py +3 -2
  109. google/adk/tools/mcp_tool/conversion_utils.py +6 -2
  110. google/adk/tools/mcp_tool/mcp_session_manager.py +80 -69
  111. google/adk/tools/mcp_tool/mcp_tool.py +35 -32
  112. google/adk/tools/mcp_tool/mcp_toolset.py +99 -194
  113. google/adk/tools/openapi_tool/auth/credential_exchangers/base_credential_exchanger.py +1 -3
  114. google/adk/tools/openapi_tool/auth/credential_exchangers/service_account_exchanger.py +6 -7
  115. google/adk/tools/openapi_tool/common/common.py +5 -1
  116. google/adk/tools/openapi_tool/openapi_spec_parser/__init__.py +7 -2
  117. google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +27 -7
  118. google/adk/tools/openapi_tool/openapi_spec_parser/operation_parser.py +36 -32
  119. google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +11 -1
  120. google/adk/tools/openapi_tool/openapi_spec_parser/tool_auth_handler.py +1 -1
  121. google/adk/tools/preload_memory_tool.py +27 -18
  122. google/adk/tools/retrieval/__init__.py +1 -1
  123. google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +1 -1
  124. google/adk/tools/toolbox_toolset.py +107 -0
  125. google/adk/tools/transfer_to_agent_tool.py +0 -1
  126. google/adk/utils/__init__.py +13 -0
  127. google/adk/utils/instructions_utils.py +131 -0
  128. google/adk/version.py +1 -1
  129. {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/METADATA +18 -19
  130. google_adk-1.1.0.dist-info/RECORD +200 -0
  131. google/adk/agents/remote_agent.py +0 -50
  132. google/adk/cli/browser/polyfills-FFHMD2TL.js +0 -18
  133. google/adk/cli/fast_api.py.orig +0 -728
  134. google/adk/tools/google_api_tool/google_api_tool_sets.py +0 -112
  135. google/adk/tools/toolbox_tool.py +0 -46
  136. google_adk-0.5.0.dist-info/RECORD +0 -180
  137. {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/WHEEL +0 -0
  138. {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/entry_points.txt +0 -0
  139. {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,43 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from abc import ABC
16
+ from abc import abstractmethod
17
+
18
+ from .eval_case import EvalCase
19
+ from .eval_set import EvalSet
20
+
21
+
22
class EvalSetsManager(ABC):
    """Interface for managing eval sets.

    Every operation is scoped by an `app_name`; individual eval sets are
    addressed by an `eval_set_id`.
    """

    @abstractmethod
    def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
        """Fetch the EvalSet identified by `app_name` and `eval_set_id`."""
        raise NotImplementedError()

    @abstractmethod
    def create_eval_set(self, app_name: str, eval_set_id: str):
        """Create an empty EvalSet for the given `app_name` and `eval_set_id`."""
        raise NotImplementedError()

    @abstractmethod
    def list_eval_sets(self, app_name: str) -> list[str]:
        """List the EvalSets that belong to `app_name`."""
        raise NotImplementedError()

    @abstractmethod
    def add_eval_case(
        self, app_name: str, eval_set_id: str, eval_case: EvalCase
    ):
        """Add `eval_case` to an existing EvalSet.

        The target EvalSet is the one identified by `app_name` and
        `eval_set_id`.
        """
        raise NotImplementedError()
@@ -13,19 +13,34 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import importlib
16
+ from typing import Any
17
+ from typing import Optional
16
18
  import uuid
17
19
 
18
- from google.genai import types
20
+ from pydantic import BaseModel
19
21
 
20
- from ..agents.base_agent import BaseAgent
21
22
  from ..agents.llm_agent import Agent
22
- from ..agents.llm_agent import BeforeToolCallback
23
- from ..agents.llm_agent import LlmAgent
23
+ from ..artifacts.base_artifact_service import BaseArtifactService
24
24
  from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
25
25
  from ..runners import Runner
26
+ from ..sessions.base_session_service import BaseSessionService
26
27
  from ..sessions.in_memory_session_service import InMemorySessionService
27
28
  from ..sessions.session import Session
28
- from .evaluation_constants import EvalConstants
29
+ from .eval_case import EvalCase
30
+ from .eval_case import IntermediateData
31
+ from .eval_case import Invocation
32
+ from .eval_case import SessionInput
33
+ from .eval_set import EvalSet
34
+
35
+
36
class EvalCaseResponses(BaseModel):
    """Pairs an EvalCase with the responses collected for it.

    There are multiple responses because the requests to generate inferences
    are repeated.
    """

    # The eval case the responses belong to.
    eval_case: EvalCase

    # One inner list of Invocations per repeated request.
    responses: list[list[Invocation]]
29
44
 
30
45
 
31
46
  class EvaluationGenerator:
@@ -33,12 +48,11 @@ class EvaluationGenerator:
33
48
 
34
49
  @staticmethod
35
50
  async def generate_responses(
36
- eval_dataset,
37
- agent_module_path,
38
- repeat_num=3,
39
- agent_name=None,
40
- initial_session={},
41
- ):
51
+ eval_set: EvalSet,
52
+ agent_module_path: str,
53
+ repeat_num: int = 3,
54
+ agent_name: str = None,
55
+ ) -> list[EvalCaseResponses]:
42
56
  """Returns evaluation responses for the given dataset and agent.
43
57
 
44
58
  Args:
@@ -48,17 +62,23 @@ class EvaluationGenerator:
48
62
  usually done to remove uncertainty that a single run may bring.
49
63
  agent_name: The name of the agent that should be evaluated. This is
50
64
  usually the sub-agent.
51
- initial_session: Initial session for the eval data.
52
65
  """
53
66
  results = []
54
67
 
55
- for _ in range(repeat_num):
56
- for data in eval_dataset:
57
- results.append(
58
- EvaluationGenerator._process_query(
59
- data, agent_module_path, agent_name, initial_session
60
- )
68
+ for eval_case in eval_set.eval_cases:
69
+ responses = []
70
+ for _ in range(repeat_num):
71
+ response_invocations = await EvaluationGenerator._process_query(
72
+ eval_case.conversation,
73
+ agent_module_path,
74
+ agent_name,
75
+ eval_case.session_input,
61
76
  )
77
+ responses.append(response_invocations)
78
+
79
+ results.append(
80
+ EvalCaseResponses(eval_case=eval_case, responses=responses)
81
+ )
62
82
 
63
83
  return results
64
84
 
@@ -89,7 +109,12 @@ class EvaluationGenerator:
89
109
  return results
90
110
 
91
111
  @staticmethod
92
- def _process_query(data, module_name, agent_name=None, initial_session={}):
112
+ async def _process_query(
113
+ invocations: list[Invocation],
114
+ module_name: str,
115
+ agent_name: Optional[str] = None,
116
+ initial_session: Optional[SessionInput] = None,
117
+ ) -> list[Invocation]:
93
118
  """Process a query using the agent and evaluation dataset."""
94
119
  module_path = f"{module_name}"
95
120
  agent_module = importlib.import_module(module_path)
@@ -102,56 +127,40 @@ class EvaluationGenerator:
102
127
  agent_to_evaluate = root_agent.find_agent(agent_name)
103
128
  assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."
104
129
 
105
- return EvaluationGenerator._process_query_with_root_agent(
106
- data, agent_to_evaluate, reset_func, initial_session
130
+ return await EvaluationGenerator._generate_inferences_from_root_agent(
131
+ invocations, agent_to_evaluate, reset_func, initial_session
107
132
  )
108
133
 
109
134
  @staticmethod
110
- async def _process_query_with_root_agent(
111
- data,
112
- root_agent,
113
- reset_func,
114
- initial_session={},
115
- session_id=None,
116
- session_service=None,
117
- artifact_service=None,
118
- ):
119
- """Process a query using the agent and evaluation dataset."""
120
-
121
- # we don't know which tools belong to which agent
122
- # so we just apply to any agents that has certain tool outputs
123
- all_mock_tools = set()
124
- for eval_entry in data:
125
- expected_tool_use = eval_entry.get(EvalConstants.EXPECTED_TOOL_USE, [])
126
- for expected in expected_tool_use:
127
- if EvalConstants.MOCK_TOOL_OUTPUT in expected:
128
- all_mock_tools.add(expected[EvalConstants.TOOL_NAME])
129
-
130
- eval_data_copy = data.copy()
131
- await EvaluationGenerator.apply_before_tool_callback(
132
- root_agent,
133
- lambda *args: EvaluationGenerator.before_tool_callback(
134
- *args, eval_dataset=eval_data_copy
135
- ),
136
- all_mock_tools,
137
- )
138
-
135
+ async def _generate_inferences_from_root_agent(
136
+ invocations: list[Invocation],
137
+ root_agent: Agent,
138
+ reset_func: Any,
139
+ initial_session: Optional[SessionInput] = None,
140
+ session_id: Optional[str] = None,
141
+ session_service: Optional[BaseSessionService] = None,
142
+ artifact_service: Optional[BaseArtifactService] = None,
143
+ ) -> list[Invocation]:
144
+ """Scrapes the root agent given the list of Invocations."""
139
145
  if not session_service:
140
146
  session_service = InMemorySessionService()
141
147
 
142
- app_name = initial_session.get("app_name", "EvaluationGenerator")
143
- user_id = initial_session.get("user_id", "test_user_id")
148
+ app_name = (
149
+ initial_session.app_name if initial_session else "EvaluationGenerator"
150
+ )
151
+ user_id = initial_session.user_id if initial_session else "test_user_id"
144
152
  session_id = session_id if session_id else str(uuid.uuid4())
145
153
 
146
- _ = session_service.create_session(
154
+ _ = await session_service.create_session(
147
155
  app_name=app_name,
148
156
  user_id=user_id,
149
- state=initial_session.get("state", {}),
157
+ state=initial_session.state if initial_session else {},
150
158
  session_id=session_id,
151
159
  )
152
160
 
153
161
  if not artifact_service:
154
162
  artifact_service = InMemoryArtifactService()
163
+
155
164
  runner = Runner(
156
165
  app_name=app_name,
157
166
  agent=root_agent,
@@ -163,30 +172,37 @@ class EvaluationGenerator:
163
172
  if callable(reset_func):
164
173
  reset_func()
165
174
 
166
- responses = data.copy()
175
+ response_invocations = []
167
176
 
168
- for index, eval_entry in enumerate(responses):
169
- response = None
170
- query = eval_entry["query"]
171
- content = types.Content(role="user", parts=[types.Part(text=query)])
172
- turn_actual_tool_uses = []
177
+ for invocation in invocations:
178
+ final_response = None
179
+ user_content = invocation.user_content
180
+ tool_uses = []
181
+ invocation_id = ""
173
182
 
174
183
  for event in runner.run(
175
- user_id=user_id, session_id=session_id, new_message=content
184
+ user_id=user_id, session_id=session_id, new_message=user_content
176
185
  ):
186
+ invocation_id = (
187
+ event.invocation_id if not invocation_id else invocation_id
188
+ )
189
+
177
190
  if event.is_final_response() and event.content and event.content.parts:
178
- response = event.content.parts[0].text
191
+ final_response = event.content
179
192
  elif event.get_function_calls():
180
193
  for call in event.get_function_calls():
181
- turn_actual_tool_uses.append({
182
- EvalConstants.TOOL_NAME: call.name,
183
- EvalConstants.TOOL_INPUT: call.args,
184
- })
194
+ tool_uses.append(call)
195
+
196
+ response_invocations.append(
197
+ Invocation(
198
+ invocation_id=invocation_id,
199
+ user_content=user_content,
200
+ final_response=final_response,
201
+ intermediate_data=IntermediateData(tool_uses=tool_uses),
202
+ )
203
+ )
185
204
 
186
- responses[index]["actual_tool_use"] = turn_actual_tool_uses
187
- responses[index]["response"] = response
188
-
189
- return responses
205
+ return response_invocations
190
206
 
191
207
  @staticmethod
192
208
  def _process_query_with_session(session_data, data):
@@ -225,46 +241,5 @@ class EvaluationGenerator:
225
241
  responses[index]["actual_tool_use"] = actual_tool_uses
226
242
  responses[index]["response"] = response
227
243
  return responses
228
-
229
- @staticmethod
230
- def before_tool_callback(tool, args, tool_context, eval_dataset):
231
- """Intercept specific tool calls and return predefined outputs
232
-
233
- from eval_dataset.
234
- """
235
- for index, eval_entry in enumerate(eval_dataset):
236
- expected_tool_use = eval_entry.get("expected_tool_use", [])
237
- for expected in expected_tool_use:
238
- if (
239
- EvalConstants.MOCK_TOOL_OUTPUT in expected
240
- and tool.name == expected[EvalConstants.TOOL_NAME]
241
- and args == expected.get(EvalConstants.TOOL_INPUT, {})
242
- ):
243
- # pop the matched entry so we don't rematch again
244
- eval_dataset.pop(index)
245
- return {"result": expected[EvalConstants.MOCK_TOOL_OUTPUT]}
246
-
247
- return None
248
-
249
- @staticmethod
250
- async def apply_before_tool_callback(
251
- agent: BaseAgent,
252
- callback: BeforeToolCallback,
253
- all_mock_tools: set[str],
254
- ):
255
- """Recursively apply the before_tool_callback to the root agent and all its subagents."""
256
- # Check if the agent has tools that are defined by evalset.
257
- # We use function names to check if tools match
258
- if not isinstance(agent, Agent) and not isinstance(agent, LlmAgent):
259
- return
260
-
261
- for tool in agent.canonical_tools:
262
- tool_name = tool.name
263
- if tool_name in all_mock_tools:
264
- agent.before_tool_callback = callback
265
-
266
- # Apply recursively to subagents if they exist
267
- for sub_agent in agent.sub_agents:
268
- await EvaluationGenerator.apply_before_tool_callback(
269
- sub_agent, callback, all_mock_tools
270
- )
244
+ return responses
245
+ return responses
@@ -0,0 +1,58 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from abc import ABC
16
+ from enum import Enum
17
+ from typing import Optional
18
+
19
+ from pydantic import BaseModel
20
+
21
+ from .eval_case import Invocation
22
+
23
+
24
class EvalStatus(Enum):
    """Outcome of an evaluation."""

    PASSED = 1
    FAILED = 2
    # Used as the default status by the result models in this module.
    NOT_EVALUATED = 3
28
+
29
+
30
class PerInvocationResult(BaseModel):
    """Evaluation score of a metric for a single invocation."""

    # The invocation that was actually produced.
    actual_invocation: Invocation

    # The invocation it is compared against.
    expected_invocation: Invocation

    # Score for this invocation; None when no score is available.
    score: Optional[float] = None

    # Status for this invocation; defaults to NOT_EVALUATED.
    eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
37
+
38
+
39
class EvaluationResult(BaseModel):
    """Aggregated outcome of evaluating a list of invocations."""

    # Overall score, based on each invocation.
    overall_score: Optional[float] = None

    # Overall status, based on each invocation.
    overall_eval_status: EvalStatus = EvalStatus.NOT_EVALUATED

    # Individual results, one entry per evaluated invocation.
    per_invocation_results: list[PerInvocationResult] = []
47
+
48
+
49
class Evaluator(ABC):
    """A metrics evaluator interface."""

    def evaluate_invocations(
        self,
        actual_invocations: list[Invocation],
        expected_invocations: list[Invocation],
    ) -> EvaluationResult:
        """Evaluate actual invocations against the expected ones.

        Args:
          actual_invocations: The invocations to be evaluated.
          expected_invocations: The invocations used as the reference.

        Returns:
          An EvaluationResult for the given invocations.

        Raises:
          NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()
@@ -0,0 +1,113 @@
1
+ # Copyright 2025 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import logging
19
+ import os
20
+ import time
21
+
22
+ from typing_extensions import override
23
+
24
+ from .eval_result import EvalCaseResult
25
+ from .eval_result import EvalSetResult
26
+ from .eval_set_results_manager import EvalSetResultsManager
27
+
28
+ logger = logging.getLogger("google_adk." + __name__)
29
+
30
+ _ADK_EVAL_HISTORY_DIR = ".adk/eval_history"
31
+ _EVAL_SET_RESULT_FILE_EXTENSION = ".evalset_result.json"
32
+
33
+
34
+ def _sanitize_eval_set_result_name(eval_set_result_name: str) -> str:
35
+ return eval_set_result_name.replace("/", "_")
36
+
37
+
38
class LocalEvalSetResultsManager(EvalSetResultsManager):
    """An EvalSetResult manager that stores eval set results locally on disk.

    Each result is written to its own file inside the app's eval history
    directory (see `_get_eval_history_dir`).
    """

    def __init__(self, agents_dir: str):
        """Remembers `agents_dir`, the root under which app folders live."""
        self._agents_dir = agents_dir

    @override
    def save_eval_set_result(
        self,
        app_name: str,
        eval_set_id: str,
        eval_case_results: list[EvalCaseResult],
    ) -> None:
        """Creates and saves a new EvalSetResult given eval_case_results.

        Args:
          app_name: Name of the app; selects the history directory.
          eval_set_id: Id of the eval set the results belong to.
          eval_case_results: The per-eval-case results to persist.
        """
        timestamp = time.time()
        eval_set_result_id = f"{app_name}_{eval_set_id}_{timestamp}"
        # The file name must not contain path separators.
        eval_set_result_name = _sanitize_eval_set_result_name(
            eval_set_result_id
        )
        eval_set_result = EvalSetResult(
            eval_set_result_id=eval_set_result_id,
            eval_set_result_name=eval_set_result_name,
            eval_set_id=eval_set_id,
            eval_case_results=eval_case_results,
            creation_timestamp=timestamp,
        )

        app_eval_history_dir = self._get_eval_history_dir(app_name)
        # exist_ok avoids failing when the directory appears between a
        # separate existence check and the creation call.
        os.makedirs(app_eval_history_dir, exist_ok=True)

        eval_set_result_file_path = os.path.join(
            app_eval_history_dir,
            eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION,
        )
        logger.info(
            "Writing eval result to file: %s", eval_set_result_file_path
        )
        # NOTE: model_dump_json() already returns a JSON document as a str;
        # passing it through json.dumps stores it as a JSON *string literal*.
        # The on-disk format is kept as is so existing readers keep working;
        # get_eval_set_result reverses this encoding.
        eval_set_result_json = eval_set_result.model_dump_json()
        with open(eval_set_result_file_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(eval_set_result_json, indent=2))

    @override
    def get_eval_set_result(
        self, app_name: str, eval_set_result_id: str
    ) -> EvalSetResult:
        """Returns an EvalSetResult identified by app_name and eval_set_result_id.

        Args:
          app_name: Name of the app; selects the history directory.
          eval_set_result_id: Id (or sanitized name) of the stored result.

        Raises:
          ValueError: If no result file exists for `eval_set_result_id`.
        """
        # Files are stored under the sanitized name, so the lookup has to be
        # sanitized too. This also keeps the id from acting as a path.
        eval_result_file_path = os.path.join(
            self._get_eval_history_dir(app_name),
            _sanitize_eval_set_result_name(eval_set_result_id)
            + _EVAL_SET_RESULT_FILE_EXTENSION,
        )
        try:
            with open(eval_result_file_path, "r", encoding="utf-8") as file:
                eval_result_data = json.load(file)
        except FileNotFoundError as e:
            raise ValueError(
                f"Eval set result `{eval_set_result_id}` does not exist."
            ) from e

        if isinstance(eval_result_data, str):
            # Format produced by save_eval_set_result: a JSON string literal
            # that itself contains the serialized model.
            return EvalSetResult.model_validate_json(eval_result_data)
        # Also accept a file that holds the model as a plain JSON object.
        return EvalSetResult.model_validate(eval_result_data)

    @override
    def list_eval_set_results(self, app_name: str) -> list[str]:
        """Returns the eval result ids that belong to the given app_name.

        The ids are the result file names with the result-file extension
        removed. An empty list is returned when the history directory does
        not exist.
        """
        try:
            entries = os.listdir(self._get_eval_history_dir(app_name))
        except FileNotFoundError:
            return []

        return [
            entry.removesuffix(_EVAL_SET_RESULT_FILE_EXTENSION)
            for entry in entries
            if entry.endswith(_EVAL_SET_RESULT_FILE_EXTENSION)
        ]

    def _get_eval_history_dir(self, app_name: str) -> str:
        """Returns the eval history directory of `app_name`."""
        return os.path.join(self._agents_dir, app_name, _ADK_EVAL_HISTORY_DIR)