google-adk 0.5.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. google/adk/agents/base_agent.py +76 -30
  2. google/adk/agents/base_agent.py.orig +330 -0
  3. google/adk/agents/callback_context.py +0 -5
  4. google/adk/agents/llm_agent.py +122 -30
  5. google/adk/agents/loop_agent.py +1 -1
  6. google/adk/agents/parallel_agent.py +7 -0
  7. google/adk/agents/readonly_context.py +7 -1
  8. google/adk/agents/run_config.py +1 -1
  9. google/adk/agents/sequential_agent.py +31 -0
  10. google/adk/agents/transcription_entry.py +4 -2
  11. google/adk/artifacts/gcs_artifact_service.py +1 -1
  12. google/adk/artifacts/in_memory_artifact_service.py +1 -1
  13. google/adk/auth/auth_credential.py +6 -1
  14. google/adk/auth/auth_preprocessor.py +7 -1
  15. google/adk/auth/auth_tool.py +3 -4
  16. google/adk/cli/agent_graph.py +5 -5
  17. google/adk/cli/browser/index.html +2 -2
  18. google/adk/cli/browser/{main-ULN5R5I5.js → main-QOEMUXM4.js} +44 -45
  19. google/adk/cli/cli.py +7 -7
  20. google/adk/cli/cli_deploy.py +7 -2
  21. google/adk/cli/cli_eval.py +172 -99
  22. google/adk/cli/cli_tools_click.py +147 -64
  23. google/adk/cli/fast_api.py +330 -148
  24. google/adk/cli/fast_api.py.orig +174 -80
  25. google/adk/cli/utils/common.py +23 -0
  26. google/adk/cli/utils/evals.py +83 -1
  27. google/adk/cli/utils/logs.py +13 -5
  28. google/adk/code_executors/__init__.py +3 -1
  29. google/adk/code_executors/built_in_code_executor.py +52 -0
  30. google/adk/evaluation/__init__.py +1 -1
  31. google/adk/evaluation/agent_evaluator.py +168 -128
  32. google/adk/evaluation/eval_case.py +102 -0
  33. google/adk/evaluation/eval_set.py +37 -0
  34. google/adk/evaluation/eval_sets_manager.py +42 -0
  35. google/adk/evaluation/evaluation_generator.py +88 -113
  36. google/adk/evaluation/evaluator.py +56 -0
  37. google/adk/evaluation/local_eval_sets_manager.py +264 -0
  38. google/adk/evaluation/response_evaluator.py +106 -2
  39. google/adk/evaluation/trajectory_evaluator.py +83 -2
  40. google/adk/events/event.py +6 -1
  41. google/adk/events/event_actions.py +6 -1
  42. google/adk/examples/example_util.py +3 -2
  43. google/adk/flows/llm_flows/_code_execution.py +9 -1
  44. google/adk/flows/llm_flows/audio_transcriber.py +4 -3
  45. google/adk/flows/llm_flows/base_llm_flow.py +54 -15
  46. google/adk/flows/llm_flows/functions.py +9 -8
  47. google/adk/flows/llm_flows/instructions.py +13 -5
  48. google/adk/flows/llm_flows/single_flow.py +1 -1
  49. google/adk/memory/__init__.py +1 -1
  50. google/adk/memory/_utils.py +23 -0
  51. google/adk/memory/base_memory_service.py +23 -21
  52. google/adk/memory/base_memory_service.py.orig +76 -0
  53. google/adk/memory/in_memory_memory_service.py +57 -25
  54. google/adk/memory/memory_entry.py +37 -0
  55. google/adk/memory/vertex_ai_rag_memory_service.py +38 -15
  56. google/adk/models/anthropic_llm.py +16 -9
  57. google/adk/models/gemini_llm_connection.py +11 -11
  58. google/adk/models/google_llm.py +9 -2
  59. google/adk/models/google_llm.py.orig +305 -0
  60. google/adk/models/lite_llm.py +77 -21
  61. google/adk/models/llm_response.py +14 -2
  62. google/adk/models/registry.py +1 -1
  63. google/adk/runners.py +65 -41
  64. google/adk/sessions/__init__.py +1 -1
  65. google/adk/sessions/base_session_service.py +6 -33
  66. google/adk/sessions/database_session_service.py +58 -65
  67. google/adk/sessions/in_memory_session_service.py +106 -24
  68. google/adk/sessions/session.py +3 -0
  69. google/adk/sessions/vertex_ai_session_service.py +23 -45
  70. google/adk/telemetry.py +3 -0
  71. google/adk/tools/__init__.py +4 -7
  72. google/adk/tools/{built_in_code_execution_tool.py → _built_in_code_execution_tool.py} +11 -0
  73. google/adk/tools/_memory_entry_utils.py +30 -0
  74. google/adk/tools/agent_tool.py +9 -9
  75. google/adk/tools/apihub_tool/apihub_toolset.py +55 -74
  76. google/adk/tools/application_integration_tool/application_integration_toolset.py +107 -85
  77. google/adk/tools/application_integration_tool/clients/connections_client.py +20 -0
  78. google/adk/tools/application_integration_tool/clients/integration_client.py +6 -6
  79. google/adk/tools/application_integration_tool/integration_connector_tool.py +69 -26
  80. google/adk/tools/base_toolset.py +58 -0
  81. google/adk/tools/enterprise_search_tool.py +65 -0
  82. google/adk/tools/function_parameter_parse_util.py +2 -2
  83. google/adk/tools/google_api_tool/__init__.py +18 -70
  84. google/adk/tools/google_api_tool/google_api_tool.py +11 -5
  85. google/adk/tools/google_api_tool/google_api_toolset.py +126 -0
  86. google/adk/tools/google_api_tool/google_api_toolsets.py +102 -0
  87. google/adk/tools/google_api_tool/googleapi_to_openapi_converter.py +40 -42
  88. google/adk/tools/langchain_tool.py +96 -49
  89. google/adk/tools/load_memory_tool.py +14 -5
  90. google/adk/tools/mcp_tool/__init__.py +3 -2
  91. google/adk/tools/mcp_tool/mcp_session_manager.py +153 -16
  92. google/adk/tools/mcp_tool/mcp_session_manager.py.orig +322 -0
  93. google/adk/tools/mcp_tool/mcp_tool.py +12 -12
  94. google/adk/tools/mcp_tool/mcp_toolset.py +155 -195
  95. google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +32 -7
  96. google/adk/tools/openapi_tool/openapi_spec_parser/operation_parser.py +31 -31
  97. google/adk/tools/openapi_tool/openapi_spec_parser/tool_auth_handler.py +1 -1
  98. google/adk/tools/preload_memory_tool.py +27 -18
  99. google/adk/tools/retrieval/__init__.py +1 -1
  100. google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +1 -1
  101. google/adk/tools/toolbox_toolset.py +79 -0
  102. google/adk/tools/transfer_to_agent_tool.py +0 -1
  103. google/adk/version.py +1 -1
  104. {google_adk-0.5.0.dist-info → google_adk-1.0.0.dist-info}/METADATA +7 -5
  105. google_adk-1.0.0.dist-info/RECORD +195 -0
  106. google/adk/agents/remote_agent.py +0 -50
  107. google/adk/tools/google_api_tool/google_api_tool_set.py +0 -110
  108. google/adk/tools/google_api_tool/google_api_tool_sets.py +0 -112
  109. google/adk/tools/toolbox_tool.py +0 -46
  110. google_adk-0.5.0.dist-info/RECORD +0 -180
  111. {google_adk-0.5.0.dist-info → google_adk-1.0.0.dist-info}/WHEEL +0 -0
  112. {google_adk-0.5.0.dist-info → google_adk-1.0.0.dist-info}/entry_points.txt +0 -0
  113. {google_adk-0.5.0.dist-info → google_adk-1.0.0.dist-info}/licenses/LICENSE +0 -0
google/adk/evaluation/evaluation_generator.py
@@ -13,19 +13,34 @@
  # limitations under the License.

  import importlib
+ from typing import Any
+ from typing import Optional
  import uuid

- from google.genai import types
+ from pydantic import BaseModel

- from ..agents.base_agent import BaseAgent
  from ..agents.llm_agent import Agent
- from ..agents.llm_agent import BeforeToolCallback
- from ..agents.llm_agent import LlmAgent
+ from ..artifacts.base_artifact_service import BaseArtifactService
  from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
  from ..runners import Runner
+ from ..sessions.base_session_service import BaseSessionService
  from ..sessions.in_memory_session_service import InMemorySessionService
  from ..sessions.session import Session
- from .evaluation_constants import EvalConstants
+ from .eval_case import EvalCase
+ from .eval_case import IntermediateData
+ from .eval_case import Invocation
+ from .eval_case import SessionInput
+ from .eval_set import EvalSet
+
+
+ class EvalCaseResponses(BaseModel):
+   """Contains multiple responses associated with an EvalCase.
+
+   Multiple responses are a result of repeated requests to generate inferences.
+   """
+
+   eval_case: EvalCase
+   responses: list[list[Invocation]]


  class EvaluationGenerator:
@@ -33,12 +48,11 @@ class EvaluationGenerator:

    @staticmethod
    async def generate_responses(
-       eval_dataset,
-       agent_module_path,
-       repeat_num=3,
-       agent_name=None,
-       initial_session={},
-   ):
+       eval_set: EvalSet,
+       agent_module_path: str,
+       repeat_num: int = 3,
+       agent_name: str = None,
+   ) -> list[EvalCaseResponses]:
      """Returns evaluation responses for the given dataset and agent.

      Args:
@@ -48,17 +62,23 @@
          usually done to remove uncertainty that a single run may bring.
        agent_name: The name of the agent that should be evaluated. This is
          usually the sub-agent.
-       initial_session: Initial session for the eval data.
      """
      results = []

-     for _ in range(repeat_num):
-       for data in eval_dataset:
-         results.append(
-             EvaluationGenerator._process_query(
-                 data, agent_module_path, agent_name, initial_session
-             )
+     for eval_case in eval_set.eval_cases:
+       responses = []
+       for _ in range(repeat_num):
+         response_invocations = await EvaluationGenerator._process_query(
+             eval_case.conversation,
+             agent_module_path,
+             agent_name,
+             eval_case.session_input,
          )
+         responses.append(response_invocations)
+
+       results.append(
+           EvalCaseResponses(eval_case=eval_case, responses=responses)
+       )

      return results

@@ -89,7 +109,12 @@
      return results

    @staticmethod
-   def _process_query(data, module_name, agent_name=None, initial_session={}):
+   async def _process_query(
+       invocations: list[Invocation],
+       module_name: str,
+       agent_name: Optional[str] = None,
+       initial_session: Optional[SessionInput] = None,
+   ) -> list[Invocation]:
      """Process a query using the agent and evaluation dataset."""
      module_path = f"{module_name}"
      agent_module = importlib.import_module(module_path)
@@ -102,56 +127,40 @@
      agent_to_evaluate = root_agent.find_agent(agent_name)
      assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found."

-     return EvaluationGenerator._process_query_with_root_agent(
-         data, agent_to_evaluate, reset_func, initial_session
+     return await EvaluationGenerator._generate_inferences_from_root_agent(
+         invocations, agent_to_evaluate, reset_func, initial_session
      )

    @staticmethod
-   async def _process_query_with_root_agent(
-       data,
-       root_agent,
-       reset_func,
-       initial_session={},
-       session_id=None,
-       session_service=None,
-       artifact_service=None,
-   ):
-     """Process a query using the agent and evaluation dataset."""
-
-     # we don't know which tools belong to which agent
-     # so we just apply to any agents that has certain tool outputs
-     all_mock_tools = set()
-     for eval_entry in data:
-       expected_tool_use = eval_entry.get(EvalConstants.EXPECTED_TOOL_USE, [])
-       for expected in expected_tool_use:
-         if EvalConstants.MOCK_TOOL_OUTPUT in expected:
-           all_mock_tools.add(expected[EvalConstants.TOOL_NAME])
-
-     eval_data_copy = data.copy()
-     await EvaluationGenerator.apply_before_tool_callback(
-         root_agent,
-         lambda *args: EvaluationGenerator.before_tool_callback(
-             *args, eval_dataset=eval_data_copy
-         ),
-         all_mock_tools,
-     )
-
+   async def _generate_inferences_from_root_agent(
+       invocations: list[Invocation],
+       root_agent: Agent,
+       reset_func: Any,
+       initial_session: Optional[SessionInput] = None,
+       session_id: Optional[str] = None,
+       session_service: Optional[BaseSessionService] = None,
+       artifact_service: Optional[BaseArtifactService] = None,
+   ) -> list[Invocation]:
+     """Scrapes the root agent given the list of Invocations."""
      if not session_service:
        session_service = InMemorySessionService()

-     app_name = initial_session.get("app_name", "EvaluationGenerator")
-     user_id = initial_session.get("user_id", "test_user_id")
+     app_name = (
+         initial_session.app_name if initial_session else "EvaluationGenerator"
+     )
+     user_id = initial_session.user_id if initial_session else "test_user_id"
      session_id = session_id if session_id else str(uuid.uuid4())

-     _ = session_service.create_session(
+     _ = await session_service.create_session(
          app_name=app_name,
          user_id=user_id,
-         state=initial_session.get("state", {}),
+         state=initial_session.state if initial_session else {},
          session_id=session_id,
      )

      if not artifact_service:
        artifact_service = InMemoryArtifactService()
+
      runner = Runner(
          app_name=app_name,
          agent=root_agent,
@@ -163,30 +172,37 @@
      if callable(reset_func):
        reset_func()

-     responses = data.copy()
+     response_invocations = []

-     for index, eval_entry in enumerate(responses):
-       response = None
-       query = eval_entry["query"]
-       content = types.Content(role="user", parts=[types.Part(text=query)])
-       turn_actual_tool_uses = []
+     for invocation in invocations:
+       final_response = None
+       user_content = invocation.user_content
+       tool_uses = []
+       invocation_id = ""

        for event in runner.run(
-           user_id=user_id, session_id=session_id, new_message=content
+           user_id=user_id, session_id=session_id, new_message=user_content
        ):
+         invocation_id = (
+             event.invocation_id if not invocation_id else invocation_id
+         )
+
          if event.is_final_response() and event.content and event.content.parts:
-           response = event.content.parts[0].text
+           final_response = event.content
          elif event.get_function_calls():
            for call in event.get_function_calls():
-             turn_actual_tool_uses.append({
-                 EvalConstants.TOOL_NAME: call.name,
-                 EvalConstants.TOOL_INPUT: call.args,
-             })
+             tool_uses.append(call)
+
+       response_invocations.append(
+           Invocation(
+               invocation_id=invocation_id,
+               user_content=user_content,
+               final_response=final_response,
+               intermediate_data=IntermediateData(tool_uses=tool_uses),
+           )
+       )

-       responses[index]["actual_tool_use"] = turn_actual_tool_uses
-       responses[index]["response"] = response
-
-     return responses
+     return response_invocations

    @staticmethod
    def _process_query_with_session(session_data, data):
@@ -225,46 +241,5 @@
        responses[index]["actual_tool_use"] = actual_tool_uses
        responses[index]["response"] = response
      return responses
-
-   @staticmethod
-   def before_tool_callback(tool, args, tool_context, eval_dataset):
-     """Intercept specific tool calls and return predefined outputs
-
-     from eval_dataset.
-     """
-     for index, eval_entry in enumerate(eval_dataset):
-       expected_tool_use = eval_entry.get("expected_tool_use", [])
-       for expected in expected_tool_use:
-         if (
-             EvalConstants.MOCK_TOOL_OUTPUT in expected
-             and tool.name == expected[EvalConstants.TOOL_NAME]
-             and args == expected.get(EvalConstants.TOOL_INPUT, {})
-         ):
-           # pop the matched entry so we don't rematch again
-           eval_dataset.pop(index)
-           return {"result": expected[EvalConstants.MOCK_TOOL_OUTPUT]}
-
-     return None
-
-   @staticmethod
-   async def apply_before_tool_callback(
-       agent: BaseAgent,
-       callback: BeforeToolCallback,
-       all_mock_tools: set[str],
-   ):
-     """Recursively apply the before_tool_callback to the root agent and all its subagents."""
-     # Check if the agent has tools that are defined by evalset.
-     # We use function names to check if tools match
-     if not isinstance(agent, Agent) and not isinstance(agent, LlmAgent):
-       return
-
-     for tool in agent.canonical_tools:
-       tool_name = tool.name
-       if tool_name in all_mock_tools:
-         agent.before_tool_callback = callback
-
-     # Apply recursively to subagents if they exist
-     for sub_agent in agent.sub_agents:
-       await EvaluationGenerator.apply_before_tool_callback(
-           sub_agent, callback, all_mock_tools
-       )
+     return responses
+     return responses
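
Net effect of this file's changes: the dict-based eval dataset and the before_tool_callback mocking machinery are removed, and callers now pass a typed EvalSet and await generate_responses, which returns a list of EvalCaseResponses. A minimal caller sketch, assuming a hypothetical eval-set file path and an agent package "my_agent" that exposes a root agent per ADK convention:

    import asyncio

    from google.adk.evaluation.evaluation_generator import EvaluationGenerator
    from google.adk.evaluation.local_eval_sets_manager import load_eval_set_from_file


    async def main():
      # Old-format JSON eval sets are converted to the new schema on load.
      eval_set = load_eval_set_from_file(
          "agents/my_agent/sample.evalset.json",  # hypothetical path
          eval_set_id="sample",
      )
      # generate_responses is now async; each EvalCaseResponses pairs one
      # EvalCase with repeat_num runs of list[Invocation].
      results = await EvaluationGenerator.generate_responses(
          eval_set=eval_set,
          agent_module_path="my_agent",  # hypothetical agent package
          repeat_num=2,
      )
      for case_responses in results:
        print(case_responses.eval_case.eval_id, len(case_responses.responses))


    asyncio.run(main())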
google/adk/evaluation/evaluator.py (new file)
@@ -0,0 +1,56 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from abc import ABC
+ from enum import Enum
+ from typing import Optional
+ from pydantic import BaseModel
+ from .eval_case import Invocation
+
+
+ class EvalStatus(Enum):
+   PASSED = 1
+   FAILED = 2
+   NOT_EVALUATED = 3
+
+
+ class PerInvocationResult(BaseModel):
+   """Metric evaluation score per invocation."""
+
+   actual_invocation: Invocation
+   expected_invocation: Invocation
+   score: Optional[float] = None
+   eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
+
+
+ class EvaluationResult(BaseModel):
+   overall_score: Optional[float] = None
+   """Overall score, based on each invocation."""
+
+   overall_eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
+   """Overall status, based on each invocation."""
+
+   per_invocation_results: list[PerInvocationResult] = []
+
+
+ class Evaluator(ABC):
+   """A metrics evaluator interface."""
+
+   def evaluate_invocations(
+       self,
+       actual_invocations: list[Invocation],
+       expected_invocations: list[Invocation],
+   ) -> EvaluationResult:
+     """Returns EvaluationResult after performing evaluations using actual and expected invocations."""
+     raise NotImplementedError()
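
This Evaluator ABC is the interface that concrete metrics (such as the updated trajectory and response evaluators listed above) implement. As an illustration only, and not one of the shipped evaluators, a toy subclass that scores exact final-response text matches could look like this:

    from google.adk.evaluation.eval_case import Invocation
    from google.adk.evaluation.evaluator import EvalStatus
    from google.adk.evaluation.evaluator import EvaluationResult
    from google.adk.evaluation.evaluator import Evaluator
    from google.adk.evaluation.evaluator import PerInvocationResult


    class ExactMatchEvaluator(Evaluator):
      """Toy metric: 1.0 when the final response text matches the reference."""

      def evaluate_invocations(
          self,
          actual_invocations: list[Invocation],
          expected_invocations: list[Invocation],
      ) -> EvaluationResult:
        results = []
        for actual, expected in zip(actual_invocations, expected_invocations):
          # final_response is an optional genai Content; fall back to "".
          actual_text = (
              actual.final_response.parts[0].text if actual.final_response else ""
          )
          expected_text = (
              expected.final_response.parts[0].text
              if expected.final_response
              else ""
          )
          score = 1.0 if actual_text == expected_text else 0.0
          results.append(
              PerInvocationResult(
                  actual_invocation=actual,
                  expected_invocation=expected,
                  score=score,
                  eval_status=EvalStatus.PASSED if score else EvalStatus.FAILED,
              )
          )
        overall = sum(r.score for r in results) / len(results) if results else None
        return EvaluationResult(
            overall_score=overall,
            overall_eval_status=(
                EvalStatus.NOT_EVALUATED
                if overall is None
                else EvalStatus.PASSED if overall == 1.0 else EvalStatus.FAILED
            ),
            per_invocation_results=results,
        )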
google/adk/evaluation/local_eval_sets_manager.py (new file)
@@ -0,0 +1,264 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import json
+ import logging
+ import os
+ import re
+ import time
+ from typing import Any
+ import uuid
+
+ from google.genai import types as genai_types
+ from pydantic import ValidationError
+ from typing_extensions import override
+
+ from .eval_case import EvalCase
+ from .eval_case import IntermediateData
+ from .eval_case import Invocation
+ from .eval_case import SessionInput
+ from .eval_set import EvalSet
+ from .eval_sets_manager import EvalSetsManager
+
+ logger = logging.getLogger("google_adk." + __name__)
+
+ _EVAL_SET_FILE_EXTENSION = ".evalset.json"
+
+
+ def _convert_invocation_to_pydantic_schema(
+     invocation_in_json_format: dict[str, Any],
+ ) -> Invocation:
+   """Converts an invocation from the old JSON format to the new Pydantic schema."""
+   query = invocation_in_json_format["query"]
+   reference = invocation_in_json_format["reference"]
+   expected_tool_use = []
+   expected_intermediate_agent_responses = []
+
+   for old_tool_use in invocation_in_json_format.get("expected_tool_use", []):
+     expected_tool_use.append(
+         genai_types.FunctionCall(
+             name=old_tool_use["tool_name"], args=old_tool_use["tool_input"]
+         )
+     )
+
+   for old_intermediate_response in invocation_in_json_format.get(
+       "expected_intermediate_agent_responses", []
+   ):
+     expected_intermediate_agent_responses.append((
+         old_intermediate_response["author"],
+         [genai_types.Part.from_text(text=old_intermediate_response["text"])],
+     ))
+
+   return Invocation(
+       invocation_id=str(uuid.uuid4()),
+       user_content=genai_types.Content(
+           parts=[genai_types.Part.from_text(text=query)], role="user"
+       ),
+       final_response=genai_types.Content(
+           parts=[genai_types.Part.from_text(text=reference)], role="model"
+       ),
+       intermediate_data=IntermediateData(
+           tool_uses=expected_tool_use,
+           intermediate_responses=expected_intermediate_agent_responses,
+       ),
+       creation_timestamp=time.time(),
+   )
+
+
+ def convert_eval_set_to_pydanctic_schema(
+     eval_set_id: str,
+     eval_set_in_json_format: list[dict[str, Any]],
+ ) -> EvalSet:
+   r"""Returns a Pydantic EvalSet generated from the JSON representation.
+
+   Args:
+     eval_set_id: Eval set id.
+     eval_set_in_json_format: Eval set specified in JSON format.
+
+   Here is a sample eval set in JSON format:
+   [
+     {
+       "name": "roll_17_sided_dice_twice",
+       "data": [
+         {
+           "query": "What can you do?",
+           "expected_tool_use": [],
+           "expected_intermediate_agent_responses": [],
+           "reference": "I can roll dice of different sizes and check if a number
+             is prime. I can also use multiple tools in parallel.\n"
+         },
+         {
+           "query": "Roll a 17 sided dice twice for me",
+           "expected_tool_use": [
+             {
+               "tool_name": "roll_die",
+               "tool_input": {
+                 "sides": 17
+               }
+             },
+             {
+               "tool_name": "roll_die",
+               "tool_input": {
+                 "sides": 17
+               }
+             }
+           ],
+           "expected_intermediate_agent_responses": [],
+           "reference": "I have rolled a 17 sided die twice. The first roll was
+             13 and the second roll was 4.\n"
+         }
+       ],
+       "initial_session": {
+         "state": {},
+         "app_name": "hello_world",
+         "user_id": "user"
+       }
+     }
+   ]
+   """
+   eval_cases = []
+   for old_eval_case in eval_set_in_json_format:
+     new_invocations = []
+
+     for old_invocation in old_eval_case["data"]:
+       new_invocations.append(
+           _convert_invocation_to_pydantic_schema(old_invocation)
+       )
+
+     session_input = None
+     if (
+         "initial_session" in old_eval_case
+         and len(old_eval_case["initial_session"]) > 0
+     ):
+       session_input = SessionInput(
+           app_name=old_eval_case["initial_session"].get("app_name", ""),
+           user_id=old_eval_case["initial_session"].get("user_id", ""),
+           state=old_eval_case["initial_session"].get("state", {}),
+       )
+
+     new_eval_case = EvalCase(
+         eval_id=old_eval_case["name"],
+         conversation=new_invocations,
+         session_input=session_input,
+         creation_timestamp=time.time(),
+     )
+     eval_cases.append(new_eval_case)
+
+   return EvalSet(
+       eval_set_id=eval_set_id,
+       name=eval_set_id,
+       creation_timestamp=time.time(),
+       eval_cases=eval_cases,
+   )
+
+
+ def load_eval_set_from_file(
+     eval_set_file_path: str, eval_set_id: str
+ ) -> EvalSet:
+   """Returns an EvalSet that is read from the given file."""
+   with open(eval_set_file_path, "r", encoding="utf-8") as f:
+     content = f.read()
+     try:
+       return EvalSet.model_validate_json(content)
+     except ValidationError:
+       # We assume that the eval data was specified in the old format and try
+       # to convert it to the new format.
+       return convert_eval_set_to_pydanctic_schema(
+           eval_set_id, json.loads(content)
+       )
+
+
+ class LocalEvalSetsManager(EvalSetsManager):
+   """An EvalSets manager that stores eval sets locally on disk."""
+
+   def __init__(self, agent_dir: str):
+     self._agent_dir = agent_dir
+
+   @override
+   def get_eval_set(self, app_name: str, eval_set_id: str) -> EvalSet:
+     """Returns an EvalSet identified by an app_name and eval_set_id."""
+     # Load the eval set file data
+     eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
+     return load_eval_set_from_file(eval_set_file_path, eval_set_id)
+
+   @override
+   def create_eval_set(self, app_name: str, eval_set_id: str):
+     """Creates an empty EvalSet given the app_name and eval_set_id."""
+     self._validate_id(id_name="Eval Set Id", id_value=eval_set_id)
+
+     # Define the file path
+     new_eval_set_path = self._get_eval_set_file_path(app_name, eval_set_id)
+
+     logger.info("Creating eval set file `%s`", new_eval_set_path)
+
+     if not os.path.exists(new_eval_set_path):
+       # Write the JSON string to the file
+       logger.info("Eval set file doesn't exist, we will create a new one.")
+       new_eval_set = EvalSet(
+           eval_set_id=eval_set_id,
+           name=eval_set_id,
+           eval_cases=[],
+           creation_timestamp=time.time(),
+       )
+       self._write_eval_set(new_eval_set_path, new_eval_set)
+
+   @override
+   def list_eval_sets(self, app_name: str) -> list[str]:
+     """Returns a list of EvalSets that belong to the given app_name."""
+     eval_set_file_path = os.path.join(self._agent_dir, app_name)
+     eval_sets = []
+     for file in os.listdir(eval_set_file_path):
+       if file.endswith(_EVAL_SET_FILE_EXTENSION):
+         eval_sets.append(
+             os.path.basename(file).removesuffix(_EVAL_SET_FILE_EXTENSION)
+         )
+
+     return sorted(eval_sets)
+
+   @override
+   def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
+     """Adds the given EvalCase to an existing EvalSet identified by app_name and eval_set_id."""
+     eval_case_id = eval_case.eval_id
+     self._validate_id(id_name="Eval Case Id", id_value=eval_case_id)
+
+     eval_set = self.get_eval_set(app_name, eval_set_id)
+
+     if [x for x in eval_set.eval_cases if x.eval_id == eval_case_id]:
+       raise ValueError(
+           f"Eval id `{eval_case_id}` already exists in `{eval_set_id}`"
+           " eval set.",
+       )
+
+     eval_set.eval_cases.append(eval_case)
+
+     eval_set_file_path = self._get_eval_set_file_path(app_name, eval_set_id)
+     self._write_eval_set(eval_set_file_path, eval_set)
+
+   def _get_eval_set_file_path(self, app_name: str, eval_set_id: str) -> str:
+     return os.path.join(
+         self._agent_dir,
+         app_name,
+         eval_set_id + _EVAL_SET_FILE_EXTENSION,
+     )
+
+   def _validate_id(self, id_name: str, id_value: str):
+     pattern = r"^[a-zA-Z0-9_]+$"
+     if not bool(re.fullmatch(pattern, id_value)):
+       raise ValueError(
+           f"Invalid {id_name}. {id_name} should have the `{pattern}` format",
+       )
+
+   def _write_eval_set(self, eval_set_path: str, eval_set: EvalSet):
+     with open(eval_set_path, "w") as f:
+       f.write(eval_set.model_dump_json(indent=2))
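
LocalEvalSetsManager maps each eval set to <agent_dir>/<app_name>/<eval_set_id>.evalset.json. A short usage sketch under assumed paths; note the manager writes into an existing <agent_dir>/<app_name> directory and does not create it:

    import time

    from google.adk.evaluation.eval_case import EvalCase
    from google.adk.evaluation.local_eval_sets_manager import LocalEvalSetsManager

    # Hypothetical layout: ./agents/hello_world/ already exists on disk.
    manager = LocalEvalSetsManager(agent_dir="agents")

    # Ids must match ^[a-zA-Z0-9_]+$, so e.g. "smoke-tests" would be rejected.
    manager.create_eval_set("hello_world", "smoke_tests")
    manager.add_eval_case(
        "hello_world",
        "smoke_tests",
        EvalCase(
            eval_id="greets_user",
            conversation=[],  # list[Invocation]; left empty for brevity
            session_input=None,
            creation_timestamp=time.time(),
        ),
    )
    print(manager.list_eval_sets("hello_world"))  # ['smoke_tests']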