google-adk 0.5.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. google/adk/agents/base_agent.py +76 -30
  2. google/adk/agents/callback_context.py +2 -6
  3. google/adk/agents/llm_agent.py +122 -30
  4. google/adk/agents/loop_agent.py +1 -1
  5. google/adk/agents/parallel_agent.py +7 -0
  6. google/adk/agents/readonly_context.py +8 -0
  7. google/adk/agents/run_config.py +1 -1
  8. google/adk/agents/sequential_agent.py +31 -0
  9. google/adk/agents/transcription_entry.py +4 -2
  10. google/adk/artifacts/gcs_artifact_service.py +1 -1
  11. google/adk/artifacts/in_memory_artifact_service.py +1 -1
  12. google/adk/auth/auth_credential.py +10 -2
  13. google/adk/auth/auth_preprocessor.py +7 -1
  14. google/adk/auth/auth_tool.py +3 -4
  15. google/adk/cli/agent_graph.py +5 -5
  16. google/adk/cli/browser/index.html +4 -4
  17. google/adk/cli/browser/{main-ULN5R5I5.js → main-PKDNKWJE.js} +59 -60
  18. google/adk/cli/browser/polyfills-B6TNHZQ6.js +17 -0
  19. google/adk/cli/cli.py +10 -9
  20. google/adk/cli/cli_deploy.py +7 -2
  21. google/adk/cli/cli_eval.py +109 -115
  22. google/adk/cli/cli_tools_click.py +179 -67
  23. google/adk/cli/fast_api.py +248 -197
  24. google/adk/cli/utils/agent_loader.py +137 -0
  25. google/adk/cli/utils/cleanup.py +40 -0
  26. google/adk/cli/utils/common.py +23 -0
  27. google/adk/cli/utils/evals.py +83 -0
  28. google/adk/cli/utils/logs.py +8 -5
  29. google/adk/code_executors/__init__.py +3 -1
  30. google/adk/code_executors/built_in_code_executor.py +52 -0
  31. google/adk/code_executors/code_execution_utils.py +2 -1
  32. google/adk/code_executors/container_code_executor.py +0 -1
  33. google/adk/code_executors/vertex_ai_code_executor.py +6 -8
  34. google/adk/evaluation/__init__.py +1 -1
  35. google/adk/evaluation/agent_evaluator.py +168 -128
  36. google/adk/evaluation/eval_case.py +104 -0
  37. google/adk/evaluation/eval_metrics.py +74 -0
  38. google/adk/evaluation/eval_result.py +86 -0
  39. google/adk/evaluation/eval_set.py +39 -0
  40. google/adk/evaluation/eval_set_results_manager.py +47 -0
  41. google/adk/evaluation/eval_sets_manager.py +43 -0
  42. google/adk/evaluation/evaluation_generator.py +88 -113
  43. google/adk/evaluation/evaluator.py +58 -0
  44. google/adk/evaluation/local_eval_set_results_manager.py +113 -0
  45. google/adk/evaluation/local_eval_sets_manager.py +264 -0
  46. google/adk/evaluation/response_evaluator.py +106 -1
  47. google/adk/evaluation/trajectory_evaluator.py +84 -2
  48. google/adk/events/event.py +6 -1
  49. google/adk/events/event_actions.py +6 -1
  50. google/adk/examples/base_example_provider.py +1 -0
  51. google/adk/examples/example_util.py +3 -2
  52. google/adk/flows/llm_flows/_code_execution.py +9 -1
  53. google/adk/flows/llm_flows/audio_transcriber.py +4 -3
  54. google/adk/flows/llm_flows/base_llm_flow.py +58 -21
  55. google/adk/flows/llm_flows/contents.py +3 -1
  56. google/adk/flows/llm_flows/functions.py +9 -8
  57. google/adk/flows/llm_flows/instructions.py +18 -80
  58. google/adk/flows/llm_flows/single_flow.py +2 -2
  59. google/adk/memory/__init__.py +1 -1
  60. google/adk/memory/_utils.py +23 -0
  61. google/adk/memory/base_memory_service.py +23 -21
  62. google/adk/memory/in_memory_memory_service.py +57 -25
  63. google/adk/memory/memory_entry.py +37 -0
  64. google/adk/memory/vertex_ai_rag_memory_service.py +38 -15
  65. google/adk/models/anthropic_llm.py +16 -9
  66. google/adk/models/base_llm.py +2 -1
  67. google/adk/models/base_llm_connection.py +2 -0
  68. google/adk/models/gemini_llm_connection.py +11 -11
  69. google/adk/models/google_llm.py +12 -2
  70. google/adk/models/lite_llm.py +80 -23
  71. google/adk/models/llm_response.py +16 -3
  72. google/adk/models/registry.py +1 -1
  73. google/adk/runners.py +98 -42
  74. google/adk/sessions/__init__.py +1 -1
  75. google/adk/sessions/_session_util.py +2 -1
  76. google/adk/sessions/base_session_service.py +6 -33
  77. google/adk/sessions/database_session_service.py +57 -67
  78. google/adk/sessions/in_memory_session_service.py +106 -24
  79. google/adk/sessions/session.py +3 -0
  80. google/adk/sessions/vertex_ai_session_service.py +44 -51
  81. google/adk/telemetry.py +7 -2
  82. google/adk/tools/__init__.py +4 -7
  83. google/adk/tools/_memory_entry_utils.py +30 -0
  84. google/adk/tools/agent_tool.py +10 -10
  85. google/adk/tools/apihub_tool/apihub_toolset.py +55 -74
  86. google/adk/tools/apihub_tool/clients/apihub_client.py +10 -3
  87. google/adk/tools/apihub_tool/clients/secret_client.py +1 -0
  88. google/adk/tools/application_integration_tool/application_integration_toolset.py +111 -85
  89. google/adk/tools/application_integration_tool/clients/connections_client.py +28 -1
  90. google/adk/tools/application_integration_tool/clients/integration_client.py +7 -5
  91. google/adk/tools/application_integration_tool/integration_connector_tool.py +69 -26
  92. google/adk/tools/base_toolset.py +96 -0
  93. google/adk/tools/bigquery/__init__.py +28 -0
  94. google/adk/tools/bigquery/bigquery_credentials.py +216 -0
  95. google/adk/tools/bigquery/bigquery_tool.py +116 -0
  96. google/adk/tools/{built_in_code_execution_tool.py → enterprise_search_tool.py} +17 -11
  97. google/adk/tools/function_parameter_parse_util.py +9 -2
  98. google/adk/tools/function_tool.py +33 -3
  99. google/adk/tools/get_user_choice_tool.py +1 -0
  100. google/adk/tools/google_api_tool/__init__.py +24 -70
  101. google/adk/tools/google_api_tool/google_api_tool.py +12 -6
  102. google/adk/tools/google_api_tool/{google_api_tool_set.py → google_api_toolset.py} +57 -55
  103. google/adk/tools/google_api_tool/google_api_toolsets.py +108 -0
  104. google/adk/tools/google_api_tool/googleapi_to_openapi_converter.py +40 -42
  105. google/adk/tools/google_search_tool.py +2 -2
  106. google/adk/tools/langchain_tool.py +96 -49
  107. google/adk/tools/load_memory_tool.py +14 -5
  108. google/adk/tools/mcp_tool/__init__.py +3 -2
  109. google/adk/tools/mcp_tool/conversion_utils.py +6 -2
  110. google/adk/tools/mcp_tool/mcp_session_manager.py +80 -69
  111. google/adk/tools/mcp_tool/mcp_tool.py +35 -32
  112. google/adk/tools/mcp_tool/mcp_toolset.py +99 -194
  113. google/adk/tools/openapi_tool/auth/credential_exchangers/base_credential_exchanger.py +1 -3
  114. google/adk/tools/openapi_tool/auth/credential_exchangers/service_account_exchanger.py +6 -7
  115. google/adk/tools/openapi_tool/common/common.py +5 -1
  116. google/adk/tools/openapi_tool/openapi_spec_parser/__init__.py +7 -2
  117. google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +27 -7
  118. google/adk/tools/openapi_tool/openapi_spec_parser/operation_parser.py +36 -32
  119. google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +11 -1
  120. google/adk/tools/openapi_tool/openapi_spec_parser/tool_auth_handler.py +1 -1
  121. google/adk/tools/preload_memory_tool.py +27 -18
  122. google/adk/tools/retrieval/__init__.py +1 -1
  123. google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +1 -1
  124. google/adk/tools/toolbox_toolset.py +107 -0
  125. google/adk/tools/transfer_to_agent_tool.py +0 -1
  126. google/adk/utils/__init__.py +13 -0
  127. google/adk/utils/instructions_utils.py +131 -0
  128. google/adk/version.py +1 -1
  129. {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/METADATA +18 -19
  130. google_adk-1.1.0.dist-info/RECORD +200 -0
  131. google/adk/agents/remote_agent.py +0 -50
  132. google/adk/cli/browser/polyfills-FFHMD2TL.js +0 -18
  133. google/adk/cli/fast_api.py.orig +0 -728
  134. google/adk/tools/google_api_tool/google_api_tool_sets.py +0 -112
  135. google/adk/tools/toolbox_tool.py +0 -46
  136. google_adk-0.5.0.dist-info/RECORD +0 -180
  137. {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/WHEEL +0 -0
  138. {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/entry_points.txt +0 -0
  139. {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/licenses/LICENSE +0 -0
--- a/google/adk/evaluation/agent_evaluator.py
+++ b/google/adk/evaluation/agent_evaluator.py
@@ -13,16 +13,30 @@
 # limitations under the License.
 
 import json
+import logging
 import os
 from os import path
+from typing import Any
 from typing import Dict
 from typing import List
+from typing import Optional
 from typing import Union
+import uuid
 
+from pydantic import ValidationError
+
+from .eval_set import EvalSet
 from .evaluation_generator import EvaluationGenerator
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .evaluator import Evaluator
+from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
 from .response_evaluator import ResponseEvaluator
 from .trajectory_evaluator import TrajectoryEvaluator
 
+logger = logging.getLogger("google_adk." + __name__)
+
+
 # Constants for default runs and evaluation criteria
 NUM_RUNS = 2
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
@@ -76,12 +90,67 @@ class AgentEvaluator:
     return DEFAULT_CRITERIA
 
   @staticmethod
-  async def evaluate(
-      agent_module,
-      eval_dataset_file_path_or_dir,
+  async def evaluate_eval_set(
+      agent_module: str,
+      eval_set: EvalSet,
+      criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
-      initial_session_file=None,
+  ):
+    """Evaluates an agent using the given EvalSet.
+
+    Args:
+      agent_module: The path to the python module that contains the definition
+        of the agent. By convention, the code looks for 'root_agent' in the
+        loaded module.
+      eval_set: The eval set.
+      criteria: Evaluation criteria, a dictionary of metric names to their
+        respective thresholds.
+      num_runs: Number of times all entries in the eval dataset should be
+        assessed.
+      agent_name: The name of the agent.
+    """
+    eval_case_responses_list = await EvaluationGenerator.generate_responses(
+        eval_set=eval_set,
+        agent_module_path=agent_module,
+        repeat_num=num_runs,
+        agent_name=agent_name,
+    )
+
+    for eval_case_responses in eval_case_responses_list:
+      actual_invocations = [
+          invocation
+          for invocations in eval_case_responses.responses
+          for invocation in invocations
+      ]
+      expected_invocations = (
+          eval_case_responses.eval_case.conversation * num_runs
+      )
+
+      for metric_name, threshold in criteria.items():
+        metric_evaluator = AgentEvaluator._get_metric_evaluator(
+            metric_name=metric_name, threshold=threshold
+        )
+
+        evaluation_result: EvaluationResult = (
+            metric_evaluator.evaluate_invocations(
+                actual_invocations=actual_invocations,
+                expected_invocations=expected_invocations,
+            )
+        )
+
+        assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
+            f"{metric_name} for {agent_module} failed. Expected {threshold},"
+            f" but got {evaluation_result.overall_score}."
+        )
+
+  @staticmethod
+  async def evaluate(
+      agent_module: str,
+      eval_dataset_file_path_or_dir: str,
+      num_runs: int = NUM_RUNS,
+      agent_name: Optional[str] = None,
+      initial_session_file: Optional[str] = None,
   ):
     """Evaluates an Agent given eval data.
 
@@ -109,35 +178,102 @@ class AgentEvaluator:
     else:
       test_files = [eval_dataset_file_path_or_dir]
 
-    initial_session_state = {}
-    if initial_session_file:
-      with open(initial_session_file, "r") as f:
-        initial_session_state = json.loads(f.read())["state"]
+    initial_session = AgentEvaluator._get_initial_session(initial_session_file)
 
     for test_file in test_files:
-      dataset = AgentEvaluator._load_dataset(test_file)[0]
       criteria = AgentEvaluator.find_config_for_test_file(test_file)
+      eval_set = AgentEvaluator._load_eval_set_from_file(
+          test_file, criteria, initial_session
+      )
 
-      AgentEvaluator._validate_input([dataset], criteria)
-
-      evaluation_response = await AgentEvaluator._generate_responses(
-          agent_module,
-          [dataset],
-          num_runs,
+      await AgentEvaluator.evaluate_eval_set(
+          agent_module=agent_module,
+          eval_set=eval_set,
+          criteria=criteria,
+          num_runs=num_runs,
           agent_name=agent_name,
-          initial_session={"state": initial_session_state},
       )
 
-      if AgentEvaluator._response_evaluation_required(criteria, [dataset]):
-        AgentEvaluator._evaluate_response_scores(
-            agent_module, evaluation_response, criteria
-        )
+  @staticmethod
+  def migrate_eval_data_to_new_schema(
+      old_eval_data_file: str,
+      new_eval_data_file: str,
+      initial_session_file: Optional[str] = None,
+  ):
+    """A utility for migrating eval data to the new schema backed by EvalSet."""
+    if not old_eval_data_file or not new_eval_data_file:
+      raise ValueError(
+          "One of old_eval_data_file or new_eval_data_file is empty."
+      )
+
+    criteria = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
+    initial_session = AgentEvaluator._get_initial_session(initial_session_file)
+
+    eval_set = AgentEvaluator._get_eval_set_from_old_format(
+        old_eval_data_file, criteria, initial_session
+    )
 
-      if AgentEvaluator._trajectory_evaluation_required(criteria, [dataset]):
-        AgentEvaluator._evaluate_tool_trajectory(
-            agent_module, evaluation_response, criteria
+    with open(new_eval_data_file, "w") as f:
+      f.write(eval_set.model_dump_json(indent=2))
+
+  @staticmethod
+  def _load_eval_set_from_file(
+      eval_set_file: str,
+      criteria: dict[str, float],
+      initial_session: dict[str, Any],
+  ) -> EvalSet:
+    """Loads an EvalSet from the given file."""
+    if os.path.isfile(eval_set_file):
+      with open(eval_set_file, "r", encoding="utf-8") as f:
+        content = f.read()
+
+      try:
+        eval_set = EvalSet.model_validate_json(content)
+        assert len(initial_session) == 0, (
+            "Initial session should be specified as a part of the EvalSet"
+            " file. An explicit initial session is only needed when"
+            " specifying data in the older schema."
+        )
+        return eval_set
+      except ValidationError:
+        # We assume that the eval data was specified in the old format.
+        logger.warning(
+            f"Contents of {eval_set_file} appear to be in an older format. To"
+            " avoid this warning, please update your test files to contain"
+            " data in the EvalSet schema. You can use"
+            " `migrate_eval_data_to_new_schema` for migrating old test files."
         )
 
+    # If we are here, the data must be specified in the older format.
+    return AgentEvaluator._get_eval_set_from_old_format(
+        eval_set_file, criteria, initial_session
+    )
+
+  @staticmethod
+  def _get_eval_set_from_old_format(
+      eval_set_file: str,
+      criteria: dict[str, float],
+      initial_session: dict[str, Any],
+  ) -> EvalSet:
+    data = AgentEvaluator._load_dataset(eval_set_file)[0]
+    AgentEvaluator._validate_input([data], criteria)
+    eval_data = {
+        "name": eval_set_file,
+        "data": data,
+        "initial_session": initial_session,
+    }
+    return convert_eval_set_to_pydanctic_schema(
+        eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data]
+    )
+
+  @staticmethod
+  def _get_initial_session(initial_session_file: Optional[str] = None):
+    initial_session = {}
+    if initial_session_file:
+      with open(initial_session_file, "r") as f:
+        initial_session = json.loads(f.read())
+    return initial_session
+
   @staticmethod
   def _load_dataset(
       input_data: Union[str, List[str], List[Dict], List[List[Dict]]],
@@ -221,109 +357,13 @@ class AgentEvaluator:
       )
 
   @staticmethod
-  def _get_infer_criteria(eval_dataset):
-    """Infers evaluation criteria based on the provided dataset.
-
-    Args:
-      eval_dataset (list): A list of evaluation samples.
-
-    Returns:
-      dict: Inferred evaluation criteria based on dataset fields.
-    """
-    inferred_criteria = {}
-    sample = eval_dataset[0][0]
-
-    if QUERY_COLUMN in sample and EXPECTED_TOOL_USE_COLUMN in sample:
-      inferred_criteria[TOOL_TRAJECTORY_SCORE_KEY] = DEFAULT_CRITERIA[
-          TOOL_TRAJECTORY_SCORE_KEY
-      ]
-
-    if QUERY_COLUMN in sample and REFERENCE_COLUMN in sample:
-      inferred_criteria[RESPONSE_MATCH_SCORE_KEY] = DEFAULT_CRITERIA[
-          RESPONSE_MATCH_SCORE_KEY
-      ]
-
-    return inferred_criteria
-
-  @staticmethod
-  async def _generate_responses(
-      agent_module, eval_dataset, num_runs, agent_name=None, initial_session={}
-  ):
-    """Generates evaluation responses by running the agent module multiple times."""
-    return EvaluationGenerator.generate_responses(
-        eval_dataset,
-        agent_module,
-        repeat_num=num_runs,
-        agent_name=agent_name,
-        initial_session=initial_session,
-    )
-
-  @staticmethod
-  def _generate_responses_from_session(eval_dataset, session_path):
-    """Generates evaluation responses by running the agent module multiple times."""
-    return EvaluationGenerator.generate_responses_from_session(
-        session_path, eval_dataset
-    )
-
-  @staticmethod
-  def _response_evaluation_required(criteria, eval_dataset):
-    """Checks if response evaluation are needed."""
-    return REFERENCE_COLUMN in eval_dataset[0][0] and any(
-        key in criteria
-        for key in [RESPONSE_EVALUATION_SCORE_KEY, RESPONSE_MATCH_SCORE_KEY]
-    )
-
-  @staticmethod
-  def _trajectory_evaluation_required(evaluation_criteria, eval_dataset):
-    """Checks if response evaluation are needed."""
-    return (
-        EXPECTED_TOOL_USE_COLUMN in eval_dataset[0][0]
-        and TOOL_TRAJECTORY_SCORE_KEY in evaluation_criteria
-    )
-
-  @staticmethod
-  def _evaluate_response_scores(agent_module, evaluation_response, criteria):
-    """Evaluates response scores and raises an assertion error if they don't meet the criteria."""
-    metrics = ResponseEvaluator.evaluate(
-        evaluation_response, criteria, print_detailed_results=True
-    )
-
-    AgentEvaluator._assert_score(
-        metrics,
-        "coherence/mean",
-        criteria.get(RESPONSE_EVALUATION_SCORE_KEY),
-        "Average response evaluation score",
-        agent_module,
-    )
-
-    AgentEvaluator._assert_score(
-        metrics,
-        "rouge_1/mean",
-        criteria.get(RESPONSE_MATCH_SCORE_KEY),
-        "Average response match score",
-        agent_module,
-    )
-
-  @staticmethod
-  def _evaluate_tool_trajectory(agent_module, evaluation_response, criteria):
-    """Evaluates tool trajectory scores and raises an assertion error if they don't meet the criteria."""
-    score = TrajectoryEvaluator.evaluate(
-        evaluation_response, print_detailed_results=True
-    )
-    AgentEvaluator._assert_score(
-        {TOOL_TRAJECTORY_SCORE_KEY: score},
-        TOOL_TRAJECTORY_SCORE_KEY,
-        criteria[TOOL_TRAJECTORY_SCORE_KEY],
-        "Average tool trajectory evaluation score",
-        agent_module,
-    )
+  def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
+      return TrajectoryEvaluator(threshold=threshold)
+    elif (
+        metric_name == RESPONSE_MATCH_SCORE_KEY
+        or metric_name == RESPONSE_EVALUATION_SCORE_KEY
+    ):
+      return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
 
-  @staticmethod
-  def _assert_score(metrics, metric_key, threshold, description, agent_module):
-    """Asserts that a metric meets the specified threshold."""
-    if metric_key in metrics:
-      actual_score = metrics[metric_key]
-      assert actual_score >= threshold, (
-          f"{description} for {agent_module} is lower than expected. "
-          f"Expected >= {threshold}, but got {actual_score}."
-      )
+    raise ValueError(f"Unsupported eval metric: {metric_name}")
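
Taken together, these hunks replace agent_evaluator.py's ad-hoc per-metric helpers with pluggable Evaluator objects selected by _get_metric_evaluator, and route all file loading through the Pydantic-backed EvalSet schema. A minimal usage sketch (module and file names below are illustrative, not part of the package):

    import asyncio

    from google.adk.evaluation.agent_evaluator import AgentEvaluator

    # One-time migration of a pre-1.0 test file to the new EvalSet schema.
    AgentEvaluator.migrate_eval_data_to_new_schema(
        old_eval_data_file="tests/simple.test.json",
        new_eval_data_file="tests/simple.evalset.json",
    )

    # Runs every eval case num_runs times and asserts that each configured
    # metric meets its threshold; the agent module is expected to expose
    # `root_agent`.
    asyncio.run(
        AgentEvaluator.evaluate(
            agent_module="my_app.agent",
            eval_dataset_file_path_or_dir="tests/simple.evalset.json",
            num_runs=2,
        )
    )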
--- /dev/null
+++ b/google/adk/evaluation/eval_case.py
@@ -0,0 +1,104 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any
+from typing import Optional
+from typing import Tuple
+
+from google.genai import types as genai_types
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+
+class EvalBaseModel(BaseModel):
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+
+class IntermediateData(EvalBaseModel):
+  """Container for intermediate data that an agent would generate as it responds with a final answer."""
+
+  tool_uses: list[genai_types.FunctionCall] = []
+  """Tool use trajectory in chronological order."""
+
+  intermediate_responses: list[Tuple[str, list[genai_types.Part]]] = []
+  """Intermediate responses generated by sub-agents to convey progress or status
+  in a multi-agent system, distinct from the final response.
+
+  This is expressed as a Tuple of:
+    - Author: Usually the sub-agent name that generated the intermediate
+      response.
+
+    - A list of Parts that comprise the response.
+  """
+
+
+class Invocation(EvalBaseModel):
+  """Represents a single invocation."""
+
+  invocation_id: str = ''
+  """Unique identifier for the invocation."""
+
+  user_content: genai_types.Content
+  """Content provided by the user in this invocation."""
+
+  final_response: Optional[genai_types.Content] = None
+  """Final response from the agent."""
+
+  intermediate_data: Optional[IntermediateData] = None
+  """Intermediate steps generated as a part of Agent execution.
+
+  For a multi-agent system, it is also helpful to inspect the route that
+  the agent took to generate the final response.
+  """
+
+  creation_timestamp: float = 0.0
+  """Timestamp for the current invocation, primarily intended for debugging purposes."""
+
+
+class SessionInput(EvalBaseModel):
+  """Values that help initialize a Session."""
+
+  app_name: str
+  """The name of the app."""
+
+  user_id: str
+  """The user id."""
+
+  state: dict[str, Any] = Field(default_factory=dict)
+  """The state of the session."""
+
+
+class EvalCase(EvalBaseModel):
+  """An eval case."""
+
+  eval_id: str
+  """Unique identifier for the evaluation case."""
+
+  conversation: list[Invocation]
+  """A conversation between the user and the Agent. The conversation can have any number of invocations."""
+
+  session_input: Optional[SessionInput] = None
+  """Session input that will be passed on to the Agent during eval.
+  It is common for an Agent's state to be initialized to some initial/default
+  value; for example, your agent may need to know today's date.
+  """
+
+  creation_timestamp: float = 0.0
+  """The time at which this eval case was created."""
--- /dev/null
+++ b/google/adk/evaluation/eval_metrics.py
@@ -0,0 +1,74 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+
+from .eval_case import Invocation
+from .evaluator import EvalStatus
+
+
+class EvalMetric(BaseModel):
+  """A metric used to evaluate a particular aspect of an eval case."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  metric_name: str
+  """The name of the metric."""
+
+  threshold: float
+  """A threshold value. Each metric decides how to interpret this threshold."""
+
+
+class EvalMetricResult(EvalMetric):
+  """The actual computed score/value of a particular EvalMetric."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  score: Optional[float] = None
+  eval_status: EvalStatus
+
+
+class EvalMetricResultPerInvocation(BaseModel):
+  """Eval metric results per invocation."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  actual_invocation: Invocation
+  """The actual invocation, usually obtained by inferencing the agent."""
+
+  expected_invocation: Invocation
+  """The expected invocation, usually the reference or golden invocation."""
+
+  eval_metric_results: list[EvalMetricResult] = []
+  """Eval results for each applicable metric."""
--- /dev/null
+++ b/google/adk/evaluation/eval_result.py
@@ -0,0 +1,86 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from ..sessions.session import Session
+from .eval_metrics import EvalMetric
+from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultPerInvocation
+from .evaluator import EvalStatus
+
+
+class EvalCaseResult(BaseModel):
+  """Case-level evaluation results."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  eval_set_file: str = Field(
+      deprecated=True,
+      description="This field is deprecated, use eval_set_id instead.",
+  )
+  eval_set_id: str = ""
+  """The eval set id."""
+
+  eval_id: str = ""
+  """The eval case id."""
+
+  final_eval_status: EvalStatus
+  """Final eval status for this eval case."""
+
+  eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
+      deprecated=True,
+      description=(
+          "This field is deprecated, use overall_eval_metric_results instead."
+      ),
+  )
+
+  overall_eval_metric_results: list[EvalMetricResult]
+  """Overall result for each metric for the entire eval case."""
+
+  eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation]
+  """Result for each metric on a per-invocation basis."""
+
+  session_id: str
+  """Session id of the session generated as a result of the inferencing/scraping stage of the eval."""
+
+  session_details: Optional[Session] = None
+  """Session generated as a result of the inferencing/scraping stage of the eval."""
+
+  user_id: Optional[str] = None
+  """User id used during the inferencing/scraping stage of the eval."""
+
+
+class EvalSetResult(BaseModel):
+  """Eval set level evaluation results."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+  eval_set_result_id: str
+  eval_set_result_name: str
+  eval_set_id: str
+  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
+  creation_timestamp: float = 0.0
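
EvalCaseResult retains the deprecated eval_set_file and eval_metric_results fields alongside their replacements, presumably for backward compatibility with pre-1.0 result consumers. A sketch of assembling the set-level container (values illustrative):

    import time
    import uuid

    from google.adk.evaluation.eval_result import EvalSetResult

    set_result = EvalSetResult(
        eval_set_result_id=str(uuid.uuid4()),
        eval_set_result_name="weather_agent_run",
        eval_set_id="weather_agent_eval_set",
        creation_timestamp=time.time(),
    )
    # eval_case_results defaults to an empty list; one EvalCaseResult is
    # appended per eval case after the inferencing and scoring stages.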
--- /dev/null
+++ b/google/adk/evaluation/eval_set.py
@@ -0,0 +1,39 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from pydantic import BaseModel
+
+from .eval_case import EvalCase
+
+
+class EvalSet(BaseModel):
+  """A set of eval cases."""
+
+  eval_set_id: str
+  """Unique identifier for the eval set."""
+
+  name: Optional[str] = None
+  """Name of the dataset."""
+
+  description: Optional[str] = None
+  """Description of the dataset."""
+
+  eval_cases: list[EvalCase]
+  """List of eval cases in the dataset. Each case represents a single
+  interaction to be evaluated."""
+
+  creation_timestamp: float = 0.0
+  """The time at which this eval set was created."""
--- /dev/null
+++ b/google/adk/evaluation/eval_set_results_manager.py
@@ -0,0 +1,47 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from abc import ABC
+from abc import abstractmethod
+
+from .eval_result import EvalCaseResult
+from .eval_result import EvalSetResult
+
+
+class EvalSetResultsManager(ABC):
+  """An interface to manage Eval Set Results."""
+
+  @abstractmethod
+  def save_eval_set_result(
+      self,
+      app_name: str,
+      eval_set_id: str,
+      eval_case_results: list[EvalCaseResult],
+  ) -> None:
+    """Creates and saves a new EvalSetResult given eval_case_results."""
+    raise NotImplementedError()
+
+  @abstractmethod
+  def get_eval_set_result(
+      self, app_name: str, eval_set_result_id: str
+  ) -> EvalSetResult:
+    """Returns an EvalSetResult identified by app_name and eval_set_result_id."""
+    raise NotImplementedError()
+
+  @abstractmethod
+  def list_eval_set_results(self, app_name: str) -> list[str]:
+    """Returns the eval result ids that belong to the given app_name."""
+    raise NotImplementedError()
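
local_eval_set_results_manager.py (file 44 above) appears to provide the file-backed implementation of this interface. As a sketch of the contract, an in-memory variant for tests could look like this (the class name and id scheme are hypothetical, not part of the package):

    from google.adk.evaluation.eval_result import EvalCaseResult
    from google.adk.evaluation.eval_result import EvalSetResult
    from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager


    class InMemoryEvalSetResultsManager(EvalSetResultsManager):
      """Keeps EvalSetResults in a dict; suitable for tests, not persistence."""

      def __init__(self):
        self._results_by_app: dict[str, dict[str, EvalSetResult]] = {}

      def save_eval_set_result(
          self,
          app_name: str,
          eval_set_id: str,
          eval_case_results: list[EvalCaseResult],
      ) -> None:
        app_results = self._results_by_app.setdefault(app_name, {})
        result_id = f"{app_name}_{eval_set_id}_{len(app_results)}"
        app_results[result_id] = EvalSetResult(
            eval_set_result_id=result_id,
            eval_set_result_name=result_id,
            eval_set_id=eval_set_id,
            eval_case_results=eval_case_results,
        )

      def get_eval_set_result(
          self, app_name: str, eval_set_result_id: str
      ) -> EvalSetResult:
        return self._results_by_app[app_name][eval_set_result_id]

      def list_eval_set_results(self, app_name: str) -> list[str]:
        return list(self._results_by_app.get(app_name, {}))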