google-adk 1.6.1__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Files changed (110)
  1. google/adk/a2a/converters/event_converter.py +5 -85
  2. google/adk/a2a/converters/request_converter.py +1 -2
  3. google/adk/a2a/executor/a2a_agent_executor.py +45 -16
  4. google/adk/a2a/logs/log_utils.py +1 -2
  5. google/adk/a2a/utils/__init__.py +0 -0
  6. google/adk/a2a/utils/agent_card_builder.py +544 -0
  7. google/adk/a2a/utils/agent_to_a2a.py +118 -0
  8. google/adk/agents/__init__.py +5 -0
  9. google/adk/agents/agent_config.py +46 -0
  10. google/adk/agents/base_agent.py +239 -41
  11. google/adk/agents/callback_context.py +41 -0
  12. google/adk/agents/common_configs.py +79 -0
  13. google/adk/agents/config_agent_utils.py +184 -0
  14. google/adk/agents/config_schemas/AgentConfig.json +566 -0
  15. google/adk/agents/invocation_context.py +5 -1
  16. google/adk/agents/live_request_queue.py +15 -0
  17. google/adk/agents/llm_agent.py +201 -9
  18. google/adk/agents/loop_agent.py +35 -1
  19. google/adk/agents/parallel_agent.py +24 -3
  20. google/adk/agents/remote_a2a_agent.py +17 -5
  21. google/adk/agents/sequential_agent.py +22 -1
  22. google/adk/artifacts/gcs_artifact_service.py +110 -20
  23. google/adk/auth/auth_handler.py +3 -3
  24. google/adk/auth/credential_manager.py +23 -23
  25. google/adk/auth/credential_service/base_credential_service.py +6 -6
  26. google/adk/auth/credential_service/in_memory_credential_service.py +10 -8
  27. google/adk/auth/credential_service/session_state_credential_service.py +8 -8
  28. google/adk/auth/exchanger/oauth2_credential_exchanger.py +3 -3
  29. google/adk/auth/oauth2_credential_util.py +2 -2
  30. google/adk/auth/refresher/oauth2_credential_refresher.py +4 -4
  31. google/adk/cli/agent_graph.py +3 -1
  32. google/adk/cli/browser/index.html +2 -2
  33. google/adk/cli/browser/main-W7QZBYAR.js +3914 -0
  34. google/adk/cli/browser/polyfills-B6TNHZQ6.js +17 -0
  35. google/adk/cli/cli_eval.py +87 -12
  36. google/adk/cli/cli_tools_click.py +143 -82
  37. google/adk/cli/fast_api.py +150 -69
  38. google/adk/cli/utils/agent_loader.py +35 -1
  39. google/adk/code_executors/base_code_executor.py +14 -19
  40. google/adk/code_executors/built_in_code_executor.py +4 -1
  41. google/adk/evaluation/base_eval_service.py +46 -2
  42. google/adk/evaluation/eval_metrics.py +4 -0
  43. google/adk/evaluation/eval_sets_manager.py +5 -1
  44. google/adk/evaluation/evaluation_generator.py +1 -1
  45. google/adk/evaluation/final_response_match_v2.py +2 -2
  46. google/adk/evaluation/gcs_eval_sets_manager.py +2 -1
  47. google/adk/evaluation/in_memory_eval_sets_manager.py +151 -0
  48. google/adk/evaluation/local_eval_service.py +389 -0
  49. google/adk/evaluation/local_eval_set_results_manager.py +2 -2
  50. google/adk/evaluation/local_eval_sets_manager.py +24 -9
  51. google/adk/evaluation/metric_evaluator_registry.py +16 -6
  52. google/adk/evaluation/vertex_ai_eval_facade.py +7 -1
  53. google/adk/events/event.py +7 -2
  54. google/adk/flows/llm_flows/auto_flow.py +6 -11
  55. google/adk/flows/llm_flows/base_llm_flow.py +66 -29
  56. google/adk/flows/llm_flows/contents.py +16 -10
  57. google/adk/flows/llm_flows/functions.py +89 -52
  58. google/adk/memory/in_memory_memory_service.py +21 -15
  59. google/adk/memory/vertex_ai_memory_bank_service.py +12 -10
  60. google/adk/models/anthropic_llm.py +46 -6
  61. google/adk/models/base_llm_connection.py +2 -0
  62. google/adk/models/gemini_llm_connection.py +17 -6
  63. google/adk/models/google_llm.py +46 -11
  64. google/adk/models/lite_llm.py +52 -22
  65. google/adk/plugins/__init__.py +17 -0
  66. google/adk/plugins/base_plugin.py +317 -0
  67. google/adk/plugins/plugin_manager.py +265 -0
  68. google/adk/runners.py +122 -18
  69. google/adk/sessions/database_session_service.py +51 -52
  70. google/adk/sessions/vertex_ai_session_service.py +27 -12
  71. google/adk/tools/__init__.py +2 -0
  72. google/adk/tools/_automatic_function_calling_util.py +20 -2
  73. google/adk/tools/agent_tool.py +15 -3
  74. google/adk/tools/apihub_tool/apihub_toolset.py +38 -39
  75. google/adk/tools/application_integration_tool/application_integration_toolset.py +35 -37
  76. google/adk/tools/application_integration_tool/integration_connector_tool.py +2 -3
  77. google/adk/tools/base_tool.py +9 -9
  78. google/adk/tools/base_toolset.py +29 -5
  79. google/adk/tools/bigquery/__init__.py +3 -3
  80. google/adk/tools/bigquery/metadata_tool.py +2 -0
  81. google/adk/tools/bigquery/query_tool.py +15 -1
  82. google/adk/tools/computer_use/__init__.py +13 -0
  83. google/adk/tools/computer_use/base_computer.py +265 -0
  84. google/adk/tools/computer_use/computer_use_tool.py +166 -0
  85. google/adk/tools/computer_use/computer_use_toolset.py +220 -0
  86. google/adk/tools/enterprise_search_tool.py +4 -2
  87. google/adk/tools/exit_loop_tool.py +1 -0
  88. google/adk/tools/google_api_tool/google_api_tool.py +16 -1
  89. google/adk/tools/google_api_tool/google_api_toolset.py +9 -7
  90. google/adk/tools/google_api_tool/google_api_toolsets.py +41 -20
  91. google/adk/tools/google_search_tool.py +4 -2
  92. google/adk/tools/langchain_tool.py +16 -6
  93. google/adk/tools/long_running_tool.py +21 -0
  94. google/adk/tools/mcp_tool/mcp_toolset.py +27 -28
  95. google/adk/tools/openapi_tool/openapi_spec_parser/openapi_spec_parser.py +5 -0
  96. google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +8 -8
  97. google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +4 -6
  98. google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +3 -2
  99. google/adk/tools/tool_context.py +0 -10
  100. google/adk/tools/url_context_tool.py +4 -2
  101. google/adk/tools/vertex_ai_search_tool.py +4 -2
  102. google/adk/utils/model_name_utils.py +90 -0
  103. google/adk/version.py +1 -1
  104. {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/METADATA +3 -2
  105. {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/RECORD +108 -91
  106. google/adk/cli/browser/main-RXDVX3K6.js +0 -3914
  107. google/adk/cli/browser/polyfills-FFHMD2TL.js +0 -17
  108. {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/WHEEL +0 -0
  109. {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/entry_points.txt +0 -0
  110. {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/licenses/LICENSE +0 -0
--- a/google/adk/evaluation/final_response_match_v2.py
+++ b/google/adk/evaluation/final_response_match_v2.py
@@ -21,7 +21,7 @@ from typing import Optional
 from typing_extensions import override
 
 from ..models.llm_response import LlmResponse
-from ..utils.feature_decorator import working_in_progress
+from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
   return label
 
 
-@working_in_progress
+@experimental
 class FinalResponseMatchV2Evaluator(LlmAsJudge):
   """V2 final response match evaluator which uses an LLM to judge responses.
 
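Both hunks above swap the gating decorator on FinalResponseMatchV2Evaluator from working_in_progress to experimental, i.e. the evaluator is now exposed as a public but unstable API rather than an internal work-in-progress one. As a rough sketch of how a class decorator of this kind can be implemented (an illustration only; the actual google.adk feature_decorator may differ in signature and behavior):

import functools
import warnings


def experimental(cls):
  """Warns on instantiation that a class is an unstable, experimental API."""
  original_init = cls.__init__

  @functools.wraps(original_init)
  def wrapped_init(self, *args, **kwargs):
    warnings.warn(
        f'{cls.__name__} is experimental and may change or be removed in'
        ' future releases.',
        UserWarning,
        stacklevel=2,
    )
    original_init(self, *args, **kwargs)

  cls.__init__ = wrapped_init
  return cls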
--- a/google/adk/evaluation/gcs_eval_sets_manager.py
+++ b/google/adk/evaluation/gcs_eval_sets_manager.py
@@ -23,6 +23,7 @@ from google.cloud import exceptions as cloud_exceptions
 from google.cloud import storage
 from typing_extensions import override
 
+from ..errors.not_found_error import NotFoundError
 from ._eval_sets_manager_utils import add_eval_case_to_eval_set
 from ._eval_sets_manager_utils import delete_eval_case_from_eval_set
 from ._eval_sets_manager_utils import get_eval_case_from_eval_set
@@ -130,7 +131,7 @@ class GcsEvalSetsManager(EvalSetsManager):
         eval_sets.append(eval_set_id)
       return sorted(eval_sets)
     except cloud_exceptions.NotFound as e:
-      raise ValueError(
+      raise NotFoundError(
           f"App `{app_name}` not found in GCS bucket `{self.bucket_name}`."
       ) from e
 
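With this change, a missing app surfaces as the library's typed NotFoundError rather than a generic ValueError, so callers can distinguish "no eval data yet" from bad input. A hypothetical caller (the GcsEvalSetsManager constructor argument below is an assumption inferred from the self.bucket_name reference above; it is not shown in this diff):

from google.adk.errors.not_found_error import NotFoundError
from google.adk.evaluation.gcs_eval_sets_manager import GcsEvalSetsManager

# bucket_name is an assumed constructor argument.
manager = GcsEvalSetsManager(bucket_name='my-eval-bucket')

try:
  eval_set_ids = manager.list_eval_sets('my_app')
except NotFoundError:
  eval_set_ids = []  # The app has no eval data in the bucket yet.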
--- /dev/null
+++ b/google/adk/evaluation/in_memory_eval_sets_manager.py
@@ -0,0 +1,151 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import time
+from typing import Optional
+
+from typing_extensions import override
+
+from ..errors.not_found_error import NotFoundError
+from .eval_case import EvalCase
+from .eval_set import EvalSet
+from .eval_sets_manager import EvalSetsManager
+
+
+class InMemoryEvalSetsManager(EvalSetsManager):
+  """An in-memory implementation of EvalSetsManager using dictionaries.
+
+  You can use this class:
+  1) As a part of your testcase.
+  2) For cases where other implementations of EvalSetsManager are too expensive
+     to use.
+  """
+
+  def __init__(self):
+    # {app_name: {eval_set_id: EvalSet}}
+    self._eval_sets: dict[str, dict[str, EvalSet]] = {}
+    # {app_name: {eval_set_id: {eval_case_id: EvalCase}}}
+    self._eval_cases: dict[str, dict[str, dict[str, EvalCase]]] = {}
+
+  def _ensure_app_exists(self, app_name: str):
+    if app_name not in self._eval_sets:
+      self._eval_sets[app_name] = {}
+      self._eval_cases[app_name] = {}
+
+  @override
+  def get_eval_set(self, app_name: str, eval_set_id: str) -> Optional[EvalSet]:
+    self._ensure_app_exists(app_name)
+    return self._eval_sets[app_name].get(eval_set_id, None)
+
+  @override
+  def create_eval_set(self, app_name: str, eval_set_id: str):
+    self._ensure_app_exists(app_name)
+    if eval_set_id in self._eval_sets[app_name]:
+      raise ValueError(
+          f"EvalSet {eval_set_id} already exists for app {app_name}."
+      )
+
+    new_eval_set = EvalSet(
+        eval_set_id=eval_set_id,
+        eval_cases=[],
+        creation_timestamp=time.time(),
+    )
+    self._eval_sets[app_name][eval_set_id] = new_eval_set
+    self._eval_cases[app_name][eval_set_id] = {}
+
+  @override
+  def list_eval_sets(self, app_name: str) -> list[str]:
+    if app_name not in self._eval_sets:
+      return []
+
+    return list(self._eval_sets[app_name].keys())
+
+  @override
+  def get_eval_case(
+      self, app_name: str, eval_set_id: str, eval_case_id: str
+  ) -> Optional[EvalCase]:
+    if app_name not in self._eval_cases:
+      return None
+    if eval_set_id not in self._eval_cases[app_name]:
+      return None
+    return self._eval_cases[app_name][eval_set_id].get(eval_case_id)
+
+  @override
+  def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
+    self._ensure_app_exists(app_name)
+    if eval_set_id not in self._eval_sets[app_name]:
+      raise NotFoundError(
+          f"EvalSet {eval_set_id} not found for app {app_name}."
+      )
+    if eval_case.eval_id in self._eval_cases[app_name][eval_set_id]:
+      raise ValueError(
+          f"EvalCase {eval_case.eval_id} already exists in EvalSet"
+          f" {eval_set_id} for app {app_name}."
+      )
+
+    self._eval_cases[app_name][eval_set_id][eval_case.eval_id] = eval_case
+    # Also update the list in the EvalSet object
+    self._eval_sets[app_name][eval_set_id].eval_cases.append(eval_case)
+
+  @override
+  def update_eval_case(
+      self, app_name: str, eval_set_id: str, updated_eval_case: EvalCase
+  ):
+    self._ensure_app_exists(app_name)
+    if eval_set_id not in self._eval_sets[app_name]:
+      raise NotFoundError(
+          f"EvalSet {eval_set_id} not found for app {app_name}."
+      )
+    if updated_eval_case.eval_id not in self._eval_cases[app_name][eval_set_id]:
+      raise NotFoundError(
+          f"EvalCase {updated_eval_case.eval_id} not found in EvalSet"
+          f" {eval_set_id} for app {app_name}."
+      )
+
+    # Full replace
+    self._eval_cases[app_name][eval_set_id][
+        updated_eval_case.eval_id
+    ] = updated_eval_case
+
+    # Update the list in the EvalSet object
+    eval_set = self._eval_sets[app_name][eval_set_id]
+    for i, case in enumerate(eval_set.eval_cases):
+      if case.eval_id == updated_eval_case.eval_id:
+        eval_set.eval_cases[i] = updated_eval_case
+        break
+
+  @override
+  def delete_eval_case(
+      self, app_name: str, eval_set_id: str, eval_case_id: str
+  ):
+    self._ensure_app_exists(app_name)
+    if eval_set_id not in self._eval_sets[app_name]:
+      raise NotFoundError(
+          f"EvalSet {eval_set_id} not found for app {app_name}."
+      )
+    if eval_case_id not in self._eval_cases[app_name][eval_set_id]:
+      raise NotFoundError(
+          f"EvalCase {eval_case_id} not found in EvalSet {eval_set_id}"
+          f" for app {app_name}."
+      )
+
+    del self._eval_cases[app_name][eval_set_id][eval_case_id]
+
+    # Remove from the list in the EvalSet object
+    eval_set = self._eval_sets[app_name][eval_set_id]
+    eval_set.eval_cases = [
+        case for case in eval_set.eval_cases if case.eval_id != eval_case_id
+    ]
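Because InMemoryEvalSetsManager is fully self-contained, it slots directly into tests. A minimal sketch against the API shown above (EvalCase construction is elided, since its full schema is not part of this diff):

from google.adk.errors.not_found_error import NotFoundError
from google.adk.evaluation.in_memory_eval_sets_manager import InMemoryEvalSetsManager


def test_create_and_list_eval_sets():
  manager = InMemoryEvalSetsManager()
  manager.create_eval_set('my_app', 'regression')

  assert manager.list_eval_sets('my_app') == ['regression']
  assert manager.get_eval_set('my_app', 'regression') is not None

  # Operations on an unknown eval set raise the typed NotFoundError.
  try:
    manager.delete_eval_case('my_app', 'no_such_set', 'case_1')
    assert False, 'expected NotFoundError'
  except NotFoundError:
    pass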
--- /dev/null
+++ b/google/adk/evaluation/local_eval_service.py
@@ -0,0 +1,389 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import asyncio
+import inspect
+import logging
+from typing import AsyncGenerator
+from typing import Callable
+from typing import Optional
+import uuid
+
+from typing_extensions import override
+
+from ..agents import BaseAgent
+from ..artifacts.base_artifact_service import BaseArtifactService
+from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
+from ..errors.not_found_error import NotFoundError
+from ..sessions.base_session_service import BaseSessionService
+from ..sessions.in_memory_session_service import InMemorySessionService
+from ..utils.feature_decorator import experimental
+from .base_eval_service import BaseEvalService
+from .base_eval_service import EvaluateConfig
+from .base_eval_service import EvaluateRequest
+from .base_eval_service import InferenceRequest
+from .base_eval_service import InferenceResult
+from .base_eval_service import InferenceStatus
+from .eval_case import Invocation
+from .eval_metrics import EvalMetric
+from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultPerInvocation
+from .eval_result import EvalCaseResult
+from .eval_set import EvalCase
+from .eval_set_results_manager import EvalSetResultsManager
+from .eval_sets_manager import EvalSetsManager
+from .evaluation_generator import EvaluationGenerator
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
+from .metric_evaluator_registry import MetricEvaluatorRegistry
+
+logger = logging.getLogger('google_adk.' + __name__)
+
+EVAL_SESSION_ID_PREFIX = '___eval___session___'
+
+
+def _get_session_id() -> str:
+  return f'{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}'
+
+
+@experimental
+class LocalEvalService(BaseEvalService):
+  """An implementation of BaseEvalService that runs the evals locally."""
+
+  def __init__(
+      self,
+      root_agent: BaseAgent,
+      eval_sets_manager: EvalSetsManager,
+      metric_evaluator_registry: MetricEvaluatorRegistry = DEFAULT_METRIC_EVALUATOR_REGISTRY,
+      session_service: BaseSessionService = InMemorySessionService(),
+      artifact_service: BaseArtifactService = InMemoryArtifactService(),
+      eval_set_results_manager: Optional[EvalSetResultsManager] = None,
+      session_id_supplier: Callable[[], str] = _get_session_id,
+  ):
+    self._root_agent = root_agent
+    self._eval_sets_manager = eval_sets_manager
+    self._metric_evaluator_registry = metric_evaluator_registry
+    self._session_service = session_service
+    self._artifact_service = artifact_service
+    self._eval_set_results_manager = eval_set_results_manager
+    self._session_id_supplier = session_id_supplier
+
+  @override
+  async def perform_inference(
+      self,
+      inference_request: InferenceRequest,
+  ) -> AsyncGenerator[InferenceResult, None]:
+    """Yields InferenceResults obtained from the agent as and when they are available.
+
+    Args:
+      inference_request: The request for generating inferences.
+    """
+    # Get the eval set from the storage.
+    eval_set = self._eval_sets_manager.get_eval_set(
+        app_name=inference_request.app_name,
+        eval_set_id=inference_request.eval_set_id,
+    )
+
+    if not eval_set:
+      raise NotFoundError(
+          f'Eval set with id {inference_request.eval_set_id} not found for app'
+          f' {inference_request.app_name}'
+      )
+
+    # Select eval cases for which we need to run inferencing. If the inference
+    # request specified eval cases, then we use only those.
+    eval_cases = eval_set.eval_cases
+    if inference_request.eval_case_ids:
+      eval_cases = [
+          eval_case
+          for eval_case in eval_cases
+          if eval_case.eval_id in inference_request.eval_case_ids
+      ]
+
+    root_agent = self._root_agent.clone()
+
+    semaphore = asyncio.Semaphore(
+        value=inference_request.inference_config.parallelism
+    )
+
+    async def run_inference(eval_case):
+      async with semaphore:
+        return await self._perform_inference_single_eval_item(
+            app_name=inference_request.app_name,
+            eval_set_id=inference_request.eval_set_id,
+            eval_case=eval_case,
+            root_agent=root_agent,
+        )
+
+    inference_results = [run_inference(eval_case) for eval_case in eval_cases]
+    for inference_result in asyncio.as_completed(inference_results):
+      yield await inference_result
+
+  @override
+  async def evaluate(
+      self,
+      evaluate_request: EvaluateRequest,
+  ) -> AsyncGenerator[EvalCaseResult, None]:
+    """Yields an EvalCaseResult for each item as and when they are available.
+
+    Args:
+      evaluate_request: The request to perform metric evaluations on the
+        inferences.
+    """
+    semaphore = asyncio.Semaphore(
+        value=evaluate_request.evaluate_config.parallelism
+    )
+
+    async def run_evaluation(inference_result):
+      async with semaphore:
+        return await self._evaluate_single_inference_result(
+            inference_result=inference_result,
+            evaluate_config=evaluate_request.evaluate_config,
+        )
+
+    evaluation_tasks = [
+        run_evaluation(inference_result)
+        for inference_result in evaluate_request.inference_results
+    ]
+
+    for evaluation_task in asyncio.as_completed(evaluation_tasks):
+      inference_result, eval_case_result = await evaluation_task
+
+      if self._eval_set_results_manager:
+        self._eval_set_results_manager.save_eval_set_result(
+            app_name=inference_result.app_name,
+            eval_set_id=inference_result.eval_set_id,
+            eval_case_results=[eval_case_result],
+        )
+
+      yield eval_case_result
+
+  async def _evaluate_single_inference_result(
+      self, inference_result: InferenceResult, evaluate_config: EvaluateConfig
+  ) -> tuple[InferenceResult, EvalCaseResult]:
+    """Returns an EvalCaseResult for the given inference result.
+
+    A single inference result can have multiple invocations. For each
+    invocation, this method evaluates the metrics present in evaluate config.
+
+    The EvalCaseResult contains scores for each metric per invocation and the
+    overall score.
+    """
+    eval_case = self._eval_sets_manager.get_eval_case(
+        app_name=inference_result.app_name,
+        eval_set_id=inference_result.eval_set_id,
+        eval_case_id=inference_result.eval_case_id,
+    )
+
+    if eval_case is None:
+      raise NotFoundError(
+          f'Eval case with id {inference_result.eval_case_id} not found for'
+          f' app {inference_result.app_name} and eval set'
+          f' {inference_result.eval_set_id}.'
+      )
+
+    # Metric results for each invocation
+    eval_metric_result_per_invocation = []
+
+    # We also keep track of the overall score for a metric, derived from all
+    # invocations. For example, if we were tracking a metric that compares
+    # how well the final response matches a golden answer, then each
+    # invocation will have a value for this metric. We will also have an
+    # overall score using an aggregation strategy across all invocations. This
+    # would be the score for the eval case.
+    overall_eval_metric_results = []
+
+    if len(inference_result.inferences) != len(eval_case.conversation):
+      raise ValueError(
+          'Inferences should match conversations in eval case. Found'
+          f' {len(inference_result.inferences)} inferences and'
+          f' {len(eval_case.conversation)} conversations in the eval case.'
+      )
+
+    # Pre-create the EvalMetricResult entries for each invocation.
+    for actual, expected in zip(
+        inference_result.inferences, eval_case.conversation
+    ):
+      eval_metric_result_per_invocation.append(
+          EvalMetricResultPerInvocation(
+              actual_invocation=actual,
+              expected_invocation=expected,
+              # We will fill this as we evaluate each metric per invocation.
+              eval_metric_results=[],
+          )
+      )
+
+    for eval_metric in evaluate_config.eval_metrics:
+      # Perform evaluation of the metric.
+      evaluation_result = await self._evaluate_metric(
+          eval_metric=eval_metric,
+          actual_invocations=inference_result.inferences,
+          expected_invocations=eval_case.conversation,
+      )
+
+      # Track the overall score across all invocations.
+      overall_eval_metric_results.append(
+          EvalMetricResult(
+              metric_name=eval_metric.metric_name,
+              threshold=eval_metric.threshold,
+              score=evaluation_result.overall_score,
+              eval_status=evaluation_result.overall_eval_status,
+          )
+      )
+
+      if len(evaluation_result.per_invocation_results) != len(
+          eval_metric_result_per_invocation
+      ):
+        raise ValueError(
+            'Eval metric should return results for each invocation. Found '
+            f'{len(evaluation_result.per_invocation_results)} results for '
+            f'{len(eval_metric_result_per_invocation)} invocations.'
+        )
+
+      # Track the score across individual invocations.
+      for invocation_result, invocation in zip(
+          evaluation_result.per_invocation_results,
+          eval_metric_result_per_invocation,
+      ):
+        invocation.eval_metric_results.append(
+            EvalMetricResult(
+                metric_name=eval_metric.metric_name,
+                threshold=eval_metric.threshold,
+                score=invocation_result.score,
+                eval_status=invocation_result.eval_status,
+            )
+        )
+
+    final_eval_status = self._generate_final_eval_status(
+        overall_eval_metric_results
+    )
+    user_id = (
+        eval_case.session_input.user_id
+        if eval_case.session_input and eval_case.session_input.user_id
+        else 'test_user_id'
+    )
+
+    eval_case_result = EvalCaseResult(
+        eval_set_file=inference_result.eval_set_id,
+        eval_set_id=inference_result.eval_set_id,
+        eval_id=inference_result.eval_case_id,
+        final_eval_status=final_eval_status,
+        overall_eval_metric_results=overall_eval_metric_results,
+        eval_metric_result_per_invocation=eval_metric_result_per_invocation,
+        session_id=inference_result.session_id,
+        session_details=await self._session_service.get_session(
+            app_name=inference_result.app_name,
+            user_id=user_id,
+            session_id=inference_result.session_id,
+        ),
+        user_id=user_id,
+    )
+
+    return (inference_result, eval_case_result)
+
+  async def _evaluate_metric(
+      self,
+      eval_metric: EvalMetric,
+      actual_invocations: list[Invocation],
+      expected_invocations: list[Invocation],
+  ) -> EvaluationResult:
+    """Returns the EvaluationResult obtained from evaluating a metric using an Evaluator."""
+
+    # Get the metric evaluator from the registry.
+    metric_evaluator = self._metric_evaluator_registry.get_evaluator(
+        eval_metric=eval_metric
+    )
+
+    if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+      # Some evaluators could be async, for example those that use an LLM as
+      # a judge, so we need to make sure that we wait on them.
+      return await metric_evaluator.evaluate_invocations(
+          actual_invocations=actual_invocations,
+          expected_invocations=expected_invocations,
+      )
+    else:
+      # Metrics that perform computation synchronously; mostly these don't
+      # perform any I/O. An example would be calculation of the rouge_1 score.
+      return metric_evaluator.evaluate_invocations(
+          actual_invocations=actual_invocations,
+          expected_invocations=expected_invocations,
+      )
+
+  def _generate_final_eval_status(
+      self, overall_eval_metric_results: list[EvalMetricResult]
+  ) -> EvalStatus:
+    final_eval_status = EvalStatus.NOT_EVALUATED
+    # Go over all the eval statuses and mark the final eval status as
+    # passed if all of them pass; otherwise mark the final eval status as
+    # failed.
+    for overall_eval_metric_result in overall_eval_metric_results:
+      overall_eval_status = overall_eval_metric_result.eval_status
+      if overall_eval_status == EvalStatus.PASSED:
+        final_eval_status = EvalStatus.PASSED
+      elif overall_eval_status == EvalStatus.NOT_EVALUATED:
+        continue
+      elif overall_eval_status == EvalStatus.FAILED:
+        final_eval_status = EvalStatus.FAILED
+        break
+      else:
+        raise ValueError(f'Unknown eval status: {overall_eval_status}.')
+
+    return final_eval_status
+
+  async def _perform_inference_single_eval_item(
+      self,
+      app_name: str,
+      eval_set_id: str,
+      eval_case: EvalCase,
+      root_agent: BaseAgent,
+  ) -> InferenceResult:
+    initial_session = eval_case.session_input
+    session_id = self._session_id_supplier()
+    inference_result = InferenceResult(
+        app_name=app_name,
+        eval_set_id=eval_set_id,
+        eval_case_id=eval_case.eval_id,
+        session_id=session_id,
+    )
+
+    try:
+      inferences = (
+          await EvaluationGenerator._generate_inferences_from_root_agent(
+              invocations=eval_case.conversation,
+              root_agent=root_agent,
+              initial_session=initial_session,
+              session_id=session_id,
+              session_service=self._session_service,
+              artifact_service=self._artifact_service,
+          )
+      )
+
+      inference_result.inferences = inferences
+      inference_result.status = InferenceStatus.SUCCESS
+
+      return inference_result
+    except Exception as e:
+      # We intentionally catch the Exception, as we don't want failures to
+      # affect other inferences.
+      logger.error(
+          'Inference failed for eval case `%s` with error %s',
+          eval_case.eval_id,
+          e,
+      )
+      inference_result.status = InferenceStatus.FAILURE
+      inference_result.error_message = str(e)
+      return inference_result
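A usage sketch for the new service. The request/config classes come from base_eval_service, whose exact field names are not shown in this diff; the fields used below (app_name, eval_set_id, inference_config.parallelism, eval_metrics, inference_results) mirror the attributes the service reads in the hunk above, and InferenceConfig in particular is an assumed name:

from google.adk.evaluation.base_eval_service import EvaluateConfig
from google.adk.evaluation.base_eval_service import EvaluateRequest
from google.adk.evaluation.base_eval_service import InferenceConfig  # assumed
from google.adk.evaluation.base_eval_service import InferenceRequest
from google.adk.evaluation.in_memory_eval_sets_manager import InMemoryEvalSetsManager
from google.adk.evaluation.local_eval_service import LocalEvalService


async def run_evals(root_agent, eval_metrics):
  eval_sets_manager = InMemoryEvalSetsManager()
  eval_sets_manager.create_eval_set('my_app', 'smoke_tests')
  # ... populate with eval_sets_manager.add_eval_case(...) ...

  service = LocalEvalService(
      root_agent=root_agent,
      eval_sets_manager=eval_sets_manager,
  )

  # Phase 1: run the agent over every eval case to collect inferences.
  inference_results = []
  async for inference_result in service.perform_inference(
      InferenceRequest(
          app_name='my_app',
          eval_set_id='smoke_tests',
          inference_config=InferenceConfig(parallelism=4),  # assumed shape
      )
  ):
    inference_results.append(inference_result)

  # Phase 2: score the collected inferences against the eval cases.
  async for eval_case_result in service.evaluate(
      EvaluateRequest(
          inference_results=inference_results,
          evaluate_config=EvaluateConfig(
              eval_metrics=eval_metrics,
              parallelism=4,  # assumed field
          ),
      )
  ):
    print(eval_case_result.eval_id, eval_case_result.final_eval_status)

# Entry point, with root agent and metrics supplied by your app:
# asyncio.run(run_evals(my_root_agent, my_eval_metrics))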
--- a/google/adk/evaluation/local_eval_set_results_manager.py
+++ b/google/adk/evaluation/local_eval_set_results_manager.py
@@ -60,7 +60,7 @@ class LocalEvalSetResultsManager(EvalSetResultsManager):
         eval_set_result.eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION,
     )
     logger.info("Writing eval result to file: %s", eval_set_result_file_path)
-    with open(eval_set_result_file_path, "w") as f:
+    with open(eval_set_result_file_path, "w", encoding="utf-8") as f:
       f.write(json.dumps(eval_set_result_json, indent=2))
 
   @override
@@ -78,7 +78,7 @@ class LocalEvalSetResultsManager(EvalSetResultsManager):
     )
     if not os.path.exists(maybe_eval_result_file_path):
       raise NotFoundError(f"Eval set result `{eval_set_result_id}` not found.")
-    with open(maybe_eval_result_file_path, "r") as file:
+    with open(maybe_eval_result_file_path, "r", encoding="utf-8") as file:
       eval_result_data = json.load(file)
     return EvalSetResult.model_validate_json(eval_result_data)
 
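Both hunks make the same portability fix: without an explicit encoding, open() falls back to the platform's locale encoding, which on some Windows configurations is a legacy code page, so eval results written on one machine could fail to round-trip on another. A quick demonstration of the default:

import locale

# The codec open() uses when no encoding is passed: commonly 'cp1252' on
# Windows and 'UTF-8' on Linux/macOS.
print(locale.getpreferredencoding(False))

# Pinning UTF-8 keeps the written file portable (file name illustrative only):
with open('eval_result.json', 'w', encoding='utf-8') as f:
  f.write('{"verdict": "\u2713"}')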
--- a/google/adk/evaluation/local_eval_sets_manager.py
+++ b/google/adk/evaluation/local_eval_sets_manager.py
@@ -27,6 +27,7 @@ from google.genai import types as genai_types
 from pydantic import ValidationError
 from typing_extensions import override
 
+from ..errors.not_found_error import NotFoundError
 from ._eval_sets_manager_utils import add_eval_case_to_eval_set
 from ._eval_sets_manager_utils import delete_eval_case_from_eval_set
 from ._eval_sets_manager_utils import get_eval_case_from_eval_set
@@ -226,16 +227,30 @@ class LocalEvalSetsManager(EvalSetsManager):
 
   @override
   def list_eval_sets(self, app_name: str) -> list[str]:
-    """Returns a list of EvalSets that belong to the given app_name."""
+    """Returns a list of EvalSets that belong to the given app_name.
+
+    Args:
+      app_name: The app name to list the eval sets for.
+
+    Returns:
+      A list of EvalSet ids.
+
+    Raises:
+      NotFoundError: If the eval directory for the app is not found.
+    """
     eval_set_file_path = os.path.join(self._agents_dir, app_name)
     eval_sets = []
-    for file in os.listdir(eval_set_file_path):
-      if file.endswith(_EVAL_SET_FILE_EXTENSION):
-        eval_sets.append(
-            os.path.basename(file).removesuffix(_EVAL_SET_FILE_EXTENSION)
-        )
-
-    return sorted(eval_sets)
+    try:
+      for file in os.listdir(eval_set_file_path):
+        if file.endswith(_EVAL_SET_FILE_EXTENSION):
+          eval_sets.append(
+              os.path.basename(file).removesuffix(_EVAL_SET_FILE_EXTENSION)
+          )
+      return sorted(eval_sets)
+    except FileNotFoundError as e:
+      raise NotFoundError(
+          f"Eval directory for app `{app_name}` not found."
+      ) from e
 
   @override
   def get_eval_case(
@@ -300,7 +315,7 @@ class LocalEvalSetsManager(EvalSetsManager):
     )
 
   def _write_eval_set_to_path(self, eval_set_path: str, eval_set: EvalSet):
-    with open(eval_set_path, "w") as f:
+    with open(eval_set_path, "w", encoding="utf-8") as f:
       f.write(eval_set.model_dump_json(indent=2))
 
   def _save_eval_set(self, app_name: str, eval_set_id: str, eval_set: EvalSet):
--- a/google/adk/evaluation/metric_evaluator_registry.py
+++ b/google/adk/evaluation/metric_evaluator_registry.py
@@ -21,7 +21,9 @@ from .eval_metrics import EvalMetric
 from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
+from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   metric_evaluator_registry = MetricEvaluatorRegistry()
 
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
-      evaluator=type(TrajectoryEvaluator),
+      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      evaluator=TrajectoryEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      evaluator=ResponseEvaluator,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      evaluator=SafetyEvaluatorV1,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      evaluator=FinalResponseMatchV2Evaluator,
   )
 
   return metric_evaluator_registry
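The registry changes fix a real bug: type(TrajectoryEvaluator) evaluates to the metaclass of the class (normally the built-in type), not to TrajectoryEvaluator itself, so the old code registered an object the registry could never instantiate into a working evaluator. Passing the class directly, and keying the registry by the enum's string .value rather than the enum member, makes lookups by metric name work; the hunk also registers the two new prebuilt metrics, SAFETY_V1 and FINAL_RESPONSE_MATCH_V2. The Python semantics involved, in miniature:

class TrajectoryEvaluator:  # stand-in for the real evaluator class
  pass

# type(SomeClass) is the metaclass, not the class itself:
assert type(TrajectoryEvaluator) is type

# Registering the class object is what allows later instantiation:
evaluator_cls = TrajectoryEvaluator
assert isinstance(evaluator_cls(), TrajectoryEvaluator)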