google-adk 1.5.0-py3-none-any.whl → 1.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. google/adk/a2a/converters/event_converter.py +257 -36
  2. google/adk/a2a/converters/part_converter.py +93 -25
  3. google/adk/a2a/converters/request_converter.py +12 -32
  4. google/adk/a2a/converters/utils.py +22 -4
  5. google/adk/a2a/executor/__init__.py +13 -0
  6. google/adk/a2a/executor/a2a_agent_executor.py +260 -0
  7. google/adk/a2a/executor/task_result_aggregator.py +71 -0
  8. google/adk/a2a/logs/__init__.py +13 -0
  9. google/adk/a2a/logs/log_utils.py +349 -0
  10. google/adk/agents/base_agent.py +54 -0
  11. google/adk/agents/llm_agent.py +15 -0
  12. google/adk/agents/remote_a2a_agent.py +532 -0
  13. google/adk/artifacts/in_memory_artifact_service.py +6 -3
  14. google/adk/cli/browser/chunk-EQDQRRRY.js +1 -0
  15. google/adk/cli/browser/chunk-TXJFAAIW.js +2 -0
  16. google/adk/cli/browser/index.html +4 -3
  17. google/adk/cli/browser/main-RXDVX3K6.js +3914 -0
  18. google/adk/cli/browser/polyfills-FFHMD2TL.js +17 -0
  19. google/adk/cli/cli_deploy.py +4 -1
  20. google/adk/cli/cli_eval.py +8 -6
  21. google/adk/cli/cli_tools_click.py +30 -10
  22. google/adk/cli/fast_api.py +120 -5
  23. google/adk/cli/utils/agent_loader.py +12 -0
  24. google/adk/evaluation/agent_evaluator.py +107 -10
  25. google/adk/evaluation/base_eval_service.py +157 -0
  26. google/adk/evaluation/constants.py +20 -0
  27. google/adk/evaluation/eval_case.py +3 -3
  28. google/adk/evaluation/eval_metrics.py +39 -0
  29. google/adk/evaluation/evaluation_generator.py +1 -1
  30. google/adk/evaluation/final_response_match_v2.py +230 -0
  31. google/adk/evaluation/llm_as_judge.py +141 -0
  32. google/adk/evaluation/llm_as_judge_utils.py +48 -0
  33. google/adk/evaluation/metric_evaluator_registry.py +89 -0
  34. google/adk/evaluation/response_evaluator.py +38 -211
  35. google/adk/evaluation/safety_evaluator.py +54 -0
  36. google/adk/evaluation/trajectory_evaluator.py +16 -2
  37. google/adk/evaluation/vertex_ai_eval_facade.py +147 -0
  38. google/adk/events/event.py +2 -4
  39. google/adk/flows/llm_flows/base_llm_flow.py +2 -0
  40. google/adk/memory/in_memory_memory_service.py +3 -2
  41. google/adk/models/lite_llm.py +50 -10
  42. google/adk/runners.py +27 -10
  43. google/adk/sessions/database_session_service.py +25 -7
  44. google/adk/sessions/in_memory_session_service.py +5 -1
  45. google/adk/sessions/vertex_ai_session_service.py +67 -42
  46. google/adk/tools/bigquery/config.py +11 -1
  47. google/adk/tools/bigquery/query_tool.py +306 -12
  48. google/adk/tools/enterprise_search_tool.py +2 -2
  49. google/adk/tools/function_tool.py +7 -1
  50. google/adk/tools/google_search_tool.py +1 -1
  51. google/adk/tools/mcp_tool/mcp_session_manager.py +44 -30
  52. google/adk/tools/mcp_tool/mcp_tool.py +44 -7
  53. google/adk/version.py +1 -1
  54. {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/METADATA +6 -4
  55. {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/RECORD +58 -42
  56. google/adk/cli/browser/main-JAAWEV7F.js +0 -92
  57. google/adk/cli/browser/polyfills-B6TNHZQ6.js +0 -17
  58. {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/WHEEL +0 -0
  59. {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/entry_points.txt +0 -0
  60. {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/licenses/LICENSE +0 -0
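
Before the per-file hunks, a quick way to confirm which of the two versions is installed locally. This is a minimal sketch; it assumes `google/adk/version.py` (item 53 above) exposes a `__version__` string.

```python
# Minimal sketch: confirm the locally installed ADK version.
# Assumes google/adk/version.py exposes a `__version__` string.
from google.adk import version

print(version.__version__)  # expect "1.6.1" after upgrading
```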
google/adk/evaluation/agent_evaluator.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -23,16 +25,17 @@ from typing import Optional
 from typing import Union
 import uuid
 
+from google.genai import types as genai_types
 from pydantic import ValidationError
 
+from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from .eval_case import IntermediateData
+from .eval_metrics import EvalMetric
 from .eval_set import EvalSet
-from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
-from .response_evaluator import ResponseEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
 
@@ -44,11 +47,13 @@ TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 # This is always optional unless explicitly specified.
 RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
+SAFETY_V1_KEY = "safety_v1"
 
 ALLOWED_CRITERIA = [
     TOOL_TRAJECTORY_SCORE_KEY,
     RESPONSE_EVALUATION_SCORE_KEY,
     RESPONSE_MATCH_SCORE_KEY,
+    SAFETY_V1_KEY,
 ]
 
 
@@ -96,6 +101,7 @@ class AgentEvaluator:
       criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
+      print_detailed_results: bool = True,
   ):
     """Evaluates an agent using the given EvalSet.
 
@@ -109,7 +115,13 @@
       num_runs: Number of times all entries in the eval dataset should be
        assessed.
       agent_name: The name of the agent.
+      print_detailed_results: Whether to print detailed results for each metric
+       evaluation.
     """
+    try:
+      from .evaluation_generator import EvaluationGenerator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     eval_case_responses_list = await EvaluationGenerator.generate_responses(
         eval_set=eval_set,
         agent_module_path=agent_module,
@@ -117,6 +129,8 @@
         agent_name=agent_name,
     )
 
+    failures = []
+
     for eval_case_responses in eval_case_responses_list:
       actual_invocations = [
           invocation
@@ -139,10 +153,25 @@
             )
         )
 
-        assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
-            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
-            f" but got {evaluation_result.overall_score}."
-        )
+        if print_detailed_results:
+          AgentEvaluator._print_details(
+              evaluation_result=evaluation_result,
+              metric_name=metric_name,
+              threshold=threshold,
+          )
+
+        # Gather all the failures.
+        if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+          failures.append(
+              f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+              f" but got {evaluation_result.overall_score}."
+          )
+
+    assert not failures, (
+        "Following are all the test failures. If you looking to get more"
+        " details on the failures, then please re-run this test with"
+        " `print_details` set to `True`.\n{}".format("\n".join(failures))
+    )
 
   @staticmethod
   async def evaluate(
@@ -158,9 +187,10 @@
       agent_module: The path to python module that contains the definition of
        the agent. There is convention in place here, where the code is going to
        look for 'root_agent' in the loaded module.
-      eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
-        full path to the file containing eval dataset, or a directory that is
-        recursively explored for all files that have a `.test.json` suffix.
+      eval_dataset_file_path_or_dir: The eval data set. This can be either a
+        string representing full path to the file containing eval dataset, or a
+        directory that is recursively explored for all files that have a
+        `.test.json` suffix.
       num_runs: Number of times all entries in the eval dataset should be
        assessed.
       agent_name: The name of the agent.
@@ -358,6 +388,12 @@
 
   @staticmethod
   def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    try:
+      from .response_evaluator import ResponseEvaluator
+      from .safety_evaluator import SafetyEvaluatorV1
+      from .trajectory_evaluator import TrajectoryEvaluator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
       return TrajectoryEvaluator(threshold=threshold)
     elif (
@@ -365,5 +401,66 @@
         or metric_name == RESPONSE_EVALUATION_SCORE_KEY
     ):
       return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
+    elif metric_name == SAFETY_V1_KEY:
+      return SafetyEvaluatorV1(
+          eval_metric=EvalMetric(threshold=threshold, metric_name=metric_name)
+      )
 
     raise ValueError(f"Unsupported eval metric: {metric_name}")
+
+  @staticmethod
+  def _print_details(
+      evaluation_result: EvaluationResult, metric_name: str, threshold: float
+  ):
+    try:
+      from pandas import pandas as pd
+      from tabulate import tabulate
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+    print(
+        f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
+        f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
+        f" `{evaluation_result.overall_score}`."
+    )
+
+    data = []
+    for per_invocation_result in evaluation_result.per_invocation_results:
+      data.append({
+          "eval_status": per_invocation_result.eval_status,
+          "score": per_invocation_result.score,
+          "threshold": threshold,
+          "prompt": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.user_content
+          ),
+          "expected_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.final_response
+          ),
+          "actual_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.actual_invocation.final_response
+          ),
+          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.expected_invocation.intermediate_data
+          ),
+          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.actual_invocation.intermediate_data
+          ),
+      })
+
+    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
+    print("\n\n")  # Few empty lines for visual clarity
+
+  @staticmethod
+  def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  @staticmethod
+  def _convert_tool_calls_to_text(
+      intermediate_data: Optional[IntermediateData],
+  ) -> str:
+    if intermediate_data and intermediate_data.tool_uses:
+      return "\n".join([str(t) for t in intermediate_data.tool_uses])
+
+    return ""
google/adk/evaluation/base_eval_service.py (new file)
@@ -0,0 +1,157 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from abc import ABC
+from abc import abstractmethod
+from typing import AsyncGenerator
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from .eval_case import Invocation
+from .eval_metrics import EvalMetric
+from .eval_result import EvalCaseResult
+
+
+class EvaluateConfig(BaseModel):
+  """Contains configurations need to run an evaluations."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  eval_metrics: list[EvalMetric] = Field(
+      description="""The list of metrics to be used in Eval.""",
+  )
+
+
+class InferenceConfig(BaseModel):
+  """Contains configurations need to run inferences."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  labels: Optional[dict[str, str]] = Field(
+      default=None,
+      description="""Labels with user-defined metadata to break down billed
+      charges.""",
+  )
+
+
+class InferenceRequest(BaseModel):
+  """Represent a request to perform inferences for the eval cases in an eval set."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  app_name: str = Field(
+      description="""The name of the app to which the eval case belongs to."""
+  )
+
+  eval_set_id: str = Field(description="""Id of the eval set.""")
+
+  eval_case_ids: Optional[list[str]] = Field(
+      default=None,
+      description="""Id of the eval cases for which inferences need to be
+      generated.
+
+      All the eval case ids should belong to the EvalSet.
+
+      If the list of eval case ids are empty or not specified, then all the eval cases
+      in an eval set are evaluated.
+      """,
+  )
+
+  inference_config: InferenceConfig = Field(
+      description="""The config to use for inferencing.""",
+  )
+
+
+class InferenceResult(BaseModel):
+  """Contains inference results for a single eval case."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  app_name: str = Field(
+      description="""The name of the app to which the eval case belongs to."""
+  )
+
+  eval_set_id: str = Field(description="""Id of the eval set.""")
+
+  eval_case_id: str = Field(
+      description="""Id of the eval case for which inferences were generated.""",
+  )
+
+  inferences: list[Invocation] = Field(
+      description="""Inferences obtained from the Agent for the eval case."""
+  )
+
+  session_id: Optional[str] = Field(
+      description="""Id of the inference session."""
+  )
+
+
+class EvaluateRequest(BaseModel):
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  inference_results: list[InferenceResult] = Field(
+      description="""A list of inferences that need to be evaluated.""",
+  )
+
+  evaluate_config: EvaluateConfig = Field(
+      description="""The config to use for evaluations.""",
+  )
+
+
+class BaseEvalService(ABC):
+  """A service to run Evals for an ADK agent."""
+
+  @abstractmethod
+  async def perform_inference(
+      self,
+      inference_request: InferenceRequest,
+  ) -> AsyncGenerator[InferenceResult, None]:
+    """Returns InferenceResult obtained from the Agent as and when they are available.
+
+    Args:
+      inference_request: The request for generating inferences.
+    """
+
+  @abstractmethod
+  async def evaluate(
+      self,
+      evaluate_request: EvaluateRequest,
+  ) -> AsyncGenerator[EvalCaseResult, None]:
+    """Returns EvalCaseResult for each item as and when they are available.
+
+    Args:
+      evaluate_request: The request to perform metric evaluations on the
+        inferences.
+    """
google/adk/evaluation/constants.py (new file)
@@ -0,0 +1,20 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+MISSING_EVAL_DEPENDENCIES_MESSAGE = (
+    "Eval module is not installed, please install via `pip install"
+    " google-adk[eval]`."
+)
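
This constant backs the lazy-import guards added to `agent_evaluator.py` above; the same pattern applies anywhere the optional eval dependencies are imported:

```python
# The guard pattern this message is written for (mirrors agent_evaluator.py).
from google.adk.evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE

try:
  from google.adk.evaluation.response_evaluator import ResponseEvaluator
except ModuleNotFoundError as e:
  raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
```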
google/adk/evaluation/eval_case.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
 
 from typing import Any
 from typing import Optional
-from typing import Tuple
 
 from google.genai import types as genai_types
 from pydantic import alias_generators
@@ -37,11 +37,11 @@ class IntermediateData(EvalBaseModel):
   tool_uses: list[genai_types.FunctionCall] = []
   """Tool use trajectory in chronological order."""
 
-  intermediate_responses: list[Tuple[str, list[genai_types.Part]]] = []
+  intermediate_responses: list[tuple[str, list[genai_types.Part]]] = []
   """Intermediate responses generated by sub-agents to convey progress or status
   in a multi-agent system, distinct from the final response.
 
-  This is expressed as a Tuple of:
+  This is expressed as a tuple of:
     - Author: Usually the sub-agent name that generated the intermediate
       response.
 
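
With the annotation switched from `typing.Tuple` to the built-in `tuple`, intermediate responses remain plain `(author, parts)` pairs. An illustrative construction with made-up values:

```python
# Illustrative only: populate IntermediateData with made-up values.
from google.adk.evaluation.eval_case import IntermediateData
from google.genai import types as genai_types

intermediate = IntermediateData(
    tool_uses=[
        genai_types.FunctionCall(name="get_weather", args={"city": "Paris"})
    ],
    intermediate_responses=[
        ("research_agent", [genai_types.Part(text="Looking up the forecast...")]),
    ],
)
```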
google/adk/evaluation/eval_metrics.py
@@ -14,16 +14,50 @@
 
 from __future__ import annotations
 
+from enum import Enum
 from typing import Optional
+from typing import Union
 
+from google.genai import types as genai_types
 from pydantic import alias_generators
 from pydantic import BaseModel
 from pydantic import ConfigDict
+from pydantic import Field
+from typing_extensions import TypeAlias
 
 from .eval_case import Invocation
 from .evaluator import EvalStatus
 
 
+class PrebuiltMetrics(Enum):
+  TOOL_TRAJECTORY_AVG_SCORE = "tool_trajectory_avg_score"
+
+  RESPONSE_EVALUATION_SCORE = "response_evaluation_score"
+
+  RESPONSE_MATCH_SCORE = "response_match_score"
+
+
+MetricName: TypeAlias = Union[str, PrebuiltMetrics]
+
+
+class JudgeModelOptions(BaseModel):
+  """Options for an eval metric's judge model."""
+
+  judge_model: str = Field(
+      default="gemini-2.5-flash",
+      description="""The judge model to use for evaluation. It can be a model name.""",
+  )
+
+  judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
+      default=None, description="""The configuration for the judge model."""
+  )
+
+  num_samples: Optional[int] = Field(
+      default=None,
+      description="""The number of times to sample the model for each invocation evaluation.""",
+  )
+
+
 class EvalMetric(BaseModel):
   """A metric used to evaluate a particular aspect of an eval case."""
 
@@ -38,6 +72,11 @@ class EvalMetric(BaseModel):
   threshold: float
   """A threshold value. Each metric decides how to interpret this threshold."""
 
+  judge_model_options: Optional[JudgeModelOptions] = Field(
+      default=None,
+      description="""Options for the judge model.""",
+  )
+
 
 class EvalMetricResult(EvalMetric):
   """The actual computed score/value of a particular EvalMetric."""
google/adk/evaluation/evaluation_generator.py
@@ -182,7 +182,7 @@ class EvaluationGenerator:
     tool_uses = []
     invocation_id = ""
 
-    for event in runner.run(
+    async for event in runner.run_async(
         user_id=user_id, session_id=session_id, new_message=user_content
     ):
       invocation_id = (
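
The generator now drives the agent through `Runner.run_async` instead of the synchronous `run`. A sketch of the same consumption pattern in caller code; the helper name, identifiers, and message are placeholders, and reading `event.content` is an assumption about the event shape rather than something shown in this diff.

```python
# Sketch of consuming Runner.run_async (names and message are placeholders).
from google.genai import types as genai_types


async def collect_final_text(runner, user_id: str, session_id: str) -> str:
  user_content = genai_types.Content(
      role="user", parts=[genai_types.Part(text="Hello")]
  )
  final_text = ""
  async for event in runner.run_async(
      user_id=user_id, session_id=session_id, new_message=user_content
  ):
    # Keep the text of the most recent event that carried content.
    if event.content and event.content.parts:
      final_text = "".join(p.text or "" for p in event.content.parts)
  return final_text
```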