google-adk 1.5.0__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their public registries and is provided for informational purposes only.
- google/adk/a2a/converters/event_converter.py +257 -36
- google/adk/a2a/converters/part_converter.py +93 -25
- google/adk/a2a/converters/request_converter.py +12 -32
- google/adk/a2a/converters/utils.py +22 -4
- google/adk/a2a/executor/__init__.py +13 -0
- google/adk/a2a/executor/a2a_agent_executor.py +260 -0
- google/adk/a2a/executor/task_result_aggregator.py +71 -0
- google/adk/a2a/logs/__init__.py +13 -0
- google/adk/a2a/logs/log_utils.py +349 -0
- google/adk/agents/base_agent.py +54 -0
- google/adk/agents/llm_agent.py +15 -0
- google/adk/agents/remote_a2a_agent.py +532 -0
- google/adk/artifacts/in_memory_artifact_service.py +6 -3
- google/adk/cli/browser/chunk-EQDQRRRY.js +1 -0
- google/adk/cli/browser/chunk-TXJFAAIW.js +2 -0
- google/adk/cli/browser/index.html +4 -3
- google/adk/cli/browser/main-RXDVX3K6.js +3914 -0
- google/adk/cli/browser/polyfills-FFHMD2TL.js +17 -0
- google/adk/cli/cli_deploy.py +4 -1
- google/adk/cli/cli_eval.py +8 -6
- google/adk/cli/cli_tools_click.py +30 -10
- google/adk/cli/fast_api.py +120 -5
- google/adk/cli/utils/agent_loader.py +12 -0
- google/adk/evaluation/agent_evaluator.py +107 -10
- google/adk/evaluation/base_eval_service.py +157 -0
- google/adk/evaluation/constants.py +20 -0
- google/adk/evaluation/eval_case.py +3 -3
- google/adk/evaluation/eval_metrics.py +39 -0
- google/adk/evaluation/evaluation_generator.py +1 -1
- google/adk/evaluation/final_response_match_v2.py +230 -0
- google/adk/evaluation/llm_as_judge.py +141 -0
- google/adk/evaluation/llm_as_judge_utils.py +48 -0
- google/adk/evaluation/metric_evaluator_registry.py +89 -0
- google/adk/evaluation/response_evaluator.py +38 -211
- google/adk/evaluation/safety_evaluator.py +54 -0
- google/adk/evaluation/trajectory_evaluator.py +16 -2
- google/adk/evaluation/vertex_ai_eval_facade.py +147 -0
- google/adk/events/event.py +2 -4
- google/adk/flows/llm_flows/base_llm_flow.py +2 -0
- google/adk/memory/in_memory_memory_service.py +3 -2
- google/adk/models/lite_llm.py +50 -10
- google/adk/runners.py +27 -10
- google/adk/sessions/database_session_service.py +25 -7
- google/adk/sessions/in_memory_session_service.py +5 -1
- google/adk/sessions/vertex_ai_session_service.py +67 -42
- google/adk/tools/bigquery/config.py +11 -1
- google/adk/tools/bigquery/query_tool.py +306 -12
- google/adk/tools/enterprise_search_tool.py +2 -2
- google/adk/tools/function_tool.py +7 -1
- google/adk/tools/google_search_tool.py +1 -1
- google/adk/tools/mcp_tool/mcp_session_manager.py +44 -30
- google/adk/tools/mcp_tool/mcp_tool.py +44 -7
- google/adk/version.py +1 -1
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/METADATA +6 -4
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/RECORD +58 -42
- google/adk/cli/browser/main-JAAWEV7F.js +0 -92
- google/adk/cli/browser/polyfills-B6TNHZQ6.js +0 -17
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/WHEEL +0 -0
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/entry_points.txt +0 -0
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/licenses/LICENSE +0 -0
```diff
--- a/google/adk/evaluation/agent_evaluator.py
+++ b/google/adk/evaluation/agent_evaluator.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -23,16 +25,17 @@ from typing import Optional
 from typing import Union
 import uuid
 
+from google.genai import types as genai_types
 from pydantic import ValidationError
 
+from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from .eval_case import IntermediateData
+from .eval_metrics import EvalMetric
 from .eval_set import EvalSet
-from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
-from .response_evaluator import ResponseEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
 
@@ -44,11 +47,13 @@ TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 # This is always optional unless explicitly specified.
 RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
+SAFETY_V1_KEY = "safety_v1"
 
 ALLOWED_CRITERIA = [
     TOOL_TRAJECTORY_SCORE_KEY,
     RESPONSE_EVALUATION_SCORE_KEY,
     RESPONSE_MATCH_SCORE_KEY,
+    SAFETY_V1_KEY,
 ]
 
 
@@ -96,6 +101,7 @@ class AgentEvaluator:
       criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
+      print_detailed_results: bool = True,
   ):
     """Evaluates an agent using the given EvalSet.
 
@@ -109,7 +115,13 @@ class AgentEvaluator:
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
+      print_detailed_results: Whether to print detailed results for each metric
+        evaluation.
     """
+    try:
+      from .evaluation_generator import EvaluationGenerator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     eval_case_responses_list = await EvaluationGenerator.generate_responses(
         eval_set=eval_set,
         agent_module_path=agent_module,
@@ -117,6 +129,8 @@
         agent_name=agent_name,
     )
 
+    failures = []
+
     for eval_case_responses in eval_case_responses_list:
       actual_invocations = [
           invocation
@@ -139,10 +153,25 @@
           )
       )
 
-
-
-
-
+      if print_detailed_results:
+        AgentEvaluator._print_details(
+            evaluation_result=evaluation_result,
+            metric_name=metric_name,
+            threshold=threshold,
+        )
+
+      # Gather all the failures.
+      if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+        failures.append(
+            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+            f" but got {evaluation_result.overall_score}."
+        )
+
+    assert not failures, (
+        "Following are all the test failures. If you looking to get more"
+        " details on the failures, then please re-run this test with"
+        " `print_details` set to `True`.\n{}".format("\n".join(failures))
+    )
 
   @staticmethod
   async def evaluate(
@@ -158,9 +187,10 @@
       agent_module: The path to python module that contains the definition of
         the agent. There is convention in place here, where the code is going to
         look for 'root_agent' in the loaded module.
-      eval_dataset_file_path_or_dir: The eval data set. This can be either a
-        full path to the file containing eval dataset, or a
-        recursively explored for all files that have a
+      eval_dataset_file_path_or_dir: The eval data set. This can be either a
+        string representing full path to the file containing eval dataset, or a
+        directory that is recursively explored for all files that have a
+        `.test.json` suffix.
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
@@ -358,6 +388,12 @@
 
   @staticmethod
   def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    try:
+      from .response_evaluator import ResponseEvaluator
+      from .safety_evaluator import SafetyEvaluatorV1
+      from .trajectory_evaluator import TrajectoryEvaluator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
       return TrajectoryEvaluator(threshold=threshold)
     elif (
@@ -365,5 +401,66 @@
         or metric_name == RESPONSE_EVALUATION_SCORE_KEY
     ):
       return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
+    elif metric_name == SAFETY_V1_KEY:
+      return SafetyEvaluatorV1(
+          eval_metric=EvalMetric(threshold=threshold, metric_name=metric_name)
+      )
 
     raise ValueError(f"Unsupported eval metric: {metric_name}")
+
+  @staticmethod
+  def _print_details(
+      evaluation_result: EvaluationResult, metric_name: str, threshold: float
+  ):
+    try:
+      from pandas import pandas as pd
+      from tabulate import tabulate
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+    print(
+        f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
+        f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
+        f" `{evaluation_result.overall_score}`."
+    )
+
+    data = []
+    for per_invocation_result in evaluation_result.per_invocation_results:
+      data.append({
+          "eval_status": per_invocation_result.eval_status,
+          "score": per_invocation_result.score,
+          "threshold": threshold,
+          "prompt": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.user_content
+          ),
+          "expected_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.final_response
+          ),
+          "actual_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.actual_invocation.final_response
+          ),
+          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.expected_invocation.intermediate_data
+          ),
+          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.actual_invocation.intermediate_data
+          ),
+      })
+
+    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
+    print("\n\n")  # Few empty lines for visual clarity
+
+  @staticmethod
+  def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  @staticmethod
+  def _convert_tool_calls_to_text(
+      intermediate_data: Optional[IntermediateData],
+  ) -> str:
+    if intermediate_data and intermediate_data.tool_uses:
+      return "\n".join([str(t) for t in intermediate_data.tool_uses])
+
+    return ""
```
```diff
--- /dev/null
+++ b/google/adk/evaluation/base_eval_service.py
@@ -0,0 +1,157 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from abc import ABC
+from abc import abstractmethod
+from typing import AsyncGenerator
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from .eval_case import Invocation
+from .eval_metrics import EvalMetric
+from .eval_result import EvalCaseResult
+
+
+class EvaluateConfig(BaseModel):
+  """Contains configurations need to run an evaluations."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  eval_metrics: list[EvalMetric] = Field(
+      description="""The list of metrics to be used in Eval.""",
+  )
+
+
+class InferenceConfig(BaseModel):
+  """Contains configurations need to run inferences."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  labels: Optional[dict[str, str]] = Field(
+      default=None,
+      description="""Labels with user-defined metadata to break down billed
+      charges.""",
+  )
+
+
+class InferenceRequest(BaseModel):
+  """Represent a request to perform inferences for the eval cases in an eval set."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  app_name: str = Field(
+      description="""The name of the app to which the eval case belongs to."""
+  )
+
+  eval_set_id: str = Field(description="""Id of the eval set.""")
+
+  eval_case_ids: Optional[list[str]] = Field(
+      default=None,
+      description="""Id of the eval cases for which inferences need to be
+      generated.
+
+      All the eval case ids should belong to the EvalSet.
+
+      If the list of eval case ids are empty or not specified, then all the eval cases
+      in an eval set are evaluated.
+      """,
+  )
+
+  inference_config: InferenceConfig = Field(
+      description="""The config to use for inferencing.""",
+  )
+
+
+class InferenceResult(BaseModel):
+  """Contains inference results for a single eval case."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  app_name: str = Field(
+      description="""The name of the app to which the eval case belongs to."""
+  )
+
+  eval_set_id: str = Field(description="""Id of the eval set.""")
+
+  eval_case_id: str = Field(
+      description="""Id of the eval case for which inferences were generated.""",
+  )
+
+  inferences: list[Invocation] = Field(
+      description="""Inferences obtained from the Agent for the eval case."""
+  )
+
+  session_id: Optional[str] = Field(
+      description="""Id of the inference session."""
+  )
+
+
+class EvaluateRequest(BaseModel):
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  inference_results: list[InferenceResult] = Field(
+      description="""A list of inferences that need to be evaluated.""",
+  )
+
+  evaluate_config: EvaluateConfig = Field(
+      description="""The config to use for evaluations.""",
+  )
+
+
+class BaseEvalService(ABC):
+  """A service to run Evals for an ADK agent."""
+
+  @abstractmethod
+  async def perform_inference(
+      self,
+      inference_request: InferenceRequest,
+  ) -> AsyncGenerator[InferenceResult, None]:
+    """Returns InferenceResult obtained from the Agent as and when they are available.
+
+    Args:
+      inference_request: The request for generating inferences.
+    """
+
+  @abstractmethod
+  async def evaluate(
+      self,
+      evaluate_request: EvaluateRequest,
+  ) -> AsyncGenerator[EvalCaseResult, None]:
+    """Returns EvalCaseResult for each item as and when they are available.
+
+    Args:
+      evaluate_request: The request to perform metric evaluations on the
+        inferences.
+    """
```
```diff
--- /dev/null
+++ b/google/adk/evaluation/constants.py
@@ -0,0 +1,20 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+MISSING_EVAL_DEPENDENCIES_MESSAGE = (
+    "Eval module is not installed, please install via `pip install"
+    " google-adk[eval]`."
+)
```
```diff
--- a/google/adk/evaluation/eval_case.py
+++ b/google/adk/evaluation/eval_case.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
 
 from typing import Any
 from typing import Optional
-from typing import Tuple
 
 from google.genai import types as genai_types
 from pydantic import alias_generators
@@ -37,11 +37,11 @@ class IntermediateData(EvalBaseModel):
   tool_uses: list[genai_types.FunctionCall] = []
   """Tool use trajectory in chronological order."""
 
-  intermediate_responses: list[
+  intermediate_responses: list[tuple[str, list[genai_types.Part]]] = []
   """Intermediate responses generated by sub-agents to convey progress or status
   in a multi-agent system, distinct from the final response.
 
-  This is expressed as a
+  This is expressed as a tuple of:
   - Author: Usually the sub-agent name that generated the intermediate
     response.
 
```

(The two removed lines above are shown truncated in the source diff viewer; their trailing text is not recoverable here.)
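The functional effect is that `intermediate_responses` is now annotated with the built-in `tuple` generic and the `typing.Tuple` import is dropped. For illustration, constructing the model looks like the sketch below (not package code); the field names come from the diff, the values are made up.

```python
# Sketch showing the field shapes of IntermediateData after this change.
from google.genai import types as genai_types

from google.adk.evaluation.eval_case import IntermediateData

intermediate = IntermediateData(
    tool_uses=[
        genai_types.FunctionCall(name="lookup_weather", args={"city": "Paris"})
    ],
    # Each entry is an (author, parts) tuple: the sub-agent name plus the
    # Parts it emitted before the final response.
    intermediate_responses=[
        ("weather_agent", [genai_types.Part(text="Fetching the forecast...")])
    ],
)
```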
```diff
--- a/google/adk/evaluation/eval_metrics.py
+++ b/google/adk/evaluation/eval_metrics.py
@@ -14,16 +14,50 @@
 
 from __future__ import annotations
 
+from enum import Enum
 from typing import Optional
+from typing import Union
 
+from google.genai import types as genai_types
 from pydantic import alias_generators
 from pydantic import BaseModel
 from pydantic import ConfigDict
+from pydantic import Field
+from typing_extensions import TypeAlias
 
 from .eval_case import Invocation
 from .evaluator import EvalStatus
 
 
+class PrebuiltMetrics(Enum):
+  TOOL_TRAJECTORY_AVG_SCORE = "tool_trajectory_avg_score"
+
+  RESPONSE_EVALUATION_SCORE = "response_evaluation_score"
+
+  RESPONSE_MATCH_SCORE = "response_match_score"
+
+
+MetricName: TypeAlias = Union[str, PrebuiltMetrics]
+
+
+class JudgeModelOptions(BaseModel):
+  """Options for an eval metric's judge model."""
+
+  judge_model: str = Field(
+      default="gemini-2.5-flash",
+      description="""The judge model to use for evaluation. It can be a model name.""",
+  )
+
+  judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
+      default=None, description="""The configuration for the judge model."""
+  )
+
+  num_samples: Optional[int] = Field(
+      default=None,
+      description="""The number of times to sample the model for each invocation evaluation.""",
+  )
+
+
 class EvalMetric(BaseModel):
   """A metric used to evaluate a particular aspect of an eval case."""
 
@@ -38,6 +72,11 @@ class EvalMetric(BaseModel):
   threshold: float
   """A threshold value. Each metric decides how to interpret this threshold."""
 
+  judge_model_options: Optional[JudgeModelOptions] = Field(
+      default=None,
+      description="""Options for the judge model.""",
+  )
+
 
 class EvalMetricResult(EvalMetric):
   """The actual computed score/value of a particular EvalMetric."""
```