google-adk 1.6.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google/adk/a2a/converters/event_converter.py +5 -85
- google/adk/a2a/converters/request_converter.py +1 -2
- google/adk/a2a/executor/a2a_agent_executor.py +45 -16
- google/adk/a2a/logs/log_utils.py +1 -2
- google/adk/a2a/utils/__init__.py +0 -0
- google/adk/a2a/utils/agent_card_builder.py +544 -0
- google/adk/a2a/utils/agent_to_a2a.py +118 -0
- google/adk/agents/__init__.py +5 -0
- google/adk/agents/agent_config.py +46 -0
- google/adk/agents/base_agent.py +239 -41
- google/adk/agents/callback_context.py +41 -0
- google/adk/agents/common_configs.py +79 -0
- google/adk/agents/config_agent_utils.py +184 -0
- google/adk/agents/config_schemas/AgentConfig.json +566 -0
- google/adk/agents/invocation_context.py +5 -1
- google/adk/agents/live_request_queue.py +15 -0
- google/adk/agents/llm_agent.py +201 -9
- google/adk/agents/loop_agent.py +35 -1
- google/adk/agents/parallel_agent.py +24 -3
- google/adk/agents/remote_a2a_agent.py +17 -5
- google/adk/agents/sequential_agent.py +22 -1
- google/adk/artifacts/gcs_artifact_service.py +110 -20
- google/adk/auth/auth_handler.py +3 -3
- google/adk/auth/credential_manager.py +23 -23
- google/adk/auth/credential_service/base_credential_service.py +6 -6
- google/adk/auth/credential_service/in_memory_credential_service.py +10 -8
- google/adk/auth/credential_service/session_state_credential_service.py +8 -8
- google/adk/auth/exchanger/oauth2_credential_exchanger.py +3 -3
- google/adk/auth/oauth2_credential_util.py +2 -2
- google/adk/auth/refresher/oauth2_credential_refresher.py +4 -4
- google/adk/cli/agent_graph.py +3 -1
- google/adk/cli/browser/index.html +2 -2
- google/adk/cli/browser/main-W7QZBYAR.js +3914 -0
- google/adk/cli/browser/polyfills-B6TNHZQ6.js +17 -0
- google/adk/cli/cli_eval.py +87 -12
- google/adk/cli/cli_tools_click.py +143 -82
- google/adk/cli/fast_api.py +150 -69
- google/adk/cli/utils/agent_loader.py +35 -1
- google/adk/code_executors/base_code_executor.py +14 -19
- google/adk/code_executors/built_in_code_executor.py +4 -1
- google/adk/evaluation/base_eval_service.py +46 -2
- google/adk/evaluation/eval_metrics.py +4 -0
- google/adk/evaluation/eval_sets_manager.py +5 -1
- google/adk/evaluation/evaluation_generator.py +1 -1
- google/adk/evaluation/final_response_match_v2.py +2 -2
- google/adk/evaluation/gcs_eval_sets_manager.py +2 -1
- google/adk/evaluation/in_memory_eval_sets_manager.py +151 -0
- google/adk/evaluation/local_eval_service.py +389 -0
- google/adk/evaluation/local_eval_set_results_manager.py +2 -2
- google/adk/evaluation/local_eval_sets_manager.py +24 -9
- google/adk/evaluation/metric_evaluator_registry.py +16 -6
- google/adk/evaluation/vertex_ai_eval_facade.py +7 -1
- google/adk/events/event.py +7 -2
- google/adk/flows/llm_flows/auto_flow.py +6 -11
- google/adk/flows/llm_flows/base_llm_flow.py +66 -29
- google/adk/flows/llm_flows/contents.py +16 -10
- google/adk/flows/llm_flows/functions.py +89 -52
- google/adk/memory/in_memory_memory_service.py +21 -15
- google/adk/memory/vertex_ai_memory_bank_service.py +12 -10
- google/adk/models/anthropic_llm.py +46 -6
- google/adk/models/base_llm_connection.py +2 -0
- google/adk/models/gemini_llm_connection.py +17 -6
- google/adk/models/google_llm.py +46 -11
- google/adk/models/lite_llm.py +52 -22
- google/adk/plugins/__init__.py +17 -0
- google/adk/plugins/base_plugin.py +317 -0
- google/adk/plugins/plugin_manager.py +265 -0
- google/adk/runners.py +122 -18
- google/adk/sessions/database_session_service.py +51 -52
- google/adk/sessions/vertex_ai_session_service.py +27 -12
- google/adk/tools/__init__.py +2 -0
- google/adk/tools/_automatic_function_calling_util.py +20 -2
- google/adk/tools/agent_tool.py +15 -3
- google/adk/tools/apihub_tool/apihub_toolset.py +38 -39
- google/adk/tools/application_integration_tool/application_integration_toolset.py +35 -37
- google/adk/tools/application_integration_tool/integration_connector_tool.py +2 -3
- google/adk/tools/base_tool.py +9 -9
- google/adk/tools/base_toolset.py +29 -5
- google/adk/tools/bigquery/__init__.py +3 -3
- google/adk/tools/bigquery/metadata_tool.py +2 -0
- google/adk/tools/bigquery/query_tool.py +15 -1
- google/adk/tools/computer_use/__init__.py +13 -0
- google/adk/tools/computer_use/base_computer.py +265 -0
- google/adk/tools/computer_use/computer_use_tool.py +166 -0
- google/adk/tools/computer_use/computer_use_toolset.py +220 -0
- google/adk/tools/enterprise_search_tool.py +4 -2
- google/adk/tools/exit_loop_tool.py +1 -0
- google/adk/tools/google_api_tool/google_api_tool.py +16 -1
- google/adk/tools/google_api_tool/google_api_toolset.py +9 -7
- google/adk/tools/google_api_tool/google_api_toolsets.py +41 -20
- google/adk/tools/google_search_tool.py +4 -2
- google/adk/tools/langchain_tool.py +16 -6
- google/adk/tools/long_running_tool.py +21 -0
- google/adk/tools/mcp_tool/mcp_toolset.py +27 -28
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_spec_parser.py +5 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +8 -8
- google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +4 -6
- google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +3 -2
- google/adk/tools/tool_context.py +0 -10
- google/adk/tools/url_context_tool.py +4 -2
- google/adk/tools/vertex_ai_search_tool.py +4 -2
- google/adk/utils/model_name_utils.py +90 -0
- google/adk/version.py +1 -1
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/METADATA +3 -2
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/RECORD +108 -91
- google/adk/cli/browser/main-RXDVX3K6.js +0 -3914
- google/adk/cli/browser/polyfills-FFHMD2TL.js +0 -17
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/WHEEL +0 -0
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/entry_points.txt +0 -0
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/licenses/LICENSE +0 -0

google/adk/evaluation/final_response_match_v2.py

@@ -21,7 +21,7 @@ from typing import Optional
 from typing_extensions import override
 
 from ..models.llm_response import LlmResponse
-from ..utils.feature_decorator import
+from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
   return label
 
 
-@
+@experimental
 class FinalResponseMatchV2Evaluator(LlmAsJudge):
   """V2 final response match evaluator which uses an LLM to judge responses.
 

google/adk/evaluation/gcs_eval_sets_manager.py

@@ -23,6 +23,7 @@ from google.cloud import exceptions as cloud_exceptions
 from google.cloud import storage
 from typing_extensions import override
 
+from ..errors.not_found_error import NotFoundError
 from ._eval_sets_manager_utils import add_eval_case_to_eval_set
 from ._eval_sets_manager_utils import delete_eval_case_from_eval_set
 from ._eval_sets_manager_utils import get_eval_case_from_eval_set
@@ -130,7 +131,7 @@ class GcsEvalSetsManager(EvalSetsManager):
         eval_sets.append(eval_set_id)
       return sorted(eval_sets)
     except cloud_exceptions.NotFound as e:
-      raise
+      raise NotFoundError(
           f"App `{app_name}` not found in GCS bucket `{self.bucket_name}`."
       ) from e
 
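The two hunks above make GcsEvalSetsManager.list_eval_sets raise ADK's own NotFoundError rather than letting the GCS client exception propagate. A minimal sketch of the caller-visible effect, assuming the manager is constructed with a bucket name; the bucket and app names are placeholders:

from google.adk.errors.not_found_error import NotFoundError
from google.adk.evaluation.gcs_eval_sets_manager import GcsEvalSetsManager

# Assumed constructor argument; "my-eval-bucket" is a placeholder.
manager = GcsEvalSetsManager(bucket_name="my-eval-bucket")

try:
  eval_set_ids = manager.list_eval_sets(app_name="demo_app")  # placeholder app
except NotFoundError:
  # Callers can now catch ADK's NotFoundError instead of depending on
  # google.cloud.exceptions.NotFound leaking through.
  eval_set_ids = []
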

google/adk/evaluation/in_memory_eval_sets_manager.py (new file)

@@ -0,0 +1,151 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import time
+from typing import Optional
+
+from typing_extensions import override
+
+from ..errors.not_found_error import NotFoundError
+from .eval_case import EvalCase
+from .eval_set import EvalSet
+from .eval_sets_manager import EvalSetsManager
+
+
+class InMemoryEvalSetsManager(EvalSetsManager):
+  """An in-memory implementation of EvalSetsManager using dictionaries.
+
+  You can use this class:
+  1) As a part of your testcase.
+  2) For cases where other implementations of EvalSetsManager are too expensive
+     to use.
+  """
+
+  def __init__(self):
+    # {app_name: {eval_set_id: EvalSet}}
+    self._eval_sets: dict[str, dict[str, EvalSet]] = {}
+    # {app_name: {eval_set_id: {eval_case_id: EvalCase}}}
+    self._eval_cases: dict[str, dict[str, dict[str, EvalCase]]] = {}
+
+  def _ensure_app_exists(self, app_name: str):
+    if app_name not in self._eval_sets:
+      self._eval_sets[app_name] = {}
+      self._eval_cases[app_name] = {}
+
+  @override
+  def get_eval_set(self, app_name: str, eval_set_id: str) -> Optional[EvalSet]:
+    self._ensure_app_exists(app_name)
+    return self._eval_sets[app_name].get(eval_set_id, None)
+
+  @override
+  def create_eval_set(self, app_name: str, eval_set_id: str):
+    self._ensure_app_exists(app_name)
+    if eval_set_id in self._eval_sets[app_name]:
+      raise ValueError(
+          f"EvalSet {eval_set_id} already exists for app {app_name}."
+      )
+
+    new_eval_set = EvalSet(
+        eval_set_id=eval_set_id,
+        eval_cases=[],
+        creation_timestamp=time.time(),
+    )
+    self._eval_sets[app_name][eval_set_id] = new_eval_set
+    self._eval_cases[app_name][eval_set_id] = {}
+
+  @override
+  def list_eval_sets(self, app_name: str) -> list[str]:
+    if app_name not in self._eval_sets:
+      return []
+
+    return list(self._eval_sets[app_name].keys())
+
+  @override
+  def get_eval_case(
+      self, app_name: str, eval_set_id: str, eval_case_id: str
+  ) -> Optional[EvalCase]:
+    if app_name not in self._eval_cases:
+      return None
+    if eval_set_id not in self._eval_cases[app_name]:
+      return None
+    return self._eval_cases[app_name][eval_set_id].get(eval_case_id)
+
+  @override
+  def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
+    self._ensure_app_exists(app_name)
+    if eval_set_id not in self._eval_sets[app_name]:
+      raise NotFoundError(
+          f"EvalSet {eval_set_id} not found for app {app_name}."
+      )
+    if eval_case.eval_id in self._eval_cases[app_name][eval_set_id]:
+      raise ValueError(
+          f"EvalCase {eval_case.eval_id} already exists in EvalSet"
+          f" {eval_set_id} for app {app_name}."
+      )
+
+    self._eval_cases[app_name][eval_set_id][eval_case.eval_id] = eval_case
+    # Also update the list in the EvalSet object
+    self._eval_sets[app_name][eval_set_id].eval_cases.append(eval_case)
+
+  @override
+  def update_eval_case(
+      self, app_name: str, eval_set_id: str, updated_eval_case: EvalCase
+  ):
+    self._ensure_app_exists(app_name)
+    if eval_set_id not in self._eval_sets[app_name]:
+      raise NotFoundError(
+          f"EvalSet {eval_set_id} not found for app {app_name}."
+      )
+    if updated_eval_case.eval_id not in self._eval_cases[app_name][eval_set_id]:
+      raise NotFoundError(
+          f"EvalCase {updated_eval_case.eval_id} not found in EvalSet"
+          f" {eval_set_id} for app {app_name}."
+      )
+
+    # Full replace
+    self._eval_cases[app_name][eval_set_id][
+        updated_eval_case.eval_id
+    ] = updated_eval_case
+
+    # Update the list in the EvalSet object
+    eval_set = self._eval_sets[app_name][eval_set_id]
+    for i, case in enumerate(eval_set.eval_cases):
+      if case.eval_id == updated_eval_case.eval_id:
+        eval_set.eval_cases[i] = updated_eval_case
+        break
+
+  @override
+  def delete_eval_case(
+      self, app_name: str, eval_set_id: str, eval_case_id: str
+  ):
+    self._ensure_app_exists(app_name)
+    if eval_set_id not in self._eval_sets[app_name]:
+      raise NotFoundError(
+          f"EvalSet {eval_set_id} not found for app {app_name}."
+      )
+    if eval_case_id not in self._eval_cases[app_name][eval_set_id]:
+      raise NotFoundError(
+          f"EvalCase {eval_case_id} not found in EvalSet {eval_set_id}"
+          f" for app {app_name}."
+      )
+
+    del self._eval_cases[app_name][eval_set_id][eval_case_id]
+
+    # Remove from the list in the EvalSet object
+    eval_set = self._eval_sets[app_name][eval_set_id]
+    eval_set.eval_cases = [
+        case for case in eval_set.eval_cases if case.eval_id != eval_case_id
+    ]
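The new in_memory_eval_sets_manager.py above adds a dictionary-backed EvalSetsManager intended for tests and cheap local runs. A short usage sketch; the app, eval-set, and case identifiers are made up, and it assumes a minimal EvalCase can be built from just eval_id and an empty conversation:

from google.adk.evaluation.eval_case import EvalCase
from google.adk.evaluation.in_memory_eval_sets_manager import InMemoryEvalSetsManager

manager = InMemoryEvalSetsManager()
manager.create_eval_set(app_name="demo_app", eval_set_id="smoke_tests")

# Assumes eval_id and conversation are enough for a minimal EvalCase; real
# cases carry Invocation objects in `conversation`.
case = EvalCase(eval_id="case_1", conversation=[])
manager.add_eval_case(app_name="demo_app", eval_set_id="smoke_tests", eval_case=case)

assert manager.list_eval_sets("demo_app") == ["smoke_tests"]
assert manager.get_eval_case("demo_app", "smoke_tests", "case_1") is case
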

google/adk/evaluation/local_eval_service.py (new file)

@@ -0,0 +1,389 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import asyncio
+import inspect
+import logging
+from typing import AsyncGenerator
+from typing import Callable
+from typing import Optional
+import uuid
+
+from typing_extensions import override
+
+from ..agents import BaseAgent
+from ..artifacts.base_artifact_service import BaseArtifactService
+from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
+from ..errors.not_found_error import NotFoundError
+from ..sessions.base_session_service import BaseSessionService
+from ..sessions.in_memory_session_service import InMemorySessionService
+from ..utils.feature_decorator import experimental
+from .base_eval_service import BaseEvalService
+from .base_eval_service import EvaluateConfig
+from .base_eval_service import EvaluateRequest
+from .base_eval_service import InferenceRequest
+from .base_eval_service import InferenceResult
+from .base_eval_service import InferenceStatus
+from .eval_case import Invocation
+from .eval_metrics import EvalMetric
+from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultPerInvocation
+from .eval_result import EvalCaseResult
+from .eval_set import EvalCase
+from .eval_set_results_manager import EvalSetResultsManager
+from .eval_sets_manager import EvalSetsManager
+from .evaluation_generator import EvaluationGenerator
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
+from .metric_evaluator_registry import MetricEvaluatorRegistry
+
+logger = logging.getLogger('google_adk.' + __name__)
+
+EVAL_SESSION_ID_PREFIX = '___eval___session___'
+
+
+def _get_session_id() -> str:
+  return f'{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}'
+
+
+@experimental
+class LocalEvalService(BaseEvalService):
+  """An implementation of BaseEvalService, that runs the evals locally."""
+
+  def __init__(
+      self,
+      root_agent: BaseAgent,
+      eval_sets_manager: EvalSetsManager,
+      metric_evaluator_registry: MetricEvaluatorRegistry = DEFAULT_METRIC_EVALUATOR_REGISTRY,
+      session_service: BaseSessionService = InMemorySessionService(),
+      artifact_service: BaseArtifactService = InMemoryArtifactService(),
+      eval_set_results_manager: Optional[EvalSetResultsManager] = None,
+      session_id_supplier: Callable[[], str] = _get_session_id,
+  ):
+    self._root_agent = root_agent
+    self._eval_sets_manager = eval_sets_manager
+    self._metric_evaluator_registry = metric_evaluator_registry
+    self._session_service = session_service
+    self._artifact_service = artifact_service
+    self._eval_set_results_manager = eval_set_results_manager
+    self._session_id_supplier = session_id_supplier
+
+  @override
+  async def perform_inference(
+      self,
+      inference_request: InferenceRequest,
+  ) -> AsyncGenerator[InferenceResult, None]:
+    """Returns InferenceResult obtained from the Agent as and when they are available.
+
+    Args:
+      inference_request: The request for generating inferences.
+    """
+    # Get the eval set from the storage.
+    eval_set = self._eval_sets_manager.get_eval_set(
+        app_name=inference_request.app_name,
+        eval_set_id=inference_request.eval_set_id,
+    )
+
+    if not eval_set:
+      raise NotFoundError(
+          f'Eval set with id {inference_request.eval_set_id} not found for app'
+          f' {inference_request.app_name}'
+      )
+
+    # Select eval cases for which we need to run inferencing. If the inference
+    # request specified eval cases, then we use only those.
+    eval_cases = eval_set.eval_cases
+    if inference_request.eval_case_ids:
+      eval_cases = [
+          eval_case
+          for eval_case in eval_cases
+          if eval_case.eval_id in inference_request.eval_case_ids
+      ]
+
+    root_agent = self._root_agent.clone()
+
+    semaphore = asyncio.Semaphore(
+        value=inference_request.inference_config.parallelism
+    )
+
+    async def run_inference(eval_case):
+      async with semaphore:
+        return await self._perform_inference_sigle_eval_item(
+            app_name=inference_request.app_name,
+            eval_set_id=inference_request.eval_set_id,
+            eval_case=eval_case,
+            root_agent=root_agent,
+        )
+
+    inference_results = [run_inference(eval_case) for eval_case in eval_cases]
+    for inference_result in asyncio.as_completed(inference_results):
+      yield await inference_result
+
+  @override
+  async def evaluate(
+      self,
+      evaluate_request: EvaluateRequest,
+  ) -> AsyncGenerator[EvalCaseResult, None]:
+    """Returns EvalCaseResult for each item as and when they are available.
+
+    Args:
+      evaluate_request: The request to perform metric evaluations on the
+        inferences.
+    """
+    semaphore = asyncio.Semaphore(
+        value=evaluate_request.evaluate_config.parallelism
+    )
+
+    async def run_evaluation(inference_result):
+      async with semaphore:
+        return await self._evaluate_single_inference_result(
+            inference_result=inference_result,
+            evaluate_config=evaluate_request.evaluate_config,
+        )
+
+    evaluation_tasks = [
+        run_evaluation(inference_result)
+        for inference_result in evaluate_request.inference_results
+    ]
+
+    for evaluation_task in asyncio.as_completed(evaluation_tasks):
+      inference_result, eval_case_result = await evaluation_task
+
+      if self._eval_set_results_manager:
+        self._eval_set_results_manager.save_eval_set_result(
+            app_name=inference_result.app_name,
+            eval_set_id=inference_result.eval_set_id,
+            eval_case_results=[eval_case_result],
+        )
+
+      yield eval_case_result
+
+  async def _evaluate_single_inference_result(
+      self, inference_result: InferenceResult, evaluate_config: EvaluateConfig
+  ) -> tuple[InferenceResult, EvalCaseResult]:
+    """Returns EvalCaseResult for the given inference result.
+
+    A single inference result can have multiple invocations. For each
+    invocaiton, this method evaluates the metrics present in evaluate config.
+
+    The EvalCaseResult contains scores for each metric per invocation and the
+    overall score.
+    """
+    eval_case = self._eval_sets_manager.get_eval_case(
+        app_name=inference_result.app_name,
+        eval_set_id=inference_result.eval_set_id,
+        eval_case_id=inference_result.eval_case_id,
+    )
+
+    if eval_case is None:
+      raise NotFoundError(
+          f'Eval case with id {inference_result.eval_case_id} not found for'
+          f' app {inference_result.app_name} and eval set'
+          f' {inference_result.eval_set_id}.'
+      )
+
+    # Metric results for each invocation
+    eval_metric_result_per_invocation = []
+
+    # We also keep track of the overall score for a metric, derived from all
+    # invocation. For example, if we were keeping track the metric that compares
+    # how well is the final resposne as compared to a golden answer, then each
+    # invocation will have the value of this metric. We will also have an
+    # overall score using aggregation strategy across all invocations. This
+    # would be the score for the eval case.
+    overall_eval_metric_results = []
+
+    if len(inference_result.inferences) != len(eval_case.conversation):
+      raise ValueError(
+          'Inferences should match conversations in eval case. Found'
+          f'{len(inference_result.inferences)} inferences '
+          f'{len(eval_case.conversation)} conversations in eval cases.'
+      )
+
+    # Pre-creating the EvalMetricResults entries for each invocation.
+    for actual, expected in zip(
+        inference_result.inferences, eval_case.conversation
+    ):
+      eval_metric_result_per_invocation.append(
+          EvalMetricResultPerInvocation(
+              actual_invocation=actual,
+              expected_invocation=expected,
+              # We will fill this as we evaluate each metric per invocation.
+              eval_metric_results=[],
+          )
+      )
+
+    for eval_metric in evaluate_config.eval_metrics:
+      # Perform evaluation of the metric.
+      evaluation_result = await self._evaluate_metric(
+          eval_metric=eval_metric,
+          actual_invocations=inference_result.inferences,
+          expected_invocations=eval_case.conversation,
+      )
+
+      # Track overall scrore across all invocations.
+      overall_eval_metric_results.append(
+          EvalMetricResult(
+              metric_name=eval_metric.metric_name,
+              threshold=eval_metric.threshold,
+              score=evaluation_result.overall_score,
+              eval_status=evaluation_result.overall_eval_status,
+          )
+      )
+
+      if len(evaluation_result.per_invocation_results) != len(
+          eval_metric_result_per_invocation
+      ):
+        raise ValueError(
+            'Eval metric should return results for each invocation. Found '
+            f'{len(evaluation_result.per_invocation_results)} results for '
+            f'{len(eval_metric_result_per_invocation)} invocations.'
+        )
+
+      # Track score across individual invocations.
+      for invocation_result, invocation in zip(
+          evaluation_result.per_invocation_results,
+          eval_metric_result_per_invocation,
+      ):
+        invocation.eval_metric_results.append(
+            EvalMetricResult(
+                metric_name=eval_metric.metric_name,
+                threshold=eval_metric.threshold,
+                score=invocation_result.score,
+                eval_status=invocation_result.eval_status,
+            )
+        )
+
+    final_eval_status = self._generate_final_eval_status(
+        overall_eval_metric_results
+    )
+    user_id = (
+        eval_case.session_input.user_id
+        if eval_case.session_input and eval_case.session_input.user_id
+        else 'test_user_id'
+    )
+
+    eval_case_result = EvalCaseResult(
+        eval_set_file=inference_result.eval_set_id,
+        eval_set_id=inference_result.eval_set_id,
+        eval_id=inference_result.eval_case_id,
+        final_eval_status=final_eval_status,
+        overall_eval_metric_results=overall_eval_metric_results,
+        eval_metric_result_per_invocation=eval_metric_result_per_invocation,
+        session_id=inference_result.session_id,
+        session_details=await self._session_service.get_session(
+            app_name=inference_result.app_name,
+            user_id=user_id,
+            session_id=inference_result.session_id,
+        ),
+        user_id=user_id,
+    )
+
+    return (inference_result, eval_case_result)
+
+  async def _evaluate_metric(
+      self,
+      eval_metric: EvalMetric,
+      actual_invocations: list[Invocation],
+      expected_invocations: list[Invocation],
+  ) -> EvaluationResult:
+    """Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""
+
+    # Get the metric evaluator from the registry.
+    metric_evaluator = self._metric_evaluator_registry.get_evaluator(
+        eval_metric=eval_metric
+    )
+
+    if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+      # Some evaluators could be async, for example those that use llm as a
+      # judge, so we need to make sure that we wait on them.
+      return await metric_evaluator.evaluate_invocations(
+          actual_invocations=actual_invocations,
+          expected_invocations=expected_invocations,
+      )
+    else:
+      # Metrics that perform computation synchronously, mostly these don't
+      # perform any i/o. An example of this would calculation of rouge_1 score.
+      return metric_evaluator.evaluate_invocations(
+          actual_invocations=actual_invocations,
+          expected_invocations=expected_invocations,
+      )
+
+  def _generate_final_eval_status(
+      self, overall_eval_metric_results: list[EvalMetricResult]
+  ) -> EvalStatus:
+    final_eval_status = EvalStatus.NOT_EVALUATED
+    # Go over the all the eval statuses and mark the final eval status as
+    # passed if all of them pass, otherwise mark the final eval status to
+    # failed.
+    for overall_eval_metric_result in overall_eval_metric_results:
+      overall_eval_status = overall_eval_metric_result.eval_status
+      if overall_eval_status == EvalStatus.PASSED:
+        final_eval_status = EvalStatus.PASSED
+      elif overall_eval_status == EvalStatus.NOT_EVALUATED:
+        continue
+      elif overall_eval_status == EvalStatus.FAILED:
+        final_eval_status = EvalStatus.FAILED
+        break
+      else:
+        raise ValueError(f'Unknown eval status: {overall_eval_status}.')
+
+    return final_eval_status
+
+  async def _perform_inference_sigle_eval_item(
+      self,
+      app_name: str,
+      eval_set_id: str,
+      eval_case: EvalCase,
+      root_agent: BaseAgent,
+  ) -> InferenceResult:
+    initial_session = eval_case.session_input
+    session_id = self._session_id_supplier()
+    inference_result = InferenceResult(
+        app_name=app_name,
+        eval_set_id=eval_set_id,
+        eval_case_id=eval_case.eval_id,
+        session_id=session_id,
+    )
+
+    try:
+      inferences = (
+          await EvaluationGenerator._generate_inferences_from_root_agent(
+              invocations=eval_case.conversation,
+              root_agent=root_agent,
+              initial_session=initial_session,
+              session_id=session_id,
+              session_service=self._session_service,
+              artifact_service=self._artifact_service,
+          )
+      )
+
+      inference_result.inferences = inferences
+      inference_result.status = InferenceStatus.SUCCESS
+
+      return inference_result
+    except Exception as e:
+      # We intentionally catch the Exception as we don't failures to affect
+      # other inferences.
+      logger.error(
+          'Inference failed for eval case `%s` with error %s',
+          eval_case.eval_id,
+          e,
+      )
+      inference_result.status = InferenceStatus.FAILURE
+      inference_result.error_message = str(e)
+      return inference_result
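The new local_eval_service.py above wires inference and metric evaluation together behind the BaseEvalService interface: perform_inference runs the cloned root agent over the eval set with bounded parallelism, and evaluate scores the resulting inferences and optionally persists EvalCaseResults. A rough sketch of driving it end to end; the request field names mirror how this file reads them (app_name, eval_set_id, inference_results), but whether InferenceRequest and EvaluateRequest supply default configs is an assumption to verify against base_eval_service.py, and the app and eval-set names are placeholders:

import asyncio

from google.adk.evaluation.base_eval_service import EvaluateRequest
from google.adk.evaluation.base_eval_service import InferenceRequest
from google.adk.evaluation.local_eval_service import LocalEvalService


async def run_local_eval(root_agent, eval_sets_manager) -> None:
  service = LocalEvalService(
      root_agent=root_agent,
      eval_sets_manager=eval_sets_manager,
  )

  # Step 1: run the agent over the eval set and collect InferenceResults.
  # inference_config is omitted on the assumption that it has a default;
  # pass one explicitly if InferenceRequest requires it.
  inference_results = []
  async for inference_result in service.perform_inference(
      inference_request=InferenceRequest(
          app_name="demo_app",        # placeholder
          eval_set_id="smoke_tests",  # placeholder
      )
  ):
    inference_results.append(inference_result)

  # Step 2: score the collected inferences; same assumption for evaluate_config.
  async for eval_case_result in service.evaluate(
      evaluate_request=EvaluateRequest(inference_results=inference_results)
  ):
    print(eval_case_result.eval_id, eval_case_result.final_eval_status)

# asyncio.run(run_local_eval(my_root_agent, my_eval_sets_manager))
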

google/adk/evaluation/local_eval_set_results_manager.py

@@ -60,7 +60,7 @@ class LocalEvalSetResultsManager(EvalSetResultsManager):
         eval_set_result.eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION,
     )
     logger.info("Writing eval result to file: %s", eval_set_result_file_path)
-    with open(eval_set_result_file_path, "w") as f:
+    with open(eval_set_result_file_path, "w", encoding="utf-8") as f:
       f.write(json.dumps(eval_set_result_json, indent=2))
 
   @override
@@ -78,7 +78,7 @@ class LocalEvalSetResultsManager(EvalSetResultsManager):
     )
     if not os.path.exists(maybe_eval_result_file_path):
       raise NotFoundError(f"Eval set result `{eval_set_result_id}` not found.")
-    with open(maybe_eval_result_file_path, "r") as file:
+    with open(maybe_eval_result_file_path, "r", encoding="utf-8") as file:
       eval_result_data = json.load(file)
     return EvalSetResult.model_validate_json(eval_result_data)
 

google/adk/evaluation/local_eval_sets_manager.py

@@ -27,6 +27,7 @@ from google.genai import types as genai_types
 from pydantic import ValidationError
 from typing_extensions import override
 
+from ..errors.not_found_error import NotFoundError
 from ._eval_sets_manager_utils import add_eval_case_to_eval_set
 from ._eval_sets_manager_utils import delete_eval_case_from_eval_set
 from ._eval_sets_manager_utils import get_eval_case_from_eval_set
@@ -226,16 +227,30 @@ class LocalEvalSetsManager(EvalSetsManager):
 
   @override
   def list_eval_sets(self, app_name: str) -> list[str]:
-    """Returns a list of EvalSets that belong to the given app_name.
+    """Returns a list of EvalSets that belong to the given app_name.
+
+    Args:
+      app_name: The app name to list the eval sets for.
+
+    Returns:
+      A list of EvalSet ids.
+
+    Raises:
+      NotFoundError: If the eval directory for the app is not found.
+    """
     eval_set_file_path = os.path.join(self._agents_dir, app_name)
     eval_sets = []
-
-
-
-
-
-
-
+    try:
+      for file in os.listdir(eval_set_file_path):
+        if file.endswith(_EVAL_SET_FILE_EXTENSION):
+          eval_sets.append(
+              os.path.basename(file).removesuffix(_EVAL_SET_FILE_EXTENSION)
+          )
+      return sorted(eval_sets)
+    except FileNotFoundError as e:
+      raise NotFoundError(
+          f"Eval directory for app `{app_name}` not found."
+      ) from e
 
   @override
   def get_eval_case(
@@ -300,7 +315,7 @@ class LocalEvalSetsManager(EvalSetsManager):
     )
 
   def _write_eval_set_to_path(self, eval_set_path: str, eval_set: EvalSet):
-    with open(eval_set_path, "w") as f:
+    with open(eval_set_path, "w", encoding="utf-8") as f:
      f.write(eval_set.model_dump_json(indent=2))
 
   def _save_eval_set(self, app_name: str, eval_set_id: str, eval_set: EvalSet):
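With the list_eval_sets change above, a missing eval directory now surfaces as ADK's NotFoundError rather than a raw FileNotFoundError. A hedged sketch, assuming LocalEvalSetsManager is constructed with an agents_dir; the directory and app name are placeholders:

from google.adk.errors.not_found_error import NotFoundError
from google.adk.evaluation.local_eval_sets_manager import LocalEvalSetsManager

# Assumed constructor argument; "/tmp/agents" is a placeholder path.
manager = LocalEvalSetsManager(agents_dir="/tmp/agents")

try:
  print(manager.list_eval_sets(app_name="no_such_app"))
except NotFoundError as e:
  # The underlying FileNotFoundError from os.listdir is chained via `from e`.
  print(f"eval directory missing: {e}")
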

google/adk/evaluation/metric_evaluator_registry.py

@@ -21,7 +21,9 @@ from .eval_metrics import EvalMetric
 from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
+from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   metric_evaluator_registry = MetricEvaluatorRegistry()
 
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
-      evaluator=
+      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      evaluator=TrajectoryEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
-      evaluator=
+      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
-      evaluator=
+      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      evaluator=ResponseEvaluator,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      evaluator=SafetyEvaluatorV1,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      evaluator=FinalResponseMatchV2Evaluator,
   )
 
   return metric_evaluator_registry
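The registry hunks above key the default evaluators by the string value of the PrebuiltMetrics enum and wire in the new SafetyEvaluatorV1 and FinalResponseMatchV2Evaluator. A small lookup sketch showing what that implies for callers; it assumes EvalMetric is constructed from metric_name and threshold, and the threshold value is illustrative only:

from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.metric_evaluator_registry import (
    DEFAULT_METRIC_EVALUATOR_REGISTRY,
)

# Metrics are now registered under PrebuiltMetrics.<NAME>.value, so a metric
# name given as a plain string resolves through the default registry.
metric = EvalMetric(
    metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
    threshold=0.5,  # illustrative, not a recommended value
)

# get_evaluator(eval_metric=...) is the same call LocalEvalService makes above.
evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(eval_metric=metric)
print(evaluator)  # resolves to the FinalResponseMatchV2Evaluator registered above
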