google-adk 1.6.1__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- google/adk/a2a/converters/event_converter.py +5 -85
- google/adk/a2a/converters/request_converter.py +1 -2
- google/adk/a2a/executor/a2a_agent_executor.py +45 -16
- google/adk/a2a/logs/log_utils.py +1 -2
- google/adk/a2a/utils/__init__.py +0 -0
- google/adk/a2a/utils/agent_card_builder.py +544 -0
- google/adk/a2a/utils/agent_to_a2a.py +118 -0
- google/adk/agents/__init__.py +5 -0
- google/adk/agents/agent_config.py +46 -0
- google/adk/agents/base_agent.py +239 -41
- google/adk/agents/callback_context.py +41 -0
- google/adk/agents/common_configs.py +79 -0
- google/adk/agents/config_agent_utils.py +184 -0
- google/adk/agents/config_schemas/AgentConfig.json +566 -0
- google/adk/agents/invocation_context.py +5 -1
- google/adk/agents/live_request_queue.py +15 -0
- google/adk/agents/llm_agent.py +201 -9
- google/adk/agents/loop_agent.py +35 -1
- google/adk/agents/parallel_agent.py +24 -3
- google/adk/agents/remote_a2a_agent.py +17 -5
- google/adk/agents/sequential_agent.py +22 -1
- google/adk/artifacts/gcs_artifact_service.py +110 -20
- google/adk/auth/auth_handler.py +3 -3
- google/adk/auth/credential_manager.py +23 -23
- google/adk/auth/credential_service/base_credential_service.py +6 -6
- google/adk/auth/credential_service/in_memory_credential_service.py +10 -8
- google/adk/auth/credential_service/session_state_credential_service.py +8 -8
- google/adk/auth/exchanger/oauth2_credential_exchanger.py +3 -3
- google/adk/auth/oauth2_credential_util.py +2 -2
- google/adk/auth/refresher/oauth2_credential_refresher.py +4 -4
- google/adk/cli/agent_graph.py +3 -1
- google/adk/cli/browser/index.html +2 -2
- google/adk/cli/browser/main-W7QZBYAR.js +3914 -0
- google/adk/cli/browser/polyfills-B6TNHZQ6.js +17 -0
- google/adk/cli/cli_eval.py +87 -12
- google/adk/cli/cli_tools_click.py +143 -82
- google/adk/cli/fast_api.py +150 -69
- google/adk/cli/utils/agent_loader.py +35 -1
- google/adk/code_executors/base_code_executor.py +14 -19
- google/adk/code_executors/built_in_code_executor.py +4 -1
- google/adk/evaluation/base_eval_service.py +46 -2
- google/adk/evaluation/eval_metrics.py +4 -0
- google/adk/evaluation/eval_sets_manager.py +5 -1
- google/adk/evaluation/evaluation_generator.py +1 -1
- google/adk/evaluation/final_response_match_v2.py +2 -2
- google/adk/evaluation/gcs_eval_sets_manager.py +2 -1
- google/adk/evaluation/in_memory_eval_sets_manager.py +151 -0
- google/adk/evaluation/local_eval_service.py +389 -0
- google/adk/evaluation/local_eval_set_results_manager.py +2 -2
- google/adk/evaluation/local_eval_sets_manager.py +24 -9
- google/adk/evaluation/metric_evaluator_registry.py +16 -6
- google/adk/evaluation/vertex_ai_eval_facade.py +7 -1
- google/adk/events/event.py +7 -2
- google/adk/flows/llm_flows/auto_flow.py +6 -11
- google/adk/flows/llm_flows/base_llm_flow.py +66 -29
- google/adk/flows/llm_flows/contents.py +16 -10
- google/adk/flows/llm_flows/functions.py +89 -52
- google/adk/memory/in_memory_memory_service.py +21 -15
- google/adk/memory/vertex_ai_memory_bank_service.py +12 -10
- google/adk/models/anthropic_llm.py +46 -6
- google/adk/models/base_llm_connection.py +2 -0
- google/adk/models/gemini_llm_connection.py +17 -6
- google/adk/models/google_llm.py +46 -11
- google/adk/models/lite_llm.py +52 -22
- google/adk/plugins/__init__.py +17 -0
- google/adk/plugins/base_plugin.py +317 -0
- google/adk/plugins/plugin_manager.py +265 -0
- google/adk/runners.py +122 -18
- google/adk/sessions/database_session_service.py +51 -52
- google/adk/sessions/vertex_ai_session_service.py +27 -12
- google/adk/tools/__init__.py +2 -0
- google/adk/tools/_automatic_function_calling_util.py +20 -2
- google/adk/tools/agent_tool.py +15 -3
- google/adk/tools/apihub_tool/apihub_toolset.py +38 -39
- google/adk/tools/application_integration_tool/application_integration_toolset.py +35 -37
- google/adk/tools/application_integration_tool/integration_connector_tool.py +2 -3
- google/adk/tools/base_tool.py +9 -9
- google/adk/tools/base_toolset.py +29 -5
- google/adk/tools/bigquery/__init__.py +3 -3
- google/adk/tools/bigquery/metadata_tool.py +2 -0
- google/adk/tools/bigquery/query_tool.py +15 -1
- google/adk/tools/computer_use/__init__.py +13 -0
- google/adk/tools/computer_use/base_computer.py +265 -0
- google/adk/tools/computer_use/computer_use_tool.py +166 -0
- google/adk/tools/computer_use/computer_use_toolset.py +220 -0
- google/adk/tools/enterprise_search_tool.py +4 -2
- google/adk/tools/exit_loop_tool.py +1 -0
- google/adk/tools/google_api_tool/google_api_tool.py +16 -1
- google/adk/tools/google_api_tool/google_api_toolset.py +9 -7
- google/adk/tools/google_api_tool/google_api_toolsets.py +41 -20
- google/adk/tools/google_search_tool.py +4 -2
- google/adk/tools/langchain_tool.py +16 -6
- google/adk/tools/long_running_tool.py +21 -0
- google/adk/tools/mcp_tool/mcp_toolset.py +27 -28
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_spec_parser.py +5 -0
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +8 -8
- google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +4 -6
- google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +3 -2
- google/adk/tools/tool_context.py +0 -10
- google/adk/tools/url_context_tool.py +4 -2
- google/adk/tools/vertex_ai_search_tool.py +4 -2
- google/adk/utils/model_name_utils.py +90 -0
- google/adk/version.py +1 -1
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/METADATA +3 -2
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/RECORD +108 -91
- google/adk/cli/browser/main-RXDVX3K6.js +0 -3914
- google/adk/cli/browser/polyfills-FFHMD2TL.js +0 -17
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/WHEEL +0 -0
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/entry_points.txt +0 -0
- {google_adk-1.6.1.dist-info → google_adk-1.8.0.dist-info}/licenses/LICENSE +0 -0

google/adk/evaluation/final_response_match_v2.py

@@ -21,7 +21,7 @@ from typing import Optional
 from typing_extensions import override
 
 from ..models.llm_response import LlmResponse
-from ..utils.feature_decorator import
+from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
   return label
 
 
-@
+@experimental
 class FinalResponseMatchV2Evaluator(LlmAsJudge):
   """V2 final response match evaluator which uses an LLM to judge responses.
 

google/adk/evaluation/gcs_eval_sets_manager.py

@@ -23,6 +23,7 @@ from google.cloud import exceptions as cloud_exceptions
 from google.cloud import storage
 from typing_extensions import override
 
+from ..errors.not_found_error import NotFoundError
 from ._eval_sets_manager_utils import add_eval_case_to_eval_set
 from ._eval_sets_manager_utils import delete_eval_case_from_eval_set
 from ._eval_sets_manager_utils import get_eval_case_from_eval_set
@@ -130,7 +131,7 @@ class GcsEvalSetsManager(EvalSetsManager):
         eval_sets.append(eval_set_id)
       return sorted(eval_sets)
     except cloud_exceptions.NotFound as e:
-      raise
+      raise NotFoundError(
           f"App `{app_name}` not found in GCS bucket `{self.bucket_name}`."
       ) from e
 
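The two hunks above make GcsEvalSetsManager.list_eval_sets raise ADK's own NotFoundError rather than letting the GCS client exception propagate. A minimal sketch of the caller-visible effect, assuming the manager is constructed with a bucket name; the bucket and app names are placeholders:

from google.adk.errors.not_found_error import NotFoundError
from google.adk.evaluation.gcs_eval_sets_manager import GcsEvalSetsManager

# Assumed constructor argument; "my-eval-bucket" is a placeholder.
manager = GcsEvalSetsManager(bucket_name="my-eval-bucket")

try:
  eval_set_ids = manager.list_eval_sets(app_name="demo_app")  # placeholder app
except NotFoundError:
  # Callers can now catch ADK's NotFoundError instead of depending on
  # google.cloud.exceptions.NotFound leaking through.
  eval_set_ids = []
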

google/adk/evaluation/in_memory_eval_sets_manager.py (new file)

@@ -0,0 +1,151 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import time
+from typing import Optional
+
+from typing_extensions import override
+
+from ..errors.not_found_error import NotFoundError
+from .eval_case import EvalCase
+from .eval_set import EvalSet
+from .eval_sets_manager import EvalSetsManager
+
+
+class InMemoryEvalSetsManager(EvalSetsManager):
+  """An in-memory implementation of EvalSetsManager using dictionaries.
+
+  You can use this class:
+  1) As a part of your testcase.
+  2) For cases where other implementations of EvalSetsManager are too expensive
+     to use.
+  """
+
+  def __init__(self):
+    # {app_name: {eval_set_id: EvalSet}}
+    self._eval_sets: dict[str, dict[str, EvalSet]] = {}
+    # {app_name: {eval_set_id: {eval_case_id: EvalCase}}}
+    self._eval_cases: dict[str, dict[str, dict[str, EvalCase]]] = {}
+
+  def _ensure_app_exists(self, app_name: str):
+    if app_name not in self._eval_sets:
+      self._eval_sets[app_name] = {}
+      self._eval_cases[app_name] = {}
+
+  @override
+  def get_eval_set(self, app_name: str, eval_set_id: str) -> Optional[EvalSet]:
+    self._ensure_app_exists(app_name)
+    return self._eval_sets[app_name].get(eval_set_id, None)
+
+  @override
+  def create_eval_set(self, app_name: str, eval_set_id: str):
+    self._ensure_app_exists(app_name)
+    if eval_set_id in self._eval_sets[app_name]:
+      raise ValueError(
+          f"EvalSet {eval_set_id} already exists for app {app_name}."
+      )
+
+    new_eval_set = EvalSet(
+        eval_set_id=eval_set_id,
+        eval_cases=[],
+        creation_timestamp=time.time(),
+    )
+    self._eval_sets[app_name][eval_set_id] = new_eval_set
+    self._eval_cases[app_name][eval_set_id] = {}
+
+  @override
+  def list_eval_sets(self, app_name: str) -> list[str]:
+    if app_name not in self._eval_sets:
+      return []
+
+    return list(self._eval_sets[app_name].keys())
+
+  @override
+  def get_eval_case(
+      self, app_name: str, eval_set_id: str, eval_case_id: str
+  ) -> Optional[EvalCase]:
+    if app_name not in self._eval_cases:
+      return None
+    if eval_set_id not in self._eval_cases[app_name]:
+      return None
+    return self._eval_cases[app_name][eval_set_id].get(eval_case_id)
+
+  @override
+  def add_eval_case(self, app_name: str, eval_set_id: str, eval_case: EvalCase):
+    self._ensure_app_exists(app_name)
+    if eval_set_id not in self._eval_sets[app_name]:
+      raise NotFoundError(
+          f"EvalSet {eval_set_id} not found for app {app_name}."
+      )
+    if eval_case.eval_id in self._eval_cases[app_name][eval_set_id]:
+      raise ValueError(
+          f"EvalCase {eval_case.eval_id} already exists in EvalSet"
+          f" {eval_set_id} for app {app_name}."
+      )
+
+    self._eval_cases[app_name][eval_set_id][eval_case.eval_id] = eval_case
+    # Also update the list in the EvalSet object
+    self._eval_sets[app_name][eval_set_id].eval_cases.append(eval_case)
+
+  @override
+  def update_eval_case(
+      self, app_name: str, eval_set_id: str, updated_eval_case: EvalCase
+  ):
+    self._ensure_app_exists(app_name)
+    if eval_set_id not in self._eval_sets[app_name]:
+      raise NotFoundError(
+          f"EvalSet {eval_set_id} not found for app {app_name}."
+      )
+    if updated_eval_case.eval_id not in self._eval_cases[app_name][eval_set_id]:
+      raise NotFoundError(
+          f"EvalCase {updated_eval_case.eval_id} not found in EvalSet"
+          f" {eval_set_id} for app {app_name}."
+      )
+
+    # Full replace
+    self._eval_cases[app_name][eval_set_id][
+        updated_eval_case.eval_id
+    ] = updated_eval_case
+
+    # Update the list in the EvalSet object
+    eval_set = self._eval_sets[app_name][eval_set_id]
+    for i, case in enumerate(eval_set.eval_cases):
+      if case.eval_id == updated_eval_case.eval_id:
+        eval_set.eval_cases[i] = updated_eval_case
+        break
+
+  @override
+  def delete_eval_case(
+      self, app_name: str, eval_set_id: str, eval_case_id: str
+  ):
+    self._ensure_app_exists(app_name)
+    if eval_set_id not in self._eval_sets[app_name]:
+      raise NotFoundError(
+          f"EvalSet {eval_set_id} not found for app {app_name}."
+      )
+    if eval_case_id not in self._eval_cases[app_name][eval_set_id]:
+      raise NotFoundError(
+          f"EvalCase {eval_case_id} not found in EvalSet {eval_set_id}"
+          f" for app {app_name}."
+      )
+
+    del self._eval_cases[app_name][eval_set_id][eval_case_id]
+
+    # Remove from the list in the EvalSet object
+    eval_set = self._eval_sets[app_name][eval_set_id]
+    eval_set.eval_cases = [
+        case for case in eval_set.eval_cases if case.eval_id != eval_case_id
+    ]
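The new in_memory_eval_sets_manager.py above adds a dictionary-backed EvalSetsManager intended for tests and cheap local runs. A short usage sketch; the app, eval-set, and case identifiers are made up, and it assumes a minimal EvalCase can be built from just eval_id and an empty conversation:

from google.adk.evaluation.eval_case import EvalCase
from google.adk.evaluation.in_memory_eval_sets_manager import InMemoryEvalSetsManager

manager = InMemoryEvalSetsManager()
manager.create_eval_set(app_name="demo_app", eval_set_id="smoke_tests")

# Assumes eval_id and conversation are enough for a minimal EvalCase; real
# cases carry Invocation objects in `conversation`.
case = EvalCase(eval_id="case_1", conversation=[])
manager.add_eval_case(app_name="demo_app", eval_set_id="smoke_tests", eval_case=case)

assert manager.list_eval_sets("demo_app") == ["smoke_tests"]
assert manager.get_eval_case("demo_app", "smoke_tests", "case_1") is case
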

google/adk/evaluation/local_eval_service.py (new file)

@@ -0,0 +1,389 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import asyncio
+import inspect
+import logging
+from typing import AsyncGenerator
+from typing import Callable
+from typing import Optional
+import uuid
+
+from typing_extensions import override
+
+from ..agents import BaseAgent
+from ..artifacts.base_artifact_service import BaseArtifactService
+from ..artifacts.in_memory_artifact_service import InMemoryArtifactService
+from ..errors.not_found_error import NotFoundError
+from ..sessions.base_session_service import BaseSessionService
+from ..sessions.in_memory_session_service import InMemorySessionService
+from ..utils.feature_decorator import experimental
+from .base_eval_service import BaseEvalService
+from .base_eval_service import EvaluateConfig
+from .base_eval_service import EvaluateRequest
+from .base_eval_service import InferenceRequest
+from .base_eval_service import InferenceResult
+from .base_eval_service import InferenceStatus
+from .eval_case import Invocation
+from .eval_metrics import EvalMetric
+from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultPerInvocation
+from .eval_result import EvalCaseResult
+from .eval_set import EvalCase
+from .eval_set_results_manager import EvalSetResultsManager
+from .eval_sets_manager import EvalSetsManager
+from .evaluation_generator import EvaluationGenerator
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
+from .metric_evaluator_registry import MetricEvaluatorRegistry
+
+logger = logging.getLogger('google_adk.' + __name__)
+
+EVAL_SESSION_ID_PREFIX = '___eval___session___'
+
+
+def _get_session_id() -> str:
+  return f'{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}'
+
+
+@experimental
+class LocalEvalService(BaseEvalService):
+  """An implementation of BaseEvalService, that runs the evals locally."""
+
+  def __init__(
+      self,
+      root_agent: BaseAgent,
+      eval_sets_manager: EvalSetsManager,
+      metric_evaluator_registry: MetricEvaluatorRegistry = DEFAULT_METRIC_EVALUATOR_REGISTRY,
+      session_service: BaseSessionService = InMemorySessionService(),
+      artifact_service: BaseArtifactService = InMemoryArtifactService(),
+      eval_set_results_manager: Optional[EvalSetResultsManager] = None,
+      session_id_supplier: Callable[[], str] = _get_session_id,
+  ):
+    self._root_agent = root_agent
+    self._eval_sets_manager = eval_sets_manager
+    self._metric_evaluator_registry = metric_evaluator_registry
+    self._session_service = session_service
+    self._artifact_service = artifact_service
+    self._eval_set_results_manager = eval_set_results_manager
+    self._session_id_supplier = session_id_supplier
+
+  @override
+  async def perform_inference(
+      self,
+      inference_request: InferenceRequest,
+  ) -> AsyncGenerator[InferenceResult, None]:
+    """Returns InferenceResult obtained from the Agent as and when they are available.
+
+    Args:
+      inference_request: The request for generating inferences.
+    """
+    # Get the eval set from the storage.
+    eval_set = self._eval_sets_manager.get_eval_set(
+        app_name=inference_request.app_name,
+        eval_set_id=inference_request.eval_set_id,
+    )
+
+    if not eval_set:
+      raise NotFoundError(
+          f'Eval set with id {inference_request.eval_set_id} not found for app'
+          f' {inference_request.app_name}'
+      )
+
+    # Select eval cases for which we need to run inferencing. If the inference
+    # request specified eval cases, then we use only those.
+    eval_cases = eval_set.eval_cases
+    if inference_request.eval_case_ids:
+      eval_cases = [
+          eval_case
+          for eval_case in eval_cases
+          if eval_case.eval_id in inference_request.eval_case_ids
+      ]
+
+    root_agent = self._root_agent.clone()
+
+    semaphore = asyncio.Semaphore(
+        value=inference_request.inference_config.parallelism
+    )
+
+    async def run_inference(eval_case):
+      async with semaphore:
+        return await self._perform_inference_sigle_eval_item(
+            app_name=inference_request.app_name,
+            eval_set_id=inference_request.eval_set_id,
+            eval_case=eval_case,
+            root_agent=root_agent,
+        )
+
+    inference_results = [run_inference(eval_case) for eval_case in eval_cases]
+    for inference_result in asyncio.as_completed(inference_results):
+      yield await inference_result
+
+  @override
+  async def evaluate(
+      self,
+      evaluate_request: EvaluateRequest,
+  ) -> AsyncGenerator[EvalCaseResult, None]:
+    """Returns EvalCaseResult for each item as and when they are available.
+
+    Args:
+      evaluate_request: The request to perform metric evaluations on the
+        inferences.
+    """
+    semaphore = asyncio.Semaphore(
+        value=evaluate_request.evaluate_config.parallelism
+    )
+
+    async def run_evaluation(inference_result):
+      async with semaphore:
+        return await self._evaluate_single_inference_result(
+            inference_result=inference_result,
+            evaluate_config=evaluate_request.evaluate_config,
+        )
+
+    evaluation_tasks = [
+        run_evaluation(inference_result)
+        for inference_result in evaluate_request.inference_results
+    ]
+
+    for evaluation_task in asyncio.as_completed(evaluation_tasks):
+      inference_result, eval_case_result = await evaluation_task
+
+      if self._eval_set_results_manager:
+        self._eval_set_results_manager.save_eval_set_result(
+            app_name=inference_result.app_name,
+            eval_set_id=inference_result.eval_set_id,
+            eval_case_results=[eval_case_result],
+        )
+
+      yield eval_case_result
+
+  async def _evaluate_single_inference_result(
+      self, inference_result: InferenceResult, evaluate_config: EvaluateConfig
+  ) -> tuple[InferenceResult, EvalCaseResult]:
+    """Returns EvalCaseResult for the given inference result.
+
+    A single inference result can have multiple invocations. For each
+    invocaiton, this method evaluates the metrics present in evaluate config.
+
+    The EvalCaseResult contains scores for each metric per invocation and the
+    overall score.
+    """
+    eval_case = self._eval_sets_manager.get_eval_case(
+        app_name=inference_result.app_name,
+        eval_set_id=inference_result.eval_set_id,
+        eval_case_id=inference_result.eval_case_id,
+    )
+
+    if eval_case is None:
+      raise NotFoundError(
+          f'Eval case with id {inference_result.eval_case_id} not found for'
+          f' app {inference_result.app_name} and eval set'
+          f' {inference_result.eval_set_id}.'
+      )
+
+    # Metric results for each invocation
+    eval_metric_result_per_invocation = []
+
+    # We also keep track of the overall score for a metric, derived from all
+    # invocation. For example, if we were keeping track the metric that compares
+    # how well is the final resposne as compared to a golden answer, then each
+    # invocation will have the value of this metric. We will also have an
+    # overall score using aggregation strategy across all invocations. This
+    # would be the score for the eval case.
+    overall_eval_metric_results = []
+
+    if len(inference_result.inferences) != len(eval_case.conversation):
+      raise ValueError(
+          'Inferences should match conversations in eval case. Found'
+          f'{len(inference_result.inferences)} inferences '
+          f'{len(eval_case.conversation)} conversations in eval cases.'
+      )
+
+    # Pre-creating the EvalMetricResults entries for each invocation.
+    for actual, expected in zip(
+        inference_result.inferences, eval_case.conversation
+    ):
+      eval_metric_result_per_invocation.append(
+          EvalMetricResultPerInvocation(
+              actual_invocation=actual,
+              expected_invocation=expected,
+              # We will fill this as we evaluate each metric per invocation.
+              eval_metric_results=[],
+          )
+      )
+
+    for eval_metric in evaluate_config.eval_metrics:
+      # Perform evaluation of the metric.
+      evaluation_result = await self._evaluate_metric(
+          eval_metric=eval_metric,
+          actual_invocations=inference_result.inferences,
+          expected_invocations=eval_case.conversation,
+      )
+
+      # Track overall scrore across all invocations.
+      overall_eval_metric_results.append(
+          EvalMetricResult(
+              metric_name=eval_metric.metric_name,
+              threshold=eval_metric.threshold,
+              score=evaluation_result.overall_score,
+              eval_status=evaluation_result.overall_eval_status,
+          )
+      )
+
+      if len(evaluation_result.per_invocation_results) != len(
+          eval_metric_result_per_invocation
+      ):
+        raise ValueError(
+            'Eval metric should return results for each invocation. Found '
+            f'{len(evaluation_result.per_invocation_results)} results for '
+            f'{len(eval_metric_result_per_invocation)} invocations.'
+        )
+
+      # Track score across individual invocations.
+      for invocation_result, invocation in zip(
+          evaluation_result.per_invocation_results,
+          eval_metric_result_per_invocation,
+      ):
+        invocation.eval_metric_results.append(
+            EvalMetricResult(
+                metric_name=eval_metric.metric_name,
+                threshold=eval_metric.threshold,
+                score=invocation_result.score,
+                eval_status=invocation_result.eval_status,
+            )
+        )
+
+    final_eval_status = self._generate_final_eval_status(
+        overall_eval_metric_results
+    )
+    user_id = (
+        eval_case.session_input.user_id
+        if eval_case.session_input and eval_case.session_input.user_id
+        else 'test_user_id'
+    )
+
+    eval_case_result = EvalCaseResult(
+        eval_set_file=inference_result.eval_set_id,
+        eval_set_id=inference_result.eval_set_id,
+        eval_id=inference_result.eval_case_id,
+        final_eval_status=final_eval_status,
+        overall_eval_metric_results=overall_eval_metric_results,
+        eval_metric_result_per_invocation=eval_metric_result_per_invocation,
+        session_id=inference_result.session_id,
+        session_details=await self._session_service.get_session(
+            app_name=inference_result.app_name,
+            user_id=user_id,
+            session_id=inference_result.session_id,
+        ),
+        user_id=user_id,
+    )
+
+    return (inference_result, eval_case_result)
+
+  async def _evaluate_metric(
+      self,
+      eval_metric: EvalMetric,
+      actual_invocations: list[Invocation],
+      expected_invocations: list[Invocation],
+  ) -> EvaluationResult:
+    """Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""
+
+    # Get the metric evaluator from the registry.
+    metric_evaluator = self._metric_evaluator_registry.get_evaluator(
+        eval_metric=eval_metric
+    )
+
+    if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+      # Some evaluators could be async, for example those that use llm as a
+      # judge, so we need to make sure that we wait on them.
+      return await metric_evaluator.evaluate_invocations(
+          actual_invocations=actual_invocations,
+          expected_invocations=expected_invocations,
+      )
+    else:
+      # Metrics that perform computation synchronously, mostly these don't
+      # perform any i/o. An example of this would calculation of rouge_1 score.
+      return metric_evaluator.evaluate_invocations(
+          actual_invocations=actual_invocations,
+          expected_invocations=expected_invocations,
+      )
+
+  def _generate_final_eval_status(
+      self, overall_eval_metric_results: list[EvalMetricResult]
+  ) -> EvalStatus:
+    final_eval_status = EvalStatus.NOT_EVALUATED
+    # Go over the all the eval statuses and mark the final eval status as
+    # passed if all of them pass, otherwise mark the final eval status to
+    # failed.
+    for overall_eval_metric_result in overall_eval_metric_results:
+      overall_eval_status = overall_eval_metric_result.eval_status
+      if overall_eval_status == EvalStatus.PASSED:
+        final_eval_status = EvalStatus.PASSED
+      elif overall_eval_status == EvalStatus.NOT_EVALUATED:
+        continue
+      elif overall_eval_status == EvalStatus.FAILED:
+        final_eval_status = EvalStatus.FAILED
+        break
+      else:
+        raise ValueError(f'Unknown eval status: {overall_eval_status}.')
+
+    return final_eval_status
+
+  async def _perform_inference_sigle_eval_item(
+      self,
+      app_name: str,
+      eval_set_id: str,
+      eval_case: EvalCase,
+      root_agent: BaseAgent,
+  ) -> InferenceResult:
+    initial_session = eval_case.session_input
+    session_id = self._session_id_supplier()
+    inference_result = InferenceResult(
+        app_name=app_name,
+        eval_set_id=eval_set_id,
+        eval_case_id=eval_case.eval_id,
+        session_id=session_id,
+    )
+
+    try:
+      inferences = (
+          await EvaluationGenerator._generate_inferences_from_root_agent(
+              invocations=eval_case.conversation,
+              root_agent=root_agent,
+              initial_session=initial_session,
+              session_id=session_id,
+              session_service=self._session_service,
+              artifact_service=self._artifact_service,
+          )
+      )
+
+      inference_result.inferences = inferences
+      inference_result.status = InferenceStatus.SUCCESS
+
+      return inference_result
+    except Exception as e:
+      # We intentionally catch the Exception as we don't failures to affect
+      # other inferences.
+      logger.error(
+          'Inference failed for eval case `%s` with error %s',
+          eval_case.eval_id,
+          e,
+      )
+      inference_result.status = InferenceStatus.FAILURE
+      inference_result.error_message = str(e)
+      return inference_result
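The new local_eval_service.py above wires inference and metric evaluation together behind the BaseEvalService interface: perform_inference runs the cloned root agent over the eval set with bounded parallelism, and evaluate scores the resulting inferences and optionally persists EvalCaseResults. A rough sketch of driving it end to end; the request field names mirror how this file reads them (app_name, eval_set_id, inference_results), but whether InferenceRequest and EvaluateRequest supply default configs is an assumption to verify against base_eval_service.py, and the app and eval-set names are placeholders:

import asyncio

from google.adk.evaluation.base_eval_service import EvaluateRequest
from google.adk.evaluation.base_eval_service import InferenceRequest
from google.adk.evaluation.local_eval_service import LocalEvalService


async def run_local_eval(root_agent, eval_sets_manager) -> None:
  service = LocalEvalService(
      root_agent=root_agent,
      eval_sets_manager=eval_sets_manager,
  )

  # Step 1: run the agent over the eval set and collect InferenceResults.
  # inference_config is omitted on the assumption that it has a default;
  # pass one explicitly if InferenceRequest requires it.
  inference_results = []
  async for inference_result in service.perform_inference(
      inference_request=InferenceRequest(
          app_name="demo_app",        # placeholder
          eval_set_id="smoke_tests",  # placeholder
      )
  ):
    inference_results.append(inference_result)

  # Step 2: score the collected inferences; same assumption for evaluate_config.
  async for eval_case_result in service.evaluate(
      evaluate_request=EvaluateRequest(inference_results=inference_results)
  ):
    print(eval_case_result.eval_id, eval_case_result.final_eval_status)

# asyncio.run(run_local_eval(my_root_agent, my_eval_sets_manager))
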

google/adk/evaluation/local_eval_set_results_manager.py

@@ -60,7 +60,7 @@ class LocalEvalSetResultsManager(EvalSetResultsManager):
         eval_set_result.eval_set_result_name + _EVAL_SET_RESULT_FILE_EXTENSION,
     )
     logger.info("Writing eval result to file: %s", eval_set_result_file_path)
-    with open(eval_set_result_file_path, "w") as f:
+    with open(eval_set_result_file_path, "w", encoding="utf-8") as f:
       f.write(json.dumps(eval_set_result_json, indent=2))
 
   @override
@@ -78,7 +78,7 @@ class LocalEvalSetResultsManager(EvalSetResultsManager):
     )
     if not os.path.exists(maybe_eval_result_file_path):
       raise NotFoundError(f"Eval set result `{eval_set_result_id}` not found.")
-    with open(maybe_eval_result_file_path, "r") as file:
+    with open(maybe_eval_result_file_path, "r", encoding="utf-8") as file:
       eval_result_data = json.load(file)
     return EvalSetResult.model_validate_json(eval_result_data)
 

google/adk/evaluation/local_eval_sets_manager.py

@@ -27,6 +27,7 @@ from google.genai import types as genai_types
 from pydantic import ValidationError
 from typing_extensions import override
 
+from ..errors.not_found_error import NotFoundError
 from ._eval_sets_manager_utils import add_eval_case_to_eval_set
 from ._eval_sets_manager_utils import delete_eval_case_from_eval_set
 from ._eval_sets_manager_utils import get_eval_case_from_eval_set
@@ -226,16 +227,30 @@ class LocalEvalSetsManager(EvalSetsManager):
 
   @override
   def list_eval_sets(self, app_name: str) -> list[str]:
-    """Returns a list of EvalSets that belong to the given app_name.
+    """Returns a list of EvalSets that belong to the given app_name.
+
+    Args:
+      app_name: The app name to list the eval sets for.
+
+    Returns:
+      A list of EvalSet ids.
+
+    Raises:
+      NotFoundError: If the eval directory for the app is not found.
+    """
     eval_set_file_path = os.path.join(self._agents_dir, app_name)
     eval_sets = []
-
-
-
-
-
-
-
+    try:
+      for file in os.listdir(eval_set_file_path):
+        if file.endswith(_EVAL_SET_FILE_EXTENSION):
+          eval_sets.append(
+              os.path.basename(file).removesuffix(_EVAL_SET_FILE_EXTENSION)
+          )
+      return sorted(eval_sets)
+    except FileNotFoundError as e:
+      raise NotFoundError(
+          f"Eval directory for app `{app_name}` not found."
+      ) from e
 
   @override
   def get_eval_case(
@@ -300,7 +315,7 @@ class LocalEvalSetsManager(EvalSetsManager):
     )
 
   def _write_eval_set_to_path(self, eval_set_path: str, eval_set: EvalSet):
-    with open(eval_set_path, "w") as f:
+    with open(eval_set_path, "w", encoding="utf-8") as f:
      f.write(eval_set.model_dump_json(indent=2))
 
   def _save_eval_set(self, app_name: str, eval_set_id: str, eval_set: EvalSet):
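With the list_eval_sets change above, a missing eval directory now surfaces as ADK's NotFoundError rather than a raw FileNotFoundError. A hedged sketch, assuming LocalEvalSetsManager is constructed with an agents_dir; the directory and app name are placeholders:

from google.adk.errors.not_found_error import NotFoundError
from google.adk.evaluation.local_eval_sets_manager import LocalEvalSetsManager

# Assumed constructor argument; "/tmp/agents" is a placeholder path.
manager = LocalEvalSetsManager(agents_dir="/tmp/agents")

try:
  print(manager.list_eval_sets(app_name="no_such_app"))
except NotFoundError as e:
  # The underlying FileNotFoundError from os.listdir is chained via `from e`.
  print(f"eval directory missing: {e}")
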

google/adk/evaluation/metric_evaluator_registry.py

@@ -21,7 +21,9 @@ from .eval_metrics import EvalMetric
 from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
+from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   metric_evaluator_registry = MetricEvaluatorRegistry()
 
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
-      evaluator=
+      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      evaluator=TrajectoryEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
-      evaluator=
+      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
-      evaluator=
+      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      evaluator=ResponseEvaluator,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      evaluator=SafetyEvaluatorV1,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      evaluator=FinalResponseMatchV2Evaluator,
   )
 
   return metric_evaluator_registry
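The registry hunks above key the default evaluators by the string value of the PrebuiltMetrics enum and wire in the new SafetyEvaluatorV1 and FinalResponseMatchV2Evaluator. A small lookup sketch showing what that implies for callers; it assumes EvalMetric is constructed from metric_name and threshold, and the threshold value is illustrative only:

from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.metric_evaluator_registry import (
    DEFAULT_METRIC_EVALUATOR_REGISTRY,
)

# Metrics are now registered under PrebuiltMetrics.<NAME>.value, so a metric
# name given as a plain string resolves through the default registry.
metric = EvalMetric(
    metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
    threshold=0.5,  # illustrative, not a recommended value
)

# get_evaluator(eval_metric=...) is the same call LocalEvalService makes above.
evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(eval_metric=metric)
print(evaluator)  # resolves to the FinalResponseMatchV2Evaluator registered above
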