google-adk 0.5.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff compares the contents of two package versions as publicly released to their respective registries. It is provided for informational purposes only.
- google/adk/agents/base_agent.py +76 -30
- google/adk/agents/callback_context.py +2 -6
- google/adk/agents/llm_agent.py +122 -30
- google/adk/agents/loop_agent.py +1 -1
- google/adk/agents/parallel_agent.py +7 -0
- google/adk/agents/readonly_context.py +8 -0
- google/adk/agents/run_config.py +1 -1
- google/adk/agents/sequential_agent.py +31 -0
- google/adk/agents/transcription_entry.py +4 -2
- google/adk/artifacts/gcs_artifact_service.py +1 -1
- google/adk/artifacts/in_memory_artifact_service.py +1 -1
- google/adk/auth/auth_credential.py +10 -2
- google/adk/auth/auth_preprocessor.py +7 -1
- google/adk/auth/auth_tool.py +3 -4
- google/adk/cli/agent_graph.py +5 -5
- google/adk/cli/browser/index.html +4 -4
- google/adk/cli/browser/{main-ULN5R5I5.js → main-PKDNKWJE.js} +59 -60
- google/adk/cli/browser/polyfills-B6TNHZQ6.js +17 -0
- google/adk/cli/cli.py +10 -9
- google/adk/cli/cli_deploy.py +7 -2
- google/adk/cli/cli_eval.py +109 -115
- google/adk/cli/cli_tools_click.py +179 -67
- google/adk/cli/fast_api.py +248 -197
- google/adk/cli/utils/agent_loader.py +137 -0
- google/adk/cli/utils/cleanup.py +40 -0
- google/adk/cli/utils/common.py +23 -0
- google/adk/cli/utils/evals.py +83 -0
- google/adk/cli/utils/logs.py +8 -5
- google/adk/code_executors/__init__.py +3 -1
- google/adk/code_executors/built_in_code_executor.py +52 -0
- google/adk/code_executors/code_execution_utils.py +2 -1
- google/adk/code_executors/container_code_executor.py +0 -1
- google/adk/code_executors/vertex_ai_code_executor.py +6 -8
- google/adk/evaluation/__init__.py +1 -1
- google/adk/evaluation/agent_evaluator.py +168 -128
- google/adk/evaluation/eval_case.py +104 -0
- google/adk/evaluation/eval_metrics.py +74 -0
- google/adk/evaluation/eval_result.py +86 -0
- google/adk/evaluation/eval_set.py +39 -0
- google/adk/evaluation/eval_set_results_manager.py +47 -0
- google/adk/evaluation/eval_sets_manager.py +43 -0
- google/adk/evaluation/evaluation_generator.py +88 -113
- google/adk/evaluation/evaluator.py +58 -0
- google/adk/evaluation/local_eval_set_results_manager.py +113 -0
- google/adk/evaluation/local_eval_sets_manager.py +264 -0
- google/adk/evaluation/response_evaluator.py +106 -1
- google/adk/evaluation/trajectory_evaluator.py +84 -2
- google/adk/events/event.py +6 -1
- google/adk/events/event_actions.py +6 -1
- google/adk/examples/base_example_provider.py +1 -0
- google/adk/examples/example_util.py +3 -2
- google/adk/flows/llm_flows/_code_execution.py +9 -1
- google/adk/flows/llm_flows/audio_transcriber.py +4 -3
- google/adk/flows/llm_flows/base_llm_flow.py +58 -21
- google/adk/flows/llm_flows/contents.py +3 -1
- google/adk/flows/llm_flows/functions.py +9 -8
- google/adk/flows/llm_flows/instructions.py +18 -80
- google/adk/flows/llm_flows/single_flow.py +2 -2
- google/adk/memory/__init__.py +1 -1
- google/adk/memory/_utils.py +23 -0
- google/adk/memory/base_memory_service.py +23 -21
- google/adk/memory/in_memory_memory_service.py +57 -25
- google/adk/memory/memory_entry.py +37 -0
- google/adk/memory/vertex_ai_rag_memory_service.py +38 -15
- google/adk/models/anthropic_llm.py +16 -9
- google/adk/models/base_llm.py +2 -1
- google/adk/models/base_llm_connection.py +2 -0
- google/adk/models/gemini_llm_connection.py +11 -11
- google/adk/models/google_llm.py +12 -2
- google/adk/models/lite_llm.py +80 -23
- google/adk/models/llm_response.py +16 -3
- google/adk/models/registry.py +1 -1
- google/adk/runners.py +98 -42
- google/adk/sessions/__init__.py +1 -1
- google/adk/sessions/_session_util.py +2 -1
- google/adk/sessions/base_session_service.py +6 -33
- google/adk/sessions/database_session_service.py +57 -67
- google/adk/sessions/in_memory_session_service.py +106 -24
- google/adk/sessions/session.py +3 -0
- google/adk/sessions/vertex_ai_session_service.py +44 -51
- google/adk/telemetry.py +7 -2
- google/adk/tools/__init__.py +4 -7
- google/adk/tools/_memory_entry_utils.py +30 -0
- google/adk/tools/agent_tool.py +10 -10
- google/adk/tools/apihub_tool/apihub_toolset.py +55 -74
- google/adk/tools/apihub_tool/clients/apihub_client.py +10 -3
- google/adk/tools/apihub_tool/clients/secret_client.py +1 -0
- google/adk/tools/application_integration_tool/application_integration_toolset.py +111 -85
- google/adk/tools/application_integration_tool/clients/connections_client.py +28 -1
- google/adk/tools/application_integration_tool/clients/integration_client.py +7 -5
- google/adk/tools/application_integration_tool/integration_connector_tool.py +69 -26
- google/adk/tools/base_toolset.py +96 -0
- google/adk/tools/bigquery/__init__.py +28 -0
- google/adk/tools/bigquery/bigquery_credentials.py +216 -0
- google/adk/tools/bigquery/bigquery_tool.py +116 -0
- google/adk/tools/{built_in_code_execution_tool.py → enterprise_search_tool.py} +17 -11
- google/adk/tools/function_parameter_parse_util.py +9 -2
- google/adk/tools/function_tool.py +33 -3
- google/adk/tools/get_user_choice_tool.py +1 -0
- google/adk/tools/google_api_tool/__init__.py +24 -70
- google/adk/tools/google_api_tool/google_api_tool.py +12 -6
- google/adk/tools/google_api_tool/{google_api_tool_set.py → google_api_toolset.py} +57 -55
- google/adk/tools/google_api_tool/google_api_toolsets.py +108 -0
- google/adk/tools/google_api_tool/googleapi_to_openapi_converter.py +40 -42
- google/adk/tools/google_search_tool.py +2 -2
- google/adk/tools/langchain_tool.py +96 -49
- google/adk/tools/load_memory_tool.py +14 -5
- google/adk/tools/mcp_tool/__init__.py +3 -2
- google/adk/tools/mcp_tool/conversion_utils.py +6 -2
- google/adk/tools/mcp_tool/mcp_session_manager.py +80 -69
- google/adk/tools/mcp_tool/mcp_tool.py +35 -32
- google/adk/tools/mcp_tool/mcp_toolset.py +99 -194
- google/adk/tools/openapi_tool/auth/credential_exchangers/base_credential_exchanger.py +1 -3
- google/adk/tools/openapi_tool/auth/credential_exchangers/service_account_exchanger.py +6 -7
- google/adk/tools/openapi_tool/common/common.py +5 -1
- google/adk/tools/openapi_tool/openapi_spec_parser/__init__.py +7 -2
- google/adk/tools/openapi_tool/openapi_spec_parser/openapi_toolset.py +27 -7
- google/adk/tools/openapi_tool/openapi_spec_parser/operation_parser.py +36 -32
- google/adk/tools/openapi_tool/openapi_spec_parser/rest_api_tool.py +11 -1
- google/adk/tools/openapi_tool/openapi_spec_parser/tool_auth_handler.py +1 -1
- google/adk/tools/preload_memory_tool.py +27 -18
- google/adk/tools/retrieval/__init__.py +1 -1
- google/adk/tools/retrieval/vertex_ai_rag_retrieval.py +1 -1
- google/adk/tools/toolbox_toolset.py +107 -0
- google/adk/tools/transfer_to_agent_tool.py +0 -1
- google/adk/utils/__init__.py +13 -0
- google/adk/utils/instructions_utils.py +131 -0
- google/adk/version.py +1 -1
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/METADATA +18 -19
- google_adk-1.1.0.dist-info/RECORD +200 -0
- google/adk/agents/remote_agent.py +0 -50
- google/adk/cli/browser/polyfills-FFHMD2TL.js +0 -18
- google/adk/cli/fast_api.py.orig +0 -728
- google/adk/tools/google_api_tool/google_api_tool_sets.py +0 -112
- google/adk/tools/toolbox_tool.py +0 -46
- google_adk-0.5.0.dist-info/RECORD +0 -180
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/WHEEL +0 -0
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/entry_points.txt +0 -0
- {google_adk-0.5.0.dist-info → google_adk-1.1.0.dist-info}/licenses/LICENSE +0 -0
google/adk/evaluation/agent_evaluator.py

@@ -13,16 +13,30 @@
 # limitations under the License.

 import json
+import logging
 import os
 from os import path
+from typing import Any
 from typing import Dict
 from typing import List
+from typing import Optional
 from typing import Union
+import uuid

+from pydantic import ValidationError
+
+from .eval_set import EvalSet
 from .evaluation_generator import EvaluationGenerator
+from .evaluator import EvalStatus
+from .evaluator import EvaluationResult
+from .evaluator import Evaluator
+from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
 from .response_evaluator import ResponseEvaluator
 from .trajectory_evaluator import TrajectoryEvaluator

+logger = logging.getLogger("google_adk." + __name__)
+
+
 # Constants for default runs and evaluation criteria
 NUM_RUNS = 2
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"

@@ -76,12 +90,67 @@ class AgentEvaluator:
     return DEFAULT_CRITERIA

   @staticmethod
-  async def evaluate(
-      agent_module,
-      eval_dataset_file_path_or_dir,
+  async def evaluate_eval_set(
+      agent_module: str,
+      eval_set: EvalSet,
+      criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
-      initial_session_file=None,
+  ):
+    """Evaluates an agent using the given EvalSet.
+
+    Args:
+      agent_module: The path to python module that contains the definition of
+        the agent. There is convention in place here, where the code is going to
+        look for 'root_agent' in the loaded module.
+      eval_set: The eval set.
+      criteria: Evauation criterias, a dictionary of metric names to their
+        respective thresholds.
+      num_runs: Number of times all entries in the eval dataset should be
+        assessed.
+      agent_name: The name of the agent.
+    """
+    eval_case_responses_list = await EvaluationGenerator.generate_responses(
+        eval_set=eval_set,
+        agent_module_path=agent_module,
+        repeat_num=num_runs,
+        agent_name=agent_name,
+    )
+
+    for eval_case_responses in eval_case_responses_list:
+      actual_invocations = [
+          invocation
+          for invocations in eval_case_responses.responses
+          for invocation in invocations
+      ]
+      expected_invocations = (
+          eval_case_responses.eval_case.conversation * num_runs
+      )
+
+      for metric_name, threshold in criteria.items():
+        metric_evaluator = AgentEvaluator._get_metric_evaluator(
+            metric_name=metric_name, threshold=threshold
+        )
+
+        evaluation_result: EvaluationResult = (
+            metric_evaluator.evaluate_invocations(
+                actual_invocations=actual_invocations,
+                expected_invocations=expected_invocations,
+            )
+        )
+
+        assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
+            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+            f" but got {evaluation_result.overall_score}."
+        )
+
+  @staticmethod
+  async def evaluate(
+      agent_module: str,
+      eval_dataset_file_path_or_dir: str,
+      num_runs: int = NUM_RUNS,
+      agent_name: Optional[str] = None,
+      initial_session_file: Optional[str] = None,
   ):
     """Evaluates an Agent given eval data.


@@ -109,35 +178,102 @@ class AgentEvaluator:
     else:
       test_files = [eval_dataset_file_path_or_dir]

- …
-    if initial_session_file:
-      with open(initial_session_file, "r") as f:
-        initial_session_state = json.loads(f.read())["state"]
+    initial_session = AgentEvaluator._get_initial_session(initial_session_file)

     for test_file in test_files:
-      dataset = AgentEvaluator._load_dataset(test_file)[0]
       criteria = AgentEvaluator.find_config_for_test_file(test_file)
+      eval_set = AgentEvaluator._load_eval_set_from_file(
+          test_file, criteria, initial_session
+      )

-      AgentEvaluator.…
- …
-          num_runs,
+      await AgentEvaluator.evaluate_eval_set(
+          agent_module=agent_module,
+          eval_set=eval_set,
+          criteria=criteria,
+          num_runs=num_runs,
           agent_name=agent_name,
-          initial_session={"state": initial_session_state},
       )

- …
+  @staticmethod
+  def migrate_eval_data_to_new_schema(
+      old_eval_data_file: str,
+      new_eval_data_file: str,
+      initial_session_file: Optional[str] = None,
+  ):
+    """A utility for migrating eval data to new schema backed by EvalSet."""
+    if not old_eval_data_file or not new_eval_data_file:
+      raise ValueError(
+          "One of old_eval_data_file or new_eval_data_file is empty."
+      )
+
+    criteria = AgentEvaluator.find_config_for_test_file(old_eval_data_file)
+    initial_session = AgentEvaluator._get_initial_session(initial_session_file)
+
+    eval_set = AgentEvaluator._get_eval_set_from_old_format(
+        old_eval_data_file, criteria, initial_session
+    )

- …
+    with open(new_eval_data_file, "w") as f:
+      f.write(eval_set.model_dump_json(indent=2))
+
+  @staticmethod
+  def _load_eval_set_from_file(
+      eval_set_file: str,
+      criteria: dict[str, float],
+      initial_session: dict[str, Any],
+  ) -> EvalSet:
+    """Loads an EvalSet from the given file."""
+    if os.path.isfile(eval_set_file):
+      with open(eval_set_file, "r", encoding="utf-8") as f:
+        content = f.read()
+
+      try:
+        eval_set = EvalSet.model_validate_json(content)
+        assert len(initial_session) == 0, (
+            "Intial session should be specified as a part of EvalSet file."
+            " Explicit initial session is only needed, when specifying data in"
+            " the older schema."
+        )
+        return eval_set
+      except ValidationError:
+        # We assume that the eval data was specified in the old format
+        logger.warning(
+            f"Contents of {eval_set_file} appear to be in older format.To avoid"
+            " this warning, please update your test files to contain data in"
+            " EvalSet schema. You can use `migrate_eval_data_to_new_schema`"
+            " for migrating your old test files."
         )

+    # If we are here, the data must be specified in the older format.
+    return AgentEvaluator._get_eval_set_from_old_format(
+        eval_set_file, criteria, initial_session
+    )
+
+  @staticmethod
+  def _get_eval_set_from_old_format(
+      eval_set_file: str,
+      criteria: dict[str, float],
+      initial_session: dict[str, Any],
+  ) -> EvalSet:
+    data = AgentEvaluator._load_dataset(eval_set_file)[0]
+    AgentEvaluator._validate_input([data], criteria)
+    eval_data = {
+        "name": eval_set_file,
+        "data": data,
+        "initial_session": initial_session,
+    }
+    return convert_eval_set_to_pydanctic_schema(
+        eval_set_id=str(uuid.uuid4()), eval_set_in_json_format=[eval_data]
+    )
+
+  @staticmethod
+  def _get_initial_session(initial_session_file: Optional[str] = None):
+    initial_session = {}
+    if initial_session_file:
+      with open(initial_session_file, "r") as f:
+        initial_session = json.loads(f.read())
+    return initial_session
+
   @staticmethod
   def _load_dataset(
       input_data: Union[str, List[str], List[Dict], List[List[Dict]]],

@@ -221,109 +357,13 @@ class AgentEvaluator:
     )

   @staticmethod
-  def …
- …
-    """
-    inferred_criteria = {}
-    sample = eval_dataset[0][0]
-
-    if QUERY_COLUMN in sample and EXPECTED_TOOL_USE_COLUMN in sample:
-      inferred_criteria[TOOL_TRAJECTORY_SCORE_KEY] = DEFAULT_CRITERIA[
-          TOOL_TRAJECTORY_SCORE_KEY
-      ]
-
-    if QUERY_COLUMN in sample and REFERENCE_COLUMN in sample:
-      inferred_criteria[RESPONSE_MATCH_SCORE_KEY] = DEFAULT_CRITERIA[
-          RESPONSE_MATCH_SCORE_KEY
-      ]
-
-    return inferred_criteria
-
-  @staticmethod
-  async def _generate_responses(
-      agent_module, eval_dataset, num_runs, agent_name=None, initial_session={}
-  ):
-    """Generates evaluation responses by running the agent module multiple times."""
-    return EvaluationGenerator.generate_responses(
-        eval_dataset,
-        agent_module,
-        repeat_num=num_runs,
-        agent_name=agent_name,
-        initial_session=initial_session,
-    )
-
-  @staticmethod
-  def _generate_responses_from_session(eval_dataset, session_path):
-    """Generates evaluation responses by running the agent module multiple times."""
-    return EvaluationGenerator.generate_responses_from_session(
-        session_path, eval_dataset
-    )
-
-  @staticmethod
-  def _response_evaluation_required(criteria, eval_dataset):
-    """Checks if response evaluation are needed."""
-    return REFERENCE_COLUMN in eval_dataset[0][0] and any(
-        key in criteria
-        for key in [RESPONSE_EVALUATION_SCORE_KEY, RESPONSE_MATCH_SCORE_KEY]
-    )
-
-  @staticmethod
-  def _trajectory_evaluation_required(evaluation_criteria, eval_dataset):
-    """Checks if response evaluation are needed."""
-    return (
-        EXPECTED_TOOL_USE_COLUMN in eval_dataset[0][0]
-        and TOOL_TRAJECTORY_SCORE_KEY in evaluation_criteria
-    )
-
-  @staticmethod
-  def _evaluate_response_scores(agent_module, evaluation_response, criteria):
-    """Evaluates response scores and raises an assertion error if they don't meet the criteria."""
-    metrics = ResponseEvaluator.evaluate(
-        evaluation_response, criteria, print_detailed_results=True
-    )
-
-    AgentEvaluator._assert_score(
-        metrics,
-        "coherence/mean",
-        criteria.get(RESPONSE_EVALUATION_SCORE_KEY),
-        "Average response evaluation score",
-        agent_module,
-    )
-
-    AgentEvaluator._assert_score(
-        metrics,
-        "rouge_1/mean",
-        criteria.get(RESPONSE_MATCH_SCORE_KEY),
-        "Average response match score",
-        agent_module,
-    )
-
-  @staticmethod
-  def _evaluate_tool_trajectory(agent_module, evaluation_response, criteria):
-    """Evaluates tool trajectory scores and raises an assertion error if they don't meet the criteria."""
-    score = TrajectoryEvaluator.evaluate(
-        evaluation_response, print_detailed_results=True
-    )
-    AgentEvaluator._assert_score(
-        {TOOL_TRAJECTORY_SCORE_KEY: score},
-        TOOL_TRAJECTORY_SCORE_KEY,
-        criteria[TOOL_TRAJECTORY_SCORE_KEY],
-        "Average tool trajectory evaluation score",
-        agent_module,
-    )
+  def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
+      return TrajectoryEvaluator(threshold=threshold)
+    elif (
+        metric_name == RESPONSE_MATCH_SCORE_KEY
+        or metric_name == RESPONSE_EVALUATION_SCORE_KEY
+    ):
+      return ResponseEvaluator(threshold=threshold, metric_name=metric_name)

- …
-  def _assert_score(metrics, metric_key, threshold, description, agent_module):
-    """Asserts that a metric meets the specified threshold."""
-    if metric_key in metrics:
-      actual_score = metrics[metric_key]
-      assert actual_score >= threshold, (
-        f"{description} for {agent_module} is lower than expected. "
-        f"Expected >= {threshold}, but got {actual_score}."
-      )
+    raise ValueError(f"Unsupported eval metric: {metric_name}")
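Taken together, these hunks replace the old column-based evaluation flow with an EvalSet-backed one and add a migration helper. A minimal usage sketch, not taken from the package docs: the module name `my_agent` and the file paths are hypothetical, and the agent module is assumed to expose a `root_agent`, as the docstring above describes.

```python
import asyncio

from google.adk.evaluation.agent_evaluator import AgentEvaluator


async def main():
  # One-time migration of an old-schema test file to the EvalSet schema
  # (both paths are hypothetical placeholders).
  AgentEvaluator.migrate_eval_data_to_new_schema(
      old_eval_data_file="tests/my_agent.test.json",
      new_eval_data_file="tests/my_agent.evalset.json",
  )

  # Runs the agent num_runs times per eval case and asserts that every
  # configured metric reaches its threshold.
  await AgentEvaluator.evaluate(
      agent_module="my_agent",
      eval_dataset_file_path_or_dir="tests/my_agent.evalset.json",
      num_runs=2,
  )


if __name__ == "__main__":
  asyncio.run(main())
```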
google/adk/evaluation/eval_case.py (new file)

@@ -0,0 +1,104 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any
+from typing import Optional
+from typing import Tuple
+
+from google.genai import types as genai_types
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+
+class EvalBaseModel(BaseModel):
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+
+class IntermediateData(EvalBaseModel):
+  """Container for intermediate data that an agent would generate as it responds with a final answer."""
+
+  tool_uses: list[genai_types.FunctionCall] = []
+  """Tool use trajectory in chronological order."""
+
+  intermediate_responses: list[Tuple[str, list[genai_types.Part]]] = []
+  """Intermediate responses generated by sub-agents to convey progress or status
+  in a multi-agent system, distinct from the final response.
+
+  This is expressed as a Tuple of:
+    - Author: Usually the sub-agent name that generated the intermediate
+      response.
+
+    - A list of Parts that comprise of the response.
+  """
+
+
+class Invocation(EvalBaseModel):
+  """Represents a single invocation."""
+
+  invocation_id: str = ''
+  """Unique identifier for the invocation."""
+
+  user_content: genai_types.Content
+  """Content provided by the user in this invocation."""
+
+  final_response: Optional[genai_types.Content] = None
+  """Final response from the agent."""
+
+  intermediate_data: Optional[IntermediateData] = None
+  """Intermediate steps generated as a part of Agent execution.
+
+  For a multi-agent system, it is also helpful to inspect the route that
+  the agent took to generate final response.
+  """
+
+  creation_timestamp: float = 0.0
+  """Timestamp for the current invocation, primarily intended for debugging purposes."""
+
+
+class SessionInput(EvalBaseModel):
+  """Values that help initialize a Session."""
+
+  app_name: str
+  """The name of the app."""
+
+  user_id: str
+  """The user id."""
+
+  state: dict[str, Any] = Field(default_factory=dict)
+  """The state of the session."""
+
+
+class EvalCase(EvalBaseModel):
+  """An eval case."""
+
+  eval_id: str
+  """Unique identifier for the evaluation case."""
+
+  conversation: list[Invocation]
+  """A conversation between the user and the Agent. The conversation can have any number of invocations."""
+
+  session_input: Optional[SessionInput] = None
+  """Session input that will be passed on to the Agent during eval.
+  It is common for Agents state to be initialized to some initial/default value,
+  for example, your agent may need to know today's date.
+  """
+
+  creation_timestamp: float = 0.0
+  """The time at which this eval case was created."""
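To make the schema concrete, here is a sketch of building one EvalCase by hand. The identifiers and message text are illustrative values only; the field names and camelCase serialization follow the model definitions above.

```python
from google.adk.evaluation.eval_case import EvalCase
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_case import SessionInput
from google.genai import types as genai_types

# A single-turn conversation: one user message and the expected final response.
case = EvalCase(
    eval_id="greeting_case",
    conversation=[
        Invocation(
            invocation_id="inv-1",
            user_content=genai_types.Content(
                role="user", parts=[genai_types.Part(text="What can you do?")]
            ),
            final_response=genai_types.Content(
                role="model", parts=[genai_types.Part(text="I can answer questions.")]
            ),
        )
    ],
    session_input=SessionInput(app_name="demo_app", user_id="user-1"),
)

# EvalBaseModel applies a camelCase alias generator, so serialized keys come out
# as evalId, userContent, finalResponse, and so on.
print(case.model_dump_json(by_alias=True, indent=2))
```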
google/adk/evaluation/eval_metrics.py (new file)

@@ -0,0 +1,74 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+
+from .eval_case import Invocation
+from .evaluator import EvalStatus
+
+
+class EvalMetric(BaseModel):
+  """A metric used to evaluate a particular aspect of an eval case."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  metric_name: str
+  """The name of the metric."""
+
+  threshold: float
+  """A threshold value. Each metric decides how to interpret this threshold."""
+
+
+class EvalMetricResult(EvalMetric):
+  """The actual computed score/value of a particular EvalMetric."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  score: Optional[float] = None
+  eval_status: EvalStatus
+
+
+class EvalMetricResultPerInvocation(BaseModel):
+  """Eval metric results per invocation."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  actual_invocation: Invocation
+  """The actual invocation, usually obtained by inferencing the agent."""
+
+  expected_invocation: Invocation
+  """The expected invocation, usually the reference or golden invocation."""
+
+  eval_metric_results: list[EvalMetricResult] = []
+  """Eval resutls for each applicable metric."""
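A small sketch of how these models fit together. `EvalStatus.PASSED` and the "tool_trajectory_avg_score" string come from the code shown elsewhere in this diff; the "response_match_score" string is an assumption about the value of the corresponding constant in agent_evaluator.py.

```python
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import EvalMetricResult
from google.adk.evaluation.evaluator import EvalStatus

# Declare which metric to run and the threshold it must reach.
metric = EvalMetric(metric_name="tool_trajectory_avg_score", threshold=1.0)

# Record the computed score for that metric on one eval case.
result = EvalMetricResult(
    metric_name=metric.metric_name,
    threshold=metric.threshold,
    score=1.0,
    eval_status=EvalStatus.PASSED,
)

# populate_by_name plus the camelCase alias generator means both spellings parse.
parsed = EvalMetric.model_validate(
    {"metricName": "response_match_score", "threshold": 0.8}
)
print(parsed.threshold)
```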
google/adk/evaluation/eval_result.py (new file)

@@ -0,0 +1,86 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from ..sessions.session import Session
+from .eval_metrics import EvalMetric
+from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultPerInvocation
+from .evaluator import EvalStatus
+
+
+class EvalCaseResult(BaseModel):
+  """Case level evaluation results."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  eval_set_file: str = Field(
+      deprecated=True,
+      description="This field is deprecated, use eval_set_id instead.",
+  )
+  eval_set_id: str = ""
+  """The eval set id."""
+
+  eval_id: str = ""
+  """The eval case id."""
+
+  final_eval_status: EvalStatus
+  """Final eval status for this eval case."""
+
+  eval_metric_results: list[tuple[EvalMetric, EvalMetricResult]] = Field(
+      deprecated=True,
+      description=(
+          "This field is deprecated, use overall_eval_metric_results instead."
+      ),
+  )
+
+  overall_eval_metric_results: list[EvalMetricResult]
+  """Overall result for each metric for the entire eval case."""
+
+  eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation]
+  """Result for each metric on a per invocation basis."""
+
+  session_id: str
+  """Session id of the session generated as result of inferencing/scraping stage of the eval."""
+
+  session_details: Optional[Session] = None
+  """Session generated as result of inferencing/scraping stage of the eval."""
+
+  user_id: Optional[str] = None
+  """User id used during inferencing/scraping stage of the eval."""
+
+
+class EvalSetResult(BaseModel):
+  """Eval set level evaluation results."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+  eval_set_result_id: str
+  eval_set_result_name: str
+  eval_set_id: str
+  eval_case_results: list[EvalCaseResult] = Field(default_factory=list)
+  creation_timestamp: float = 0.0
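As a quick illustration of the aggregate container, here is a sketch that builds an empty EvalSetResult; the id and name strings are placeholder values, and per-case EvalCaseResult entries would be appended as an eval run produces them.

```python
import time

from google.adk.evaluation.eval_result import EvalSetResult

# Aggregate container for one evaluation run over an eval set.
set_result = EvalSetResult(
    eval_set_result_id="demo_app_demo_set_1700000000",
    eval_set_result_name="demo_set run",
    eval_set_id="demo_set",
    creation_timestamp=time.time(),
)

# Serialized with camelCase keys because of the alias generator above.
print(set_result.model_dump_json(by_alias=True, indent=2))
```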
@@ -0,0 +1,39 @@
|
|
1
|
+
# Copyright 2025 Google LLC
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from typing import Optional
|
16
|
+
|
17
|
+
from pydantic import BaseModel
|
18
|
+
|
19
|
+
from .eval_case import EvalCase
|
20
|
+
|
21
|
+
|
22
|
+
class EvalSet(BaseModel):
|
23
|
+
"""A set of eval cases."""
|
24
|
+
|
25
|
+
eval_set_id: str
|
26
|
+
"""Unique identifier for the eval set."""
|
27
|
+
|
28
|
+
name: Optional[str] = None
|
29
|
+
"""Name of the dataset."""
|
30
|
+
|
31
|
+
description: Optional[str] = None
|
32
|
+
"""Description of the dataset."""
|
33
|
+
|
34
|
+
eval_cases: list[EvalCase]
|
35
|
+
"""List of eval cases in the dataset. Each case represents a single
|
36
|
+
interaction to be evaluated."""
|
37
|
+
|
38
|
+
creation_timestamp: float = 0.0
|
39
|
+
"""The time at which this eval set was created."""
|
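A short sketch of writing an EvalSet file in the new schema. The file name and metadata are hypothetical; the point is that `AgentEvaluator._load_eval_set_from_file` parses exactly this JSON shape via `EvalSet.model_validate_json`, so a file written this way skips the old-format fallback path.

```python
import time
import uuid

from google.adk.evaluation.eval_set import EvalSet

# An empty eval set; a real one would carry EvalCase entries in eval_cases.
eval_set = EvalSet(
    eval_set_id=str(uuid.uuid4()),
    name="demo_set",
    description="Smoke tests for the demo agent.",
    eval_cases=[],
    creation_timestamp=time.time(),
)

# Same serialization that migrate_eval_data_to_new_schema uses.
with open("demo.evalset.json", "w", encoding="utf-8") as f:
  f.write(eval_set.model_dump_json(indent=2))
```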
google/adk/evaluation/eval_set_results_manager.py (new file)

@@ -0,0 +1,47 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from abc import ABC
+from abc import abstractmethod
+
+from .eval_result import EvalCaseResult
+from .eval_result import EvalSetResult
+
+
+class EvalSetResultsManager(ABC):
+  """An interface to manage Eval Set Results."""
+
+  @abstractmethod
+  def save_eval_set_result(
+      self,
+      app_name: str,
+      eval_set_id: str,
+      eval_case_results: list[EvalCaseResult],
+  ) -> None:
+    """Creates and saves a new EvalSetResult given eval_case_results."""
+    raise NotImplementedError()
+
+  @abstractmethod
+  def get_eval_set_result(
+      self, app_name: str, eval_set_result_id: str
+  ) -> EvalSetResult:
+    """Returns an EvalSetResult identified by app_name and eval_set_result_id."""
+    raise NotImplementedError()
+
+  @abstractmethod
+  def list_eval_set_results(self, app_name: str) -> list[str]:
+    """Returns the eval result ids that belong to the given app_name."""
+    raise NotImplementedError()
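The package's own concrete implementation is local_eval_set_results_manager.py (see the file listing above). Purely to illustrate the interface, here is a minimal in-memory subclass; the class name InMemoryEvalSetResultsManager and the result-id naming scheme are hypothetical, not part of the package.

```python
from google.adk.evaluation.eval_result import EvalCaseResult
from google.adk.evaluation.eval_result import EvalSetResult
from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager


class InMemoryEvalSetResultsManager(EvalSetResultsManager):
  """Keeps results in a per-app dict; nothing is persisted to disk."""

  def __init__(self):
    self._results: dict[str, dict[str, EvalSetResult]] = {}

  def save_eval_set_result(
      self,
      app_name: str,
      eval_set_id: str,
      eval_case_results: list[EvalCaseResult],
  ) -> None:
    app_results = self._results.setdefault(app_name, {})
    # Hypothetical id scheme: app name, eval set id, and a running counter.
    result_id = f"{app_name}_{eval_set_id}_{len(app_results)}"
    app_results[result_id] = EvalSetResult(
        eval_set_result_id=result_id,
        eval_set_result_name=result_id,
        eval_set_id=eval_set_id,
        eval_case_results=list(eval_case_results),
    )

  def get_eval_set_result(
      self, app_name: str, eval_set_result_id: str
  ) -> EvalSetResult:
    return self._results[app_name][eval_set_result_id]

  def list_eval_set_results(self, app_name: str) -> list[str]:
    return list(self._results.get(app_name, {}).keys())
```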