google-adk 1.5.0__py3-none-any.whl → 1.6.1__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their public registries and is provided for informational purposes only.
- google/adk/a2a/converters/event_converter.py +257 -36
- google/adk/a2a/converters/part_converter.py +93 -25
- google/adk/a2a/converters/request_converter.py +12 -32
- google/adk/a2a/converters/utils.py +22 -4
- google/adk/a2a/executor/__init__.py +13 -0
- google/adk/a2a/executor/a2a_agent_executor.py +260 -0
- google/adk/a2a/executor/task_result_aggregator.py +71 -0
- google/adk/a2a/logs/__init__.py +13 -0
- google/adk/a2a/logs/log_utils.py +349 -0
- google/adk/agents/base_agent.py +54 -0
- google/adk/agents/llm_agent.py +15 -0
- google/adk/agents/remote_a2a_agent.py +532 -0
- google/adk/artifacts/in_memory_artifact_service.py +6 -3
- google/adk/cli/browser/chunk-EQDQRRRY.js +1 -0
- google/adk/cli/browser/chunk-TXJFAAIW.js +2 -0
- google/adk/cli/browser/index.html +4 -3
- google/adk/cli/browser/main-RXDVX3K6.js +3914 -0
- google/adk/cli/browser/polyfills-FFHMD2TL.js +17 -0
- google/adk/cli/cli_deploy.py +4 -1
- google/adk/cli/cli_eval.py +8 -6
- google/adk/cli/cli_tools_click.py +30 -10
- google/adk/cli/fast_api.py +120 -5
- google/adk/cli/utils/agent_loader.py +12 -0
- google/adk/evaluation/agent_evaluator.py +107 -10
- google/adk/evaluation/base_eval_service.py +157 -0
- google/adk/evaluation/constants.py +20 -0
- google/adk/evaluation/eval_case.py +3 -3
- google/adk/evaluation/eval_metrics.py +39 -0
- google/adk/evaluation/evaluation_generator.py +1 -1
- google/adk/evaluation/final_response_match_v2.py +230 -0
- google/adk/evaluation/llm_as_judge.py +141 -0
- google/adk/evaluation/llm_as_judge_utils.py +48 -0
- google/adk/evaluation/metric_evaluator_registry.py +89 -0
- google/adk/evaluation/response_evaluator.py +38 -211
- google/adk/evaluation/safety_evaluator.py +54 -0
- google/adk/evaluation/trajectory_evaluator.py +16 -2
- google/adk/evaluation/vertex_ai_eval_facade.py +147 -0
- google/adk/events/event.py +2 -4
- google/adk/flows/llm_flows/base_llm_flow.py +2 -0
- google/adk/memory/in_memory_memory_service.py +3 -2
- google/adk/models/lite_llm.py +50 -10
- google/adk/runners.py +27 -10
- google/adk/sessions/database_session_service.py +25 -7
- google/adk/sessions/in_memory_session_service.py +5 -1
- google/adk/sessions/vertex_ai_session_service.py +67 -42
- google/adk/tools/bigquery/config.py +11 -1
- google/adk/tools/bigquery/query_tool.py +306 -12
- google/adk/tools/enterprise_search_tool.py +2 -2
- google/adk/tools/function_tool.py +7 -1
- google/adk/tools/google_search_tool.py +1 -1
- google/adk/tools/mcp_tool/mcp_session_manager.py +44 -30
- google/adk/tools/mcp_tool/mcp_tool.py +44 -7
- google/adk/version.py +1 -1
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/METADATA +6 -4
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/RECORD +58 -42
- google/adk/cli/browser/main-JAAWEV7F.js +0 -92
- google/adk/cli/browser/polyfills-B6TNHZQ6.js +0 -17
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/WHEEL +0 -0
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/entry_points.txt +0 -0
- {google_adk-1.5.0.dist-info → google_adk-1.6.1.dist-info}/licenses/LICENSE +0 -0
```diff
--- a/google/adk/evaluation/agent_evaluator.py
+++ b/google/adk/evaluation/agent_evaluator.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -23,16 +25,17 @@ from typing import Optional
 from typing import Union
 import uuid
 
+from google.genai import types as genai_types
 from pydantic import ValidationError
 
+from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from .eval_case import IntermediateData
+from .eval_metrics import EvalMetric
 from .eval_set import EvalSet
-from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
-from .response_evaluator import ResponseEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
 
@@ -44,11 +47,13 @@ TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 # This is always optional unless explicitly specified.
 RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
+SAFETY_V1_KEY = "safety_v1"
 
 ALLOWED_CRITERIA = [
     TOOL_TRAJECTORY_SCORE_KEY,
     RESPONSE_EVALUATION_SCORE_KEY,
     RESPONSE_MATCH_SCORE_KEY,
+    SAFETY_V1_KEY,
 ]
 
 
@@ -96,6 +101,7 @@ class AgentEvaluator:
       criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
+      print_detailed_results: bool = True,
   ):
     """Evaluates an agent using the given EvalSet.
 
@@ -109,7 +115,13 @@ class AgentEvaluator:
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
+      print_detailed_results: Whether to print detailed results for each metric
+        evaluation.
     """
+    try:
+      from .evaluation_generator import EvaluationGenerator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     eval_case_responses_list = await EvaluationGenerator.generate_responses(
         eval_set=eval_set,
         agent_module_path=agent_module,
@@ -117,6 +129,8 @@
         agent_name=agent_name,
     )
 
+    failures = []
+
     for eval_case_responses in eval_case_responses_list:
       actual_invocations = [
           invocation
@@ -139,10 +153,25 @@
           )
       )
 
-
-
-
-
+      if print_detailed_results:
+        AgentEvaluator._print_details(
+            evaluation_result=evaluation_result,
+            metric_name=metric_name,
+            threshold=threshold,
+        )
+
+      # Gather all the failures.
+      if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+        failures.append(
+            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+            f" but got {evaluation_result.overall_score}."
+        )
+
+    assert not failures, (
+        "Following are all the test failures. If you looking to get more"
+        " details on the failures, then please re-run this test with"
+        " `print_details` set to `True`.\n{}".format("\n".join(failures))
+    )
 
   @staticmethod
   async def evaluate(
@@ -158,9 +187,10 @@
       agent_module: The path to python module that contains the definition of
         the agent. There is convention in place here, where the code is going to
         look for 'root_agent' in the loaded module.
-      eval_dataset_file_path_or_dir: The eval data set. This can be either a
-        full path to the file containing eval dataset, or a
-        recursively explored for all files that have a
+      eval_dataset_file_path_or_dir: The eval data set. This can be either a
+        string representing full path to the file containing eval dataset, or a
+        directory that is recursively explored for all files that have a
+        `.test.json` suffix.
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
@@ -358,6 +388,12 @@
 
   @staticmethod
   def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    try:
+      from .response_evaluator import ResponseEvaluator
+      from .safety_evaluator import SafetyEvaluatorV1
+      from .trajectory_evaluator import TrajectoryEvaluator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
       return TrajectoryEvaluator(threshold=threshold)
     elif (
@@ -365,5 +401,66 @@
         or metric_name == RESPONSE_EVALUATION_SCORE_KEY
     ):
       return ResponseEvaluator(threshold=threshold, metric_name=metric_name)
+    elif metric_name == SAFETY_V1_KEY:
+      return SafetyEvaluatorV1(
+          eval_metric=EvalMetric(threshold=threshold, metric_name=metric_name)
+      )
 
     raise ValueError(f"Unsupported eval metric: {metric_name}")
+
+  @staticmethod
+  def _print_details(
+      evaluation_result: EvaluationResult, metric_name: str, threshold: float
+  ):
+    try:
+      from pandas import pandas as pd
+      from tabulate import tabulate
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+    print(
+        f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
+        f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
+        f" `{evaluation_result.overall_score}`."
+    )
+
+    data = []
+    for per_invocation_result in evaluation_result.per_invocation_results:
+      data.append({
+          "eval_status": per_invocation_result.eval_status,
+          "score": per_invocation_result.score,
+          "threshold": threshold,
+          "prompt": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.user_content
+          ),
+          "expected_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.final_response
+          ),
+          "actual_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.actual_invocation.final_response
+          ),
+          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.expected_invocation.intermediate_data
+          ),
+          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.actual_invocation.intermediate_data
+          ),
+      })
+
+    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
+    print("\n\n")  # Few empty lines for visual clarity
+
+  @staticmethod
+  def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  @staticmethod
+  def _convert_tool_calls_to_text(
+      intermediate_data: Optional[IntermediateData],
+  ) -> str:
+    if intermediate_data and intermediate_data.tool_uses:
+      return "\n".join([str(t) for t in intermediate_data.tool_uses])
+
+    return ""
```
```diff
--- /dev/null
+++ b/google/adk/evaluation/base_eval_service.py
@@ -0,0 +1,157 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from abc import ABC
+from abc import abstractmethod
+from typing import AsyncGenerator
+from typing import Optional
+
+from pydantic import alias_generators
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+
+from .eval_case import Invocation
+from .eval_metrics import EvalMetric
+from .eval_result import EvalCaseResult
+
+
+class EvaluateConfig(BaseModel):
+  """Contains configurations need to run an evaluations."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  eval_metrics: list[EvalMetric] = Field(
+      description="""The list of metrics to be used in Eval.""",
+  )
+
+
+class InferenceConfig(BaseModel):
+  """Contains configurations need to run inferences."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  labels: Optional[dict[str, str]] = Field(
+      default=None,
+      description="""Labels with user-defined metadata to break down billed
+      charges.""",
+  )
+
+
+class InferenceRequest(BaseModel):
+  """Represent a request to perform inferences for the eval cases in an eval set."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  app_name: str = Field(
+      description="""The name of the app to which the eval case belongs to."""
+  )
+
+  eval_set_id: str = Field(description="""Id of the eval set.""")
+
+  eval_case_ids: Optional[list[str]] = Field(
+      default=None,
+      description="""Id of the eval cases for which inferences need to be
+      generated.
+
+      All the eval case ids should belong to the EvalSet.
+
+      If the list of eval case ids are empty or not specified, then all the eval cases
+      in an eval set are evaluated.
+      """,
+  )
+
+  inference_config: InferenceConfig = Field(
+      description="""The config to use for inferencing.""",
+  )
+
+
+class InferenceResult(BaseModel):
+  """Contains inference results for a single eval case."""
+
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  app_name: str = Field(
+      description="""The name of the app to which the eval case belongs to."""
+  )
+
+  eval_set_id: str = Field(description="""Id of the eval set.""")
+
+  eval_case_id: str = Field(
+      description="""Id of the eval case for which inferences were generated.""",
+  )
+
+  inferences: list[Invocation] = Field(
+      description="""Inferences obtained from the Agent for the eval case."""
+  )
+
+  session_id: Optional[str] = Field(
+      description="""Id of the inference session."""
+  )
+
+
+class EvaluateRequest(BaseModel):
+  model_config = ConfigDict(
+      alias_generator=alias_generators.to_camel,
+      populate_by_name=True,
+  )
+
+  inference_results: list[InferenceResult] = Field(
+      description="""A list of inferences that need to be evaluated.""",
+  )
+
+  evaluate_config: EvaluateConfig = Field(
+      description="""The config to use for evaluations.""",
+  )
+
+
+class BaseEvalService(ABC):
+  """A service to run Evals for an ADK agent."""
+
+  @abstractmethod
+  async def perform_inference(
+      self,
+      inference_request: InferenceRequest,
+  ) -> AsyncGenerator[InferenceResult, None]:
+    """Returns InferenceResult obtained from the Agent as and when they are available.
+
+    Args:
+      inference_request: The request for generating inferences.
+    """
+
+  @abstractmethod
+  async def evaluate(
+      self,
+      evaluate_request: EvaluateRequest,
+  ) -> AsyncGenerator[EvalCaseResult, None]:
+    """Returns EvalCaseResult for each item as and when they are available.
+
+    Args:
+      evaluate_request: The request to perform metric evaluations on the
+        inferences.
+    """
```
```diff
--- /dev/null
+++ b/google/adk/evaluation/constants.py
@@ -0,0 +1,20 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+MISSING_EVAL_DEPENDENCIES_MESSAGE = (
+    "Eval module is not installed, please install via `pip install"
+    " google-adk[eval]`."
+)
```
```diff
--- a/google/adk/evaluation/eval_case.py
+++ b/google/adk/evaluation/eval_case.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
 
 from typing import Any
 from typing import Optional
-from typing import Tuple
 
 from google.genai import types as genai_types
 from pydantic import alias_generators
@@ -37,11 +37,11 @@ class IntermediateData(EvalBaseModel):
   tool_uses: list[genai_types.FunctionCall] = []
   """Tool use trajectory in chronological order."""
 
-  intermediate_responses: list[
+  intermediate_responses: list[tuple[str, list[genai_types.Part]]] = []
   """Intermediate responses generated by sub-agents to convey progress or status
   in a multi-agent system, distinct from the final response.
 
-  This is expressed as a
+  This is expressed as a tuple of:
   - Author: Usually the sub-agent name that generated the intermediate
     response.
 
```

(The two removed lines above are shown truncated in the source diff viewer; their trailing text is not recoverable here.)
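The functional effect is that `intermediate_responses` is now annotated with the built-in `tuple` generic and the `typing.Tuple` import is dropped. For illustration, constructing the model looks like the sketch below (not package code); the field names come from the diff, the values are made up.

```python
# Sketch showing the field shapes of IntermediateData after this change.
from google.genai import types as genai_types

from google.adk.evaluation.eval_case import IntermediateData

intermediate = IntermediateData(
    tool_uses=[
        genai_types.FunctionCall(name="lookup_weather", args={"city": "Paris"})
    ],
    # Each entry is an (author, parts) tuple: the sub-agent name plus the
    # Parts it emitted before the final response.
    intermediate_responses=[
        ("weather_agent", [genai_types.Part(text="Fetching the forecast...")])
    ],
)
```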
```diff
--- a/google/adk/evaluation/eval_metrics.py
+++ b/google/adk/evaluation/eval_metrics.py
@@ -14,16 +14,50 @@
 
 from __future__ import annotations
 
+from enum import Enum
 from typing import Optional
+from typing import Union
 
+from google.genai import types as genai_types
 from pydantic import alias_generators
 from pydantic import BaseModel
 from pydantic import ConfigDict
+from pydantic import Field
+from typing_extensions import TypeAlias
 
 from .eval_case import Invocation
 from .evaluator import EvalStatus
 
 
+class PrebuiltMetrics(Enum):
+  TOOL_TRAJECTORY_AVG_SCORE = "tool_trajectory_avg_score"
+
+  RESPONSE_EVALUATION_SCORE = "response_evaluation_score"
+
+  RESPONSE_MATCH_SCORE = "response_match_score"
+
+
+MetricName: TypeAlias = Union[str, PrebuiltMetrics]
+
+
+class JudgeModelOptions(BaseModel):
+  """Options for an eval metric's judge model."""
+
+  judge_model: str = Field(
+      default="gemini-2.5-flash",
+      description="""The judge model to use for evaluation. It can be a model name.""",
+  )
+
+  judge_model_config: Optional[genai_types.GenerateContentConfig] = Field(
+      default=None, description="""The configuration for the judge model."""
+  )
+
+  num_samples: Optional[int] = Field(
+      default=None,
+      description="""The number of times to sample the model for each invocation evaluation.""",
+  )
+
+
 class EvalMetric(BaseModel):
   """A metric used to evaluate a particular aspect of an eval case."""
 
@@ -38,6 +72,11 @@ class EvalMetric(BaseModel):
   threshold: float
   """A threshold value. Each metric decides how to interpret this threshold."""
 
+  judge_model_options: Optional[JudgeModelOptions] = Field(
+      default=None,
+      description="""Options for the judge model.""",
+  )
+
 
 class EvalMetricResult(EvalMetric):
   """The actual computed score/value of a particular EvalMetric."""
```