azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from collections import Counter
|
|
6
|
+
import json
|
|
7
|
+
from typing import Dict, List, Union, Any, Tuple
|
|
8
|
+
from typing_extensions import overload, override
|
|
9
|
+
|
|
10
|
+
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
|
|
11
|
+
from azure.ai.evaluation._evaluators._common import EvaluatorBase
|
|
12
|
+
from azure.ai.evaluation._exceptions import (
|
|
13
|
+
ErrorCategory,
|
|
14
|
+
ErrorTarget,
|
|
15
|
+
EvaluationException,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class _TaskNavigationEfficiencyMatchingMode(str, Enum):
|
|
20
|
+
"""
|
|
21
|
+
Enumeration of task navigation efficiency matching mode.
|
|
22
|
+
|
|
23
|
+
This enum allows you to specify which single matching technique should be used when evaluating
|
|
24
|
+
the efficiency of an agent's tool calls sequence against a ground truth path.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
EXACT_MATCH = "exact_match"
|
|
28
|
+
"""
|
|
29
|
+
Binary metric indicating whether the agent's tool calls exactly match the ground truth.
|
|
30
|
+
|
|
31
|
+
Returns True only if the agent's tool calls sequence is identical to the expected sequence
|
|
32
|
+
in both order and content (no extra steps, no missing steps, correct order).
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
IN_ORDER_MATCH = "in_order_match"
|
|
36
|
+
"""
|
|
37
|
+
Binary metric allowing extra steps but requiring correct order of required tool calls.
|
|
38
|
+
|
|
39
|
+
Returns True if all ground truth steps appear in the agent's sequence in the correct
|
|
40
|
+
order, even if there are additional steps interspersed.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
ANY_ORDER_MATCH = "any_order_match"
|
|
44
|
+
"""
|
|
45
|
+
Binary metric allowing both extra steps and different ordering.
|
|
46
|
+
|
|
47
|
+
Returns True if all ground truth steps appear in the agent's sequence with sufficient
|
|
48
|
+
frequency, regardless of order. Most lenient matching criterion.
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class _TaskNavigationEfficiencyEvaluator(EvaluatorBase):
|
|
53
|
+
"""
|
|
54
|
+
Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.
|
|
55
|
+
|
|
56
|
+
The Task Navigation Efficiency Evaluator returns binary matching results between the agent's tool usage trajectory and the ground truth expected steps.
|
|
57
|
+
It has three matching techniques: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
|
|
58
|
+
It also returns precision, recall, and F1 scores in the properties bag (the `task_navigation_efficiency_details` entry of the result).
|
|
59
|
+
|
|
60
|
+
:param matching_mode: The matching mode to use. Default is "exact_match".
|
|
61
|
+
:type matching_mode: enum[str, _TaskNavigationEfficiencyMatchingMode]
|
|
62
|
+
|
|
63
|
+
.. admonition:: Example:
|
|
64
|
+
|
|
65
|
+
.. code-block:: python
|
|
66
|
+
|
|
67
|
+
from azure.ai.evaluation._evaluators._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator
|
|
68
|
+
|
|
69
|
+
task_navigation_efficiency_eval = _TaskNavigationEfficiencyEvaluator(
|
|
70
|
+
matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Example 1: Using simple tool names list
|
|
74
|
+
result = task_navigation_efficiency_eval(
|
|
75
|
+
response=[
|
|
76
|
+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "identify_tools_to_call", "arguments": {}}]},
|
|
77
|
+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "call_tool_A", "arguments": {}}]},
|
|
78
|
+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}}]},
|
|
79
|
+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}}]}
|
|
80
|
+
],
|
|
81
|
+
ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Example 2: Using tool names with parameters (exact parameter matching required)
|
|
85
|
+
result = task_navigation_efficiency_eval(
|
|
86
|
+
response=[
|
|
87
|
+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {"query": "weather", "location": "NYC"}}]},
|
|
88
|
+
{"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "format_result", "arguments": {"format": "json"}}]}
|
|
89
|
+
],
|
|
90
|
+
ground_truth=(
|
|
91
|
+
["search", "format_result"],
|
|
92
|
+
{
|
|
93
|
+
"search": {"query": "weather", "location": "NYC"},
|
|
94
|
+
"format_result": {"format": "json"}
|
|
95
|
+
}
|
|
96
|
+
)
|
|
97
|
+
)
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
id = "azureai://built-in/evaluators/task_navigation_efficiency"
|
|
101
|
+
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
|
|
102
|
+
|
|
103
|
+
matching_mode: _TaskNavigationEfficiencyMatchingMode
|
|
104
|
+
"""The matching mode to use."""
|
|
105
|
+
|
|
106
|
+
@override
|
|
107
|
+
def __init__(
|
|
108
|
+
self,
|
|
109
|
+
*,
|
|
110
|
+
matching_mode: Union[
|
|
111
|
+
str, _TaskNavigationEfficiencyMatchingMode
|
|
112
|
+
] = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH,
|
|
113
|
+
):
|
|
114
|
+
# Type checking for the matching_mode parameter
|
|
115
|
+
if isinstance(matching_mode, str):
|
|
116
|
+
try:
|
|
117
|
+
self.matching_mode = _TaskNavigationEfficiencyMatchingMode(matching_mode)
|
|
118
|
+
except ValueError:
|
|
119
|
+
raise ValueError(
|
|
120
|
+
f"matching_mode must be one of {[m.value for m in _TaskNavigationEfficiencyMatchingMode]}, got '{matching_mode}'"
|
|
121
|
+
)
|
|
122
|
+
elif isinstance(matching_mode, _TaskNavigationEfficiencyMatchingMode):
|
|
123
|
+
self.matching_mode = matching_mode
|
|
124
|
+
else:
|
|
125
|
+
raise EvaluationException(
|
|
126
|
+
f"matching_mode must be a string with one of {[m.value for m in _TaskNavigationEfficiencyMatchingMode]} or _TaskNavigationEfficiencyMatchingMode enum, got {type(matching_mode)}",
|
|
127
|
+
internal_message=str(matching_mode),
|
|
128
|
+
target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
|
|
129
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
super().__init__()
|
|
133
|
+
|
|
134
|
+
def _prepare_steps_for_comparison(
|
|
135
|
+
self,
|
|
136
|
+
agent_tool_pairs: List[Tuple[str, Dict[str, Any]]],
|
|
137
|
+
ground_truth: List[str],
|
|
138
|
+
ground_truth_params: Dict[str, Dict[str, Any]],
|
|
139
|
+
use_parameter_matching: bool,
|
|
140
|
+
) -> Tuple[
|
|
141
|
+
List[Union[str, Tuple[str, Tuple]]],
|
|
142
|
+
List[Union[str, Tuple[str, Tuple]]],
|
|
143
|
+
]:
|
|
144
|
+
"""Prepare agent and ground truth steps for comparison based on parameter matching mode."""
|
|
145
|
+
agent_steps: List[Union[str, Tuple[str, Tuple]]] = []
|
|
146
|
+
ground_truth_steps: List[Union[str, Tuple[str, Tuple]]] = []
|
|
147
|
+
if use_parameter_matching:
|
|
148
|
+
# When parameter matching is enabled, we need to match both tool name and parameters
|
|
149
|
+
agent_steps = [(pair[0], tuple(sorted(pair[1].items()))) for pair in agent_tool_pairs]
|
|
150
|
+
ground_truth_steps = [
|
|
151
|
+
(name, tuple(sorted(ground_truth_params.get(name, {}).items()))) for name in ground_truth
|
|
152
|
+
]
|
|
153
|
+
else:
|
|
154
|
+
# When parameter matching is disabled, only compare tool names
|
|
155
|
+
agent_steps = [name for name, _ in agent_tool_pairs]
|
|
156
|
+
ground_truth_steps = [step for step in ground_truth]
|
|
157
|
+
|
|
158
|
+
return agent_steps, ground_truth_steps
|
|
159
|
+
|
|
160
|
+
def _calculate_precision_recall_f1_scores(self, agent_steps: List, ground_truth_steps: List) -> Dict[str, float]:
|
|
161
|
+
"""Calculate precision, recall, and F1 scores."""
|
|
162
|
+
if not agent_steps:
|
|
163
|
+
return {"precision_score": 0.0, "recall_score": 0.0, "f1_score": 0.0}
|
|
164
|
+
|
|
165
|
+
# Count occurrences of each step in both lists to handle duplicates
|
|
166
|
+
agent_steps_counts = Counter(agent_steps)
|
|
167
|
+
ground_truth_counts = Counter(ground_truth_steps)
|
|
168
|
+
|
|
169
|
+
# Calculate true positives by taking the minimum count for each common element
|
|
170
|
+
# For each step, count the intersection (min count) of agent and ground truth steps
|
|
171
|
+
true_positives = sum(
|
|
172
|
+
min(agent_steps_counts[step], ground_truth_counts[step])
|
|
173
|
+
for step in agent_steps_counts
|
|
174
|
+
if step in ground_truth_counts
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
# Calculate false positives (agent steps not in ground truth or excess occurrences)
|
|
178
|
+
# For each step, count how many agent occurrences exceed the ground truth count for that step
|
|
179
|
+
# (this is zero when the agent has no more occurrences of the step than the ground truth)
|
|
180
|
+
false_positives = sum(
|
|
181
|
+
agent_steps_counts[step] - min(agent_steps_counts[step], ground_truth_counts.get(step, 0))
|
|
182
|
+
for step in agent_steps_counts
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
# Calculate false negatives (ground truth steps not in agent or missing occurrences)
|
|
186
|
+
# For each step, count how many ground truth occurrences are missing from the agent steps
|
|
187
|
+
# (this is zero when the agent has at least as many occurrences of the step as the ground truth)
|
|
188
|
+
false_negatives = sum(
|
|
189
|
+
ground_truth_counts[step] - min(ground_truth_counts[step], agent_steps_counts.get(step, 0))
|
|
190
|
+
for step in ground_truth_counts
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
# Calculate precision, recall, F1
|
|
194
|
+
precision = (
|
|
195
|
+
true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
|
|
196
|
+
)
|
|
197
|
+
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
|
|
198
|
+
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
|
|
199
|
+
|
|
200
|
+
return {
|
|
201
|
+
"precision_score": precision,
|
|
202
|
+
"recall_score": recall,
|
|
203
|
+
"f1_score": f1_score,
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
def _calculate_exact_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
|
|
207
|
+
"""Check if agent steps exactly match ground truth (order and content)."""
|
|
208
|
+
return agent_steps == ground_truth_steps
|
|
209
|
+
|
|
210
|
+
def _calculate_in_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
|
|
211
|
+
"""Check if all ground truth steps appear in agent steps in correct order (extra steps allowed)."""
|
|
212
|
+
if not ground_truth_steps:
|
|
213
|
+
return True
|
|
214
|
+
|
|
215
|
+
gt_index = 0
|
|
216
|
+
for step in agent_steps:
|
|
217
|
+
if gt_index < len(ground_truth_steps) and step == ground_truth_steps[gt_index]:
|
|
218
|
+
gt_index += 1
|
|
219
|
+
|
|
220
|
+
return gt_index == len(ground_truth_steps)
|
|
221
|
+
|
|
222
|
+
def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
|
|
223
|
+
"""Check if all ground truth steps appear in agent steps with sufficient frequency (any order, extra steps allowed)."""
|
|
224
|
+
# Count occurrences of each step in both lists to handle duplicates
|
|
225
|
+
agent_counts = Counter(agent_steps)
|
|
226
|
+
ground_truth_counts = Counter(ground_truth_steps)
|
|
227
|
+
|
|
228
|
+
# Check if agent has at least as many occurrences of each ground truth step
|
|
229
|
+
return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)
|
|
230
|
+
|
|
231
|
+
_TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = {
|
|
232
|
+
_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH: _calculate_exact_match,
|
|
233
|
+
_TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH: _calculate_in_order_match,
|
|
234
|
+
_TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH: _calculate_any_order_match,
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
@override
|
|
238
|
+
async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[str, float]]]:
|
|
239
|
+
"""Produce a path efficiency evaluation result.
|
|
240
|
+
|
|
241
|
+
:param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
|
|
242
|
+
:type eval_input: Dict
|
|
243
|
+
:return: The evaluation result.
|
|
244
|
+
:rtype: Dict[str, Union[float, str, Dict[str, float]]]
|
|
245
|
+
"""
|
|
246
|
+
response = eval_input["response"]
|
|
247
|
+
ground_truth = eval_input["ground_truth"]
|
|
248
|
+
|
|
249
|
+
# Value and type checking for ground truth steps
|
|
250
|
+
if not ground_truth:
|
|
251
|
+
raise ValueError("ground_truth cannot be empty")
|
|
252
|
+
|
|
253
|
+
# Check if ground_truth is a tuple (tool names + parameters) or list (tool names only)
|
|
254
|
+
use_parameter_matching = False
|
|
255
|
+
ground_truth_names = []
|
|
256
|
+
ground_truth_params_dict: Dict[str, Dict[str, Any]] = {}
|
|
257
|
+
|
|
258
|
+
if isinstance(ground_truth, tuple) and len(ground_truth) == 2:
|
|
259
|
+
# Tuple format: (tool_names, parameters_dict)
|
|
260
|
+
tool_names_list, params_dict = ground_truth
|
|
261
|
+
|
|
262
|
+
if not isinstance(tool_names_list, list) or not all(isinstance(name, str) for name in tool_names_list):
|
|
263
|
+
raise TypeError("ground_truth tuple first element must be a list of strings (tool names)")
|
|
264
|
+
|
|
265
|
+
if not isinstance(params_dict, dict):
|
|
266
|
+
raise TypeError(
|
|
267
|
+
"ground_truth tuple second element must be a dictionary mapping tool names to parameters"
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
# Validate that all values in params_dict are dictionaries with string keys and JSON-serializable values
|
|
271
|
+
for tool_name, params in params_dict.items():
|
|
272
|
+
if not isinstance(tool_name, str):
|
|
273
|
+
raise TypeError("ground_truth parameters dictionary keys must be strings (tool names)")
|
|
274
|
+
if not isinstance(params, dict):
|
|
275
|
+
raise TypeError(f"ground_truth parameters for tool '{tool_name}' must be a dictionary")
|
|
276
|
+
for k, v in params.items():
|
|
277
|
+
if not isinstance(k, str):
|
|
278
|
+
raise TypeError(f"ground_truth parameters for tool '{tool_name}' must have string keys")
|
|
279
|
+
try:
|
|
280
|
+
json.dumps(v)
|
|
281
|
+
except (TypeError, ValueError):
|
|
282
|
+
raise TypeError(
|
|
283
|
+
f"ground_truth parameters for tool '{tool_name}' must have JSON-serializable values (got type {type(v)} for key '{k}')"
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
ground_truth_names = [name.strip() for name in tool_names_list]
|
|
287
|
+
ground_truth_params_dict = params_dict
|
|
288
|
+
use_parameter_matching = True
|
|
289
|
+
elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth):
|
|
290
|
+
# List format: just tool names
|
|
291
|
+
ground_truth_names = [step.strip() for step in ground_truth]
|
|
292
|
+
use_parameter_matching = False
|
|
293
|
+
else:
|
|
294
|
+
raise TypeError(
|
|
295
|
+
"ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
# Extract tool information from the response
|
|
299
|
+
agent_tool_pairs = self._extract_tool_names_and_params_from_response(response)
|
|
300
|
+
|
|
301
|
+
# Prepare steps for comparison
|
|
302
|
+
agent_steps, ground_truth_steps = self._prepare_steps_for_comparison(
|
|
303
|
+
agent_tool_pairs,
|
|
304
|
+
ground_truth_names,
|
|
305
|
+
ground_truth_params_dict,
|
|
306
|
+
use_parameter_matching,
|
|
307
|
+
)
|
|
308
|
+
|
|
309
|
+
# Calculate precision, recall, and F1 scores
|
|
310
|
+
additional_properties_metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
|
|
311
|
+
|
|
312
|
+
# Convert metrics to floats, using nan for None values
|
|
313
|
+
for metric, score in additional_properties_metrics.items():
|
|
314
|
+
additional_properties_metrics[metric] = float(score) if score is not None else float("nan")
|
|
315
|
+
|
|
316
|
+
if self.matching_mode in self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS:
|
|
317
|
+
# Calculate binary match metrics
|
|
318
|
+
match_result = self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS[self.matching_mode](
|
|
319
|
+
self, agent_steps, ground_truth_steps
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
return {
|
|
323
|
+
"task_navigation_efficiency_label": match_result,
|
|
324
|
+
"task_navigation_efficiency_result": EVALUATION_PASS_FAIL_MAPPING[match_result],
|
|
325
|
+
"task_navigation_efficiency_details": additional_properties_metrics,
|
|
326
|
+
}
|
|
327
|
+
else:
|
|
328
|
+
raise EvaluationException(
|
|
329
|
+
f"Unsupported matching_mode '{self.matching_mode}'",
|
|
330
|
+
internal_message=str(self.matching_mode),
|
|
331
|
+
target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
|
|
332
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
@overload
|
|
336
|
+
def __call__( # type: ignore
|
|
337
|
+
self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
|
|
338
|
+
) -> Dict[str, Union[float, str, Dict[str, float]]]:
|
|
339
|
+
"""
|
|
340
|
+
Evaluate the task navigation efficiency of an agent's action sequence.
|
|
341
|
+
|
|
342
|
+
:keyword response: The agent's response containing tool calls.
|
|
343
|
+
:paramtype response: Union[str, List[Dict[str, Any]]]
|
|
344
|
+
:keyword ground_truth: List of expected tool/action steps.
|
|
345
|
+
:paramtype ground_truth: List[str]
|
|
346
|
+
:return: The task navigation efficiency scores and results.
|
|
347
|
+
:rtype: Dict[str, Union[float, str, Dict[str, float]]]
|
|
348
|
+
"""
|
|
349
|
+
|
|
350
|
+
@overload
|
|
351
|
+
def __call__( # type: ignore
|
|
352
|
+
self,
|
|
353
|
+
*,
|
|
354
|
+
response: Union[str, List[Dict[str, Any]]],
|
|
355
|
+
ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
|
|
356
|
+
) -> Dict[str, Union[float, str, Dict[str, float]]]:
|
|
357
|
+
"""
|
|
358
|
+
Evaluate the task navigation efficiency of an agent's action sequence with tool parameters.
|
|
359
|
+
|
|
360
|
+
:keyword response: The agent's response containing tool calls.
|
|
361
|
+
:paramtype response: Union[str, List[Dict[str, Any]]]
|
|
362
|
+
:keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
|
|
363
|
+
:paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
|
|
364
|
+
:return: The task navigation efficiency scores and results.
|
|
365
|
+
:rtype: Dict[str, Union[float, str, Dict[str, float]]]
|
|
366
|
+
"""
|
|
367
|
+
|
|
368
|
+
@override
|
|
369
|
+
def __call__(
|
|
370
|
+
self,
|
|
371
|
+
*args,
|
|
372
|
+
**kwargs,
|
|
373
|
+
):
|
|
374
|
+
"""
|
|
375
|
+
Evaluate task navigation efficiency.
|
|
376
|
+
|
|
377
|
+
:keyword response: The agent's response containing tool calls.
|
|
378
|
+
:paramtype response: Union[str, List[Dict[str, Any]]]
|
|
379
|
+
:keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
|
|
380
|
+
:paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
|
|
381
|
+
:return: The task navigation efficiency scores and results.
|
|
382
|
+
:rtype: Dict[str, Union[float, str, Dict[str, float]]]
|
|
383
|
+
"""
|
|
384
|
+
return super().__call__(*args, **kwargs)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# ---------------------------------------------------------
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
|
-
from itertools import chain
|
|
5
4
|
import math
|
|
6
5
|
import os
|
|
7
6
|
import logging
|
|
@@ -17,46 +16,12 @@ from azure.ai.evaluation._exceptions import (
|
|
|
17
16
|
)
|
|
18
17
|
from ..._common.utils import check_score_is_valid
|
|
19
18
|
from azure.ai.evaluation._common._experimental import experimental
|
|
20
|
-
from ..._converters._models import (
|
|
21
|
-
_BUILT_IN_DESCRIPTIONS,
|
|
22
|
-
_BUILT_IN_PARAMS,
|
|
23
|
-
)
|
|
24
19
|
|
|
25
20
|
logger = logging.getLogger(__name__)
|
|
26
21
|
|
|
27
22
|
T_EvalValue = TypeVar("T_EvalValue")
|
|
28
23
|
|
|
29
24
|
|
|
30
|
-
def _get_built_in_definition(tool_name: str):
|
|
31
|
-
"""Get the definition for the built-in tool."""
|
|
32
|
-
if tool_name in _BUILT_IN_DESCRIPTIONS:
|
|
33
|
-
return {
|
|
34
|
-
"type": tool_name,
|
|
35
|
-
"description": _BUILT_IN_DESCRIPTIONS[tool_name],
|
|
36
|
-
"name": tool_name,
|
|
37
|
-
"parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
|
|
38
|
-
}
|
|
39
|
-
return None
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
|
|
43
|
-
"""Extract tool definitions needed for the given built-in tool calls."""
|
|
44
|
-
needed_definitions = []
|
|
45
|
-
for tool_call in tool_calls:
|
|
46
|
-
if isinstance(tool_call, dict):
|
|
47
|
-
tool_type = tool_call.get("type")
|
|
48
|
-
|
|
49
|
-
# Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
|
|
50
|
-
if tool_type == "tool_call":
|
|
51
|
-
tool_name = tool_call.get("name")
|
|
52
|
-
if tool_name in _BUILT_IN_DESCRIPTIONS:
|
|
53
|
-
built_in_def = _get_built_in_definition(tool_name)
|
|
54
|
-
if built_in_def and built_in_def not in needed_definitions:
|
|
55
|
-
needed_definitions.append(built_in_def)
|
|
56
|
-
|
|
57
|
-
return needed_definitions
|
|
58
|
-
|
|
59
|
-
|
|
60
25
|
@experimental
|
|
61
26
|
class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
|
|
62
27
|
"""The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
|
|
@@ -100,9 +65,12 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
|
|
|
100
65
|
|
|
101
66
|
.. note::
|
|
102
67
|
|
|
68
|
+
The output field "details" has been renamed to "tool_call_accuracy_details" for clarity.
|
|
69
|
+
|
|
103
70
|
To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
|
|
104
71
|
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
|
|
105
72
|
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
|
|
73
|
+
|
|
106
74
|
"""
|
|
107
75
|
|
|
108
76
|
_PROMPTY_FILE = "tool_call_accuracy.prompty"
|
|
@@ -132,6 +100,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
|
|
|
132
100
|
prompty_file=prompty_path,
|
|
133
101
|
result_key=self._RESULT_KEY,
|
|
134
102
|
credential=credential,
|
|
103
|
+
threshold=threshold,
|
|
135
104
|
**kwargs,
|
|
136
105
|
)
|
|
137
106
|
|
|
@@ -207,7 +176,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
|
|
|
207
176
|
tool_definitions = [tool_definitions] if tool_definitions else []
|
|
208
177
|
|
|
209
178
|
try:
|
|
210
|
-
needed_tool_definitions = self._extract_needed_tool_definitions(
|
|
179
|
+
needed_tool_definitions = self._extract_needed_tool_definitions(
|
|
180
|
+
tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
|
|
181
|
+
)
|
|
211
182
|
except EvaluationException as e:
|
|
212
183
|
# Check if this is because no tool definitions were provided at all
|
|
213
184
|
if len(tool_definitions) == 0:
|
|
@@ -235,8 +206,8 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
|
|
|
235
206
|
:rtype: Dict
|
|
236
207
|
"""
|
|
237
208
|
# Single LLM call for all tool calls
|
|
238
|
-
|
|
239
|
-
|
|
209
|
+
prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
|
|
210
|
+
llm_output = prompty_output_dict.get("llm_output", {})
|
|
240
211
|
if isinstance(llm_output, dict):
|
|
241
212
|
score = llm_output.get(self._LLM_SCORE_KEY, None)
|
|
242
213
|
if not score or not check_score_is_valid(
|
|
@@ -248,6 +219,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
|
|
|
248
219
|
message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
|
|
249
220
|
internal_message="Invalid score value.",
|
|
250
221
|
category=ErrorCategory.FAILED_EXECUTION,
|
|
222
|
+
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
|
|
251
223
|
blame=ErrorBlame.SYSTEM_ERROR,
|
|
252
224
|
)
|
|
253
225
|
|
|
@@ -257,10 +229,18 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
|
|
|
257
229
|
score_result = "pass" if score >= self.threshold else "fail"
|
|
258
230
|
response_dict = {
|
|
259
231
|
self._result_key: score,
|
|
232
|
+
f"gpt_{self._result_key}": score,
|
|
260
233
|
f"{self._result_key}_result": score_result,
|
|
261
|
-
f"{self._result_key}_threshold": self.
|
|
234
|
+
f"{self._result_key}_threshold": self._threshold,
|
|
262
235
|
f"{self._result_key}_reason": reason,
|
|
263
|
-
"
|
|
236
|
+
f"{self._result_key}_details": llm_output.get("details", {}),
|
|
237
|
+
f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
|
|
238
|
+
f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
|
|
239
|
+
f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
|
|
240
|
+
f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
|
|
241
|
+
f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
|
|
242
|
+
f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
|
|
243
|
+
f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
|
|
264
244
|
}
|
|
265
245
|
return response_dict
|
|
266
246
|
|
|
@@ -275,105 +255,21 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
|
|
|
275
255
|
async def _real_call(self, **kwargs):
|
|
276
256
|
"""The asynchronous call where real end-to-end evaluation logic is performed.
|
|
277
257
|
|
|
278
|
-
:keyword kwargs: The inputs to evaluate
|
|
258
|
+
:keyword kwargs: The inputs to evaluate
|
|
279
259
|
:type kwargs: Dict
|
|
280
|
-
:return: The evaluation result
|
|
260
|
+
:return: The evaluation result
|
|
281
261
|
:rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
|
|
282
262
|
"""
|
|
283
263
|
# Convert inputs into list of evaluable inputs.
|
|
284
264
|
eval_input = self._convert_kwargs_to_eval_input(**kwargs)
|
|
285
265
|
if isinstance(eval_input, dict) and eval_input.get("error_message"):
|
|
286
266
|
# If there is an error message, return not applicable result
|
|
287
|
-
return self._not_applicable_result(eval_input.get("error_message"))
|
|
267
|
+
return self._not_applicable_result(eval_input.get("error_message"), self.threshold)
|
|
288
268
|
# Do the evaluation
|
|
289
269
|
result = await self._do_eval(eval_input)
|
|
290
270
|
# Return the result
|
|
291
271
|
return result
|
|
292
272
|
|
|
293
|
-
def _not_applicable_result(self, error_message):
|
|
294
|
-
"""Return a result indicating that the tool call is not applicable for evaluation.
|
|
295
|
-
:param eval_input: The input to the evaluator.
|
|
296
|
-
:type eval_input: Dict
|
|
297
|
-
:return: A dictionary containing the result of the evaluation.
|
|
298
|
-
:rtype: Dict[str, Union[str, float]]
|
|
299
|
-
"""
|
|
300
|
-
# If no tool calls were made or tool call type is not supported, return not applicable result
|
|
301
|
-
return {
|
|
302
|
-
self._result_key: self._NOT_APPLICABLE_RESULT,
|
|
303
|
-
f"{self._result_key}_result": "pass",
|
|
304
|
-
f"{self._result_key}_threshold": self.threshold,
|
|
305
|
-
f"{self._result_key}_reason": error_message,
|
|
306
|
-
"details": {},
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
|
|
310
|
-
"""Extract the tool definitions that are needed for the provided tool calls."""
|
|
311
|
-
needed_tool_definitions = []
|
|
312
|
-
|
|
313
|
-
# Add all user-provided tool definitions
|
|
314
|
-
needed_tool_definitions.extend(tool_definitions)
|
|
315
|
-
|
|
316
|
-
# Add the needed built-in tool definitions (if they are called)
|
|
317
|
-
built_in_definitions = _get_needed_built_in_definitions(tool_calls)
|
|
318
|
-
needed_tool_definitions.extend(built_in_definitions)
|
|
319
|
-
|
|
320
|
-
# OpenAPI tool is a collection of functions, so we need to expand it
|
|
321
|
-
tool_definitions_expanded = list(
|
|
322
|
-
chain.from_iterable(
|
|
323
|
-
tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
|
|
324
|
-
for tool in needed_tool_definitions
|
|
325
|
-
)
|
|
326
|
-
)
|
|
327
|
-
|
|
328
|
-
# Validate that all tool calls have corresponding definitions
|
|
329
|
-
for tool_call in tool_calls:
|
|
330
|
-
if isinstance(tool_call, dict):
|
|
331
|
-
tool_type = tool_call.get("type")
|
|
332
|
-
|
|
333
|
-
if tool_type == "tool_call":
|
|
334
|
-
tool_name = tool_call.get("name")
|
|
335
|
-
if tool_name and tool_name in _BUILT_IN_DESCRIPTIONS:
|
|
336
|
-
# This is a built-in tool from converter, already handled above
|
|
337
|
-
continue
|
|
338
|
-
elif tool_name:
|
|
339
|
-
# This is a regular function tool from converter
|
|
340
|
-
tool_definition_exists = any(
|
|
341
|
-
tool.get("name") == tool_name and tool.get("type", "function") == "function"
|
|
342
|
-
for tool in tool_definitions_expanded
|
|
343
|
-
)
|
|
344
|
-
if not tool_definition_exists:
|
|
345
|
-
raise EvaluationException(
|
|
346
|
-
message=f"Tool definition for {tool_name} not found",
|
|
347
|
-
blame=ErrorBlame.USER_ERROR,
|
|
348
|
-
category=ErrorCategory.INVALID_VALUE,
|
|
349
|
-
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
|
|
350
|
-
)
|
|
351
|
-
else:
|
|
352
|
-
raise EvaluationException(
|
|
353
|
-
message=f"Tool call missing name: {tool_call}",
|
|
354
|
-
blame=ErrorBlame.USER_ERROR,
|
|
355
|
-
category=ErrorCategory.INVALID_VALUE,
|
|
356
|
-
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
|
|
357
|
-
)
|
|
358
|
-
else:
|
|
359
|
-
# Unsupported tool format - only converter format is supported
|
|
360
|
-
raise EvaluationException(
|
|
361
|
-
message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
|
|
362
|
-
blame=ErrorBlame.USER_ERROR,
|
|
363
|
-
category=ErrorCategory.INVALID_VALUE,
|
|
364
|
-
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
|
|
365
|
-
)
|
|
366
|
-
else:
|
|
367
|
-
# Tool call is not a dictionary
|
|
368
|
-
raise EvaluationException(
|
|
369
|
-
message=f"Tool call is not a dictionary: {tool_call}",
|
|
370
|
-
blame=ErrorBlame.USER_ERROR,
|
|
371
|
-
category=ErrorCategory.INVALID_VALUE,
|
|
372
|
-
target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
|
|
373
|
-
)
|
|
374
|
-
|
|
375
|
-
return needed_tool_definitions
|
|
376
|
-
|
|
377
273
|
@override
|
|
378
274
|
def __call__( # pylint: disable=docstring-missing-param
|
|
379
275
|
self,
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from ._tool_input_accuracy import _ToolInputAccuracyEvaluator
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"_ToolInputAccuracyEvaluator",
|
|
9
|
+
]
|