azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as potentially problematic.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
- azure/ai/evaluation/_evaluate/_utils.py +5 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +4 -3
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +655 -2665
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
Diff of azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py (+214 -187). Lines removed from 1.9.0 are prefixed with "-", lines added in 1.11.0 with "+"; several removed lines were truncated by the diff viewer and are reproduced as shown.

--- a/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -1,33 +1,75 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from itertools import chain
 import math
 import os
 import logging
 import re
-from typing import Dict, List, Union, TypeVar,
+from typing import Dict, List, Union, TypeVar, Optional
 from typing_extensions import overload, override
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
-from azure.ai.evaluation.
-
-
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
+from ..._common.utils import check_score_is_valid
 from azure.ai.evaluation._common._experimental import experimental
+from ..._converters._models import (
+    _BUILT_IN_DESCRIPTIONS,
+    _BUILT_IN_PARAMS,
+)

 logger = logging.getLogger(__name__)

 T_EvalValue = TypeVar("T_EvalValue")


+def _get_built_in_definition(tool_name: str):
+    """Get the definition for the built-in tool."""
+    if tool_name in _BUILT_IN_DESCRIPTIONS:
+        return {
+            "type": tool_name,
+            "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+            "name": tool_name,
+            "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+        }
+    return None
+
+
+def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
+    """Extract tool definitions needed for the given built-in tool calls."""
+    needed_definitions = []
+    for tool_call in tool_calls:
+        if isinstance(tool_call, dict):
+            tool_type = tool_call.get("type")
+
+            # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+            if tool_type == "tool_call":
+                tool_name = tool_call.get("name")
+                if tool_name in _BUILT_IN_DESCRIPTIONS:
+                    built_in_def = _get_built_in_definition(tool_name)
+                    if built_in_def and built_in_def not in needed_definitions:
+                        needed_definitions.append(built_in_def)
+
+    return needed_definitions
+
+
 @experimental
 class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
-        - Relevance to the conversation
-        - Parameter correctness according to tool definitions
-        - Parameter value extraction from the conversation
+        - Relevance to the conversation.
+        - Parameter correctness according to tool definitions.
+        - Parameter value extraction from the conversation.

-    The evaluator uses a
-    - Score
-    - Score
+    The evaluator uses a scoring rubric of 1 to 5:
+        - Score 1: The tool calls are irrelevant
+        - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed.
+        - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made.
+        - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded.
+        - Score 5: The tool calls are relevant, and all parameters were correctly passed.

     This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing
     user needs while properly following tool definitions and using information present in the
@@ -64,22 +106,34 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """

     _PROMPTY_FILE = "tool_call_accuracy.prompty"
-    _RESULT_KEY = "
-
+    _RESULT_KEY = "tool_call_accuracy"
+
+    _MAX_TOOL_CALL_ACCURACY_SCORE = 5
+    _MIN_TOOL_CALL_ACCURACY_SCORE = 1
+    _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3

-
-
-
+    _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
+    _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
+    _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
+    _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."

-
+    _LLM_SCORE_KEY = "tool_calls_success_level"
+
+    id = "azureai://built-in/evaluators/tool_call_accuracy"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )

     @overload
     def __call__(
@@ -134,84 +188,45 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         """
         # TODO add warning that only tool calls of type function are supported
         # Collect inputs
-        tool_calls = kwargs.get("tool_calls"
-        tool_definitions = kwargs.get("tool_definitions")
-        query = kwargs.get("query"
-        response = kwargs.get("response"
-
-        if response is None and tool_calls is None:
-            raise EvaluationException(
-                message="Either response or tool_calls must be provided.",
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            )
-
-        if tool_definitions is None:
-            raise EvaluationException(
-                message="Tool definitions must be provided.",
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            )
-
+        tool_calls = kwargs.get("tool_calls")
+        tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
+        query = kwargs.get("query")
+        response = kwargs.get("response")
         # TODO : Support classes that represents tool calls, messages etc once client side definitions are available
-        if
-
-
-
-
-
-
-                    [content for content in message.get("content") if content.get("type") == "tool_call"]
-                )
-            if len(tool_calls) == 0:
-                raise EvaluationException(
-                    message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
-                    blame=ErrorBlame.USER_ERROR,
-                    category=ErrorCategory.MISSING_FIELD,
-                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                )
+        if response:
+            parsed_tool_calls = self._parse_tools_from_response(response)
+            if parsed_tool_calls:
+                tool_calls = parsed_tool_calls
+
+        if not tool_calls:
+            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}

         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
-
         if not isinstance(tool_definitions, list):
-            tool_definitions = [tool_definitions]
-
-
-
-
-
-            if (
-
-            ):  # TODO assuming dict here but it can be a class
-                function_name = tool_call.get("name")
-                tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name]
-                if len(tool_definition) > 0:
-                    tool_definition = tool_definition
-                else:
-                    raise EvaluationException(
-                        message="Tool definition not found",
-                        blame=ErrorBlame.USER_ERROR,
-                        category=ErrorCategory.INVALID_VALUE,
-                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                    )
-                eval_inputs.append({"query": query, "tool_call": tool_call, "tool_definition": tool_definition})
+            tool_definitions = [tool_definitions] if tool_definitions else []
+
+        try:
+            needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
+        except EvaluationException as e:
+            # Check if this is because no tool definitions were provided at all
+            if len(tool_definitions) == 0:
+                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
             else:
-
-                    message="Tool definition not found",
-                    blame=ErrorBlame.USER_ERROR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                )
+                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

-
+        if len(needed_tool_definitions) == 0:
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+
+        return {
+            "query": query,
+            "tool_calls": tool_calls,
+            "tool_definitions": needed_tool_definitions,
+        }

     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
-        """Do a
-
+        """Do a tool call accuracy evaluation.
         :param eval_input: The input to the evaluator. Expected to contain
         whatever inputs are needed for the _flow method, including context
         and other fields depending on the child class.
@@ -219,23 +234,43 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Single LLM call for all tool calls
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

-
-
-        score
-
-
-
-
-
-
-
-
-
-
+        if isinstance(llm_output, dict):
+            score = llm_output.get(self._LLM_SCORE_KEY, None)
+            if not score or not check_score_is_valid(
+                score,
+                ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE,
+                ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE,
+            ):
+                raise EvaluationException(
+                    message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
+                    internal_message="Invalid score value.",
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    blame=ErrorBlame.SYSTEM_ERROR,
+                )
+
+            # Format the output
+            reason = llm_output.get("chain_of_thought", "")
+            score = float(score)
+            score_result = "pass" if score >= self.threshold else "fail"
+            response_dict = {
+                self._result_key: score,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_reason": reason,
+                "details": llm_output.get("details", {}),
+            }
+            return response_dict
+
+        else:
+            raise EvaluationException(
+                message="Tool call accuracy evaluator returned invalid output.",
+                blame=ErrorBlame.SYSTEM_ERROR,
+                category=ErrorCategory.FAILED_EXECUTION,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+            )

     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -246,106 +281,98 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
-
-        if
-            return
-
-
-
-
-
-
-        per_turn_results = []
-        # Evaluate all inputs.
-        for eval_input in eval_input_list:
-            if self._is_applicable_tool(eval_input):
-                per_turn_results.append(await self._do_eval(eval_input))
-            else:
-                per_turn_results.append(self._not_applicable_result(eval_input))
-
-        return self._aggregate_results(per_turn_results=per_turn_results)
-
-    def _is_applicable_tool(self, eval_input):
-        """Determine if a given tool should be evaluated, since we only evaluate tools that
-        have sufficient context available.
-
-        :type eval_input: Dict
-        :return: True if the tool call should be evaluated
-        :rtype: bool
-        """
-        tool_definition = eval_input.get("tool_definition")
-        if tool_definition is None or len(tool_definition) != 1:
-            return False
-        tool_type = tool_definition[0].get("type")
-        if tool_type is None or tool_type != "function":
-            return False
-        return True
-
-    def _not_applicable_result(self, eval_input):
+        eval_input = self._convert_kwargs_to_eval_input(**kwargs)
+        if isinstance(eval_input, dict) and eval_input.get("error_message"):
+            # If there is an error message, return not applicable result
+            return self._not_applicable_result(eval_input.get("error_message"))
+        # Do the evaluation
+        result = await self._do_eval(eval_input)
+        # Return the result
+        return result
+
+    def _not_applicable_result(self, error_message):
         """Return a result indicating that the tool call is not applicable for evaluation.
-
         :param eval_input: The input to the evaluator.
         :type eval_input: Dict
         :return: A dictionary containing the result of the evaluation.
        :rtype: Dict[str, Union[str, float]]
         """
+        # If no tool calls were made or tool call type is not supported, return not applicable result
         return {
-
-            f"{self._result_key}
-            "
+            self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_result": "pass",
+            f"{self._result_key}_threshold": self.threshold,
+            f"{self._result_key}_reason": error_message,
+            "details": {},
         }

-    def
-        """
+    def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
+        """Extract the tool definitions that are needed for the provided tool calls."""
+        needed_tool_definitions = []

-
-
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)

-
-
-
-        means as top-level values in the dictionary, and all original
-        values (including non-numerics) located in under the "evaluation_per_turn" key,
-        which each sub-key being a metric and each sub-value being a the list of that metric's
-        per-turn values.
-        :rtype: AggregateResult[T_EvalValue]
-        """
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = _get_needed_built_in_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)

-
-
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )

-        #
-
+        # Validate that all tool calls have corresponding definitions
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and tool_name in _BUILT_IN_DESCRIPTIONS:
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        )
+                else:
+                    # Unsupported tool format - only converter format is supported
+                    raise EvaluationException(
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                    )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                )

-
-            [
-                per_turn_result
-                for per_turn_result in per_turn_results
-                if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT
-            ]
-        )
-        if num_evaluated == 0:
-            # None of the invoked tools were applicable, return not applicable result
-            # (If a tool fails evaluation, we'll throw an exception)
-            return {
-                self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
-                f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
-                f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
-                f"{self._AGGREGATE_RESULT_KEY}_reason": "Tool call accuracy evaluation is not yet supported for the invoked tools.",
-                "per_tool_call_details": [],
-            }
-        # ignore not_applicable results, where the _result_key will be "not applicable"
-        score = (
-            sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results]) / num_evaluated
-        )
-        aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f"{self._AGGREGATE_RESULT_KEY}_result"] = (
-            self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
-        )
-        aggregated[f"{self._AGGREGATE_RESULT_KEY}_threshold"] = self.threshold
-        aggregated["per_tool_call_details"] = per_turn_results
-        return aggregated
+        return needed_tool_definitions

     @override
     def __call__(  # pylint: disable=docstring-missing-param