azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_evaluate.py +150 -40
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +2 -0
- azure/ai/evaluation/_evaluate/_utils.py +1 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +30 -6
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +1 -1
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +54 -2
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +1 -1
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +1 -1
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -10
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +169 -186
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +101 -23
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -2
- azure/ai/evaluation/red_team/_red_team.py +838 -478
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +8 -3
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/simulator/_adversarial_simulator.py +5 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +2 -2
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +20 -2
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +32 -3
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +64 -63
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

@@ -8,9 +8,13 @@ import re
 from typing import Dict, List, Union, TypeVar, cast
 from typing_extensions import overload, override
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
-from azure.ai.evaluation.…
-…
-…
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
+from ..._common.utils import check_score_is_valid
 from azure.ai.evaluation._common._experimental import experimental

 logger = logging.getLogger(__name__)
@@ -21,13 +25,16 @@ T_EvalValue = TypeVar("T_EvalValue")
 @experimental
 class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
-        - Relevance to the conversation
-        - Parameter correctness according to tool definitions
-        - Parameter value extraction from the conversation
+        - Relevance to the conversation.
+        - Parameter correctness according to tool definitions.
+        - Parameter value extraction from the conversation.

-    The evaluator uses a …
-        - Score …
-        - Score …
+    The evaluator uses a scoring rubric of 1 to 5:
+        - Score 1: The tool calls are irrelevant
+        - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed.
+        - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made.
+        - Score 4: The tool calls are relevant, but some tools returned errors and agent retried calling them again and succeeded.
+        - Score 5: The tool calls are relevant, and all parameters were correctly passed.

     This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing
     user needs while properly following tool definitions and using information present in the
|
|
|
64
71
|
"""
|
|
65
72
|
|
|
66
73
|
_PROMPTY_FILE = "tool_call_accuracy.prompty"
|
|
67
|
-
_RESULT_KEY = "
|
|
68
|
-
_AGGREGATE_RESULT_KEY = "tool_call_accuracy"
|
|
74
|
+
_RESULT_KEY = "tool_call_accuracy"
|
|
69
75
|
|
|
70
|
-
_MAX_TOOL_CALL_ACCURACY_SCORE =
|
|
71
|
-
_MIN_TOOL_CALL_ACCURACY_SCORE =
|
|
72
|
-
_DEFAULT_TOOL_CALL_ACCURACY_SCORE =
|
|
76
|
+
_MAX_TOOL_CALL_ACCURACY_SCORE = 5
|
|
77
|
+
_MIN_TOOL_CALL_ACCURACY_SCORE = 1
|
|
78
|
+
_DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3
|
|
73
79
|
|
|
74
|
-
|
|
80
|
+
_NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
|
|
81
|
+
_NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
|
|
82
|
+
_TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
|
|
83
|
+
_INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
|
|
84
|
+
|
|
85
|
+
_LLM_SCORE_KEY = "tool_calls_success_level"
|
|
86
|
+
|
|
87
|
+
id = "azureai://built-in/evaluators/tool_call_accuracy"
|
|
75
88
|
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""
|
|
76
89
|
|
|
77
90
|
@override
|
|
@@ -79,7 +92,12 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
|
|
|
79
92
|
current_dir = os.path.dirname(__file__)
|
|
80
93
|
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
|
|
81
94
|
self.threshold = threshold
|
|
82
|
-
super().__init__(
|
|
95
|
+
super().__init__(
|
|
96
|
+
model_config=model_config,
|
|
97
|
+
prompty_file=prompty_path,
|
|
98
|
+
result_key=self._RESULT_KEY,
|
|
99
|
+
**kwargs,
|
|
100
|
+
)
|
|
83
101
|
|
|
84
102
|
@overload
|
|
85
103
|
def __call__(
|
|
@@ -134,84 +152,43 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         """
         # TODO add warning that only tool calls of type function are supported
         # Collect inputs
-        tool_calls = kwargs.get("tool_calls"…
+        tool_calls = kwargs.get("tool_calls")
         tool_definitions = kwargs.get("tool_definitions")
-        query = kwargs.get("query"…
-        response = kwargs.get("response"…
-
-        if response is None and tool_calls is None:
-            raise EvaluationException(
-                message="Either response or tool_calls must be provided.",
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            )
-
-        if tool_definitions is None:
-            raise EvaluationException(
-                message="Tool definitions must be provided.",
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            )
+        query = kwargs.get("query")
+        response = kwargs.get("response")

         # TODO : Support classes that represents tool calls, messages etc once client side definitions are available
-        if …
-        …
-        if len(tool_calls) == 0:
-            raise EvaluationException(
-                message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            )
+        if response:
+            parsed_tool_calls = self._parse_tools_from_response(response)
+            if parsed_tool_calls:
+                tool_calls = parsed_tool_calls
+
+        if not tool_calls:
+            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+        if not tool_definitions or len(tool_definitions) == 0:
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
-
         if not isinstance(tool_definitions, list):
             tool_definitions = [tool_definitions]

-        …
-        ):  # TODO assuming dict here but it can be a class
-            function_name = tool_call.get("name")
-            tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name]
-            if len(tool_definition) > 0:
-                tool_definition = tool_definition…
-            else:
-                raise EvaluationException(
-                    message="Tool definition not found",
-                    blame=ErrorBlame.USER_ERROR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                )
-            eval_inputs.append({"query": query, "tool_call": tool_call, "tool_definition": tool_definition})
-        else:
-            raise EvaluationException(
-                message="Tool definition not found",
-                blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.INVALID_VALUE,
-                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            )
+        try:
+            needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
+        except EvaluationException as e:
+            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+        if len(needed_tool_definitions) == 0:
+            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

-        return …
+        return {
+            "query": query,
+            "tool_calls": tool_calls,
+            "tool_definitions": needed_tool_definitions,
+        }

     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
-        """Do a …
-        …
+        """Do a tool call accuracy evaluation.
         :param eval_input: The input to the evaluator. Expected to contain
             whatever inputs are needed for the _flow method, including context
             and other fields depending on the child class.
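The rework above replaces the old per-tool-call exceptions with an error marker that flows into a "not applicable" result, and bundles all tool calls into a single evaluation input. A sketch of the two shapes, with key names taken from this hunk and illustrative values:

```python
# Sketch only: key names come from this hunk, the concrete values are made up.

# Missing inputs no longer raise; the method returns an error marker that
# _real_call later converts into a "not applicable" result.
error_input = {"error_message": "No tool calls found in response or provided tool_calls."}

# With tool calls and matching definitions, everything is bundled into one eval input
# that is scored with a single LLM call.
eval_input = {
    "query": "What is the weather in Seattle?",
    "tool_calls": [
        {"type": "tool_call", "name": "fetch_weather", "arguments": {"location": "Seattle"}},
    ],
    "tool_definitions": [
        {"name": "fetch_weather", "type": "function", "description": "Fetches weather for a location."},
    ],
}
```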
@@ -219,23 +196,43 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        # Single LLM call for all tool calls
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

-        …
-        score…
-        …
+        if isinstance(llm_output, dict):
+            score = llm_output.get(self._LLM_SCORE_KEY, None)
+            if not score or not check_score_is_valid(
+                score,
+                ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE,
+                ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE,
+            ):
+                raise EvaluationException(
+                    message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
+                    internal_message="Invalid score value.",
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    blame=ErrorBlame.SYSTEM_ERROR,
+                )
+
+            # Format the output
+            reason = llm_output.get("chain_of_thought", "")
+            score = float(score)
+            score_result = "pass" if score >= self.threshold else "fail"
+            response_dict = {
+                self._result_key: score,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_reason": reason,
+                "details": llm_output.get("details", {}),
+            }
+            return response_dict
+
+        else:
+            raise EvaluationException(
+                message="Tool call accuracy evaluator returned invalid output.",
+                blame=ErrorBlame.SYSTEM_ERROR,
+                category=ErrorCategory.FAILED_EXECUTION,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+            )

     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
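`_do_eval` now validates the score returned under `tool_calls_success_level` and reshapes the prompty output into the result keys. A hedged sketch of that mapping, assuming a threshold of 3 and an illustrative prompty output:

```python
# Illustrative prompty output (shape follows the keys read in this hunk).
llm_output = {
    "chain_of_thought": "Let's think step by step: the correct tool was called once with grounded parameters.",
    "tool_calls_success_level": 5,
    "details": {"tool_calls_made_by_agent": 1, "correct_tool_calls_made_by_agent": 1},
}

# Roughly what _do_eval returns for it, assuming threshold=3.
formatted_result = {
    "tool_call_accuracy": 5.0,
    "tool_call_accuracy_result": "pass",  # score >= threshold
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_reason": llm_output["chain_of_thought"],
    "details": llm_output["details"],
}
```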
@@ -246,106 +243,92 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
-        …
-        if …
-            return …
-        …
-        per_turn_results = []
-        # Evaluate all inputs.
-        for eval_input in eval_input_list:
-            if self._is_applicable_tool(eval_input):
-                per_turn_results.append(await self._do_eval(eval_input))
-            else:
-                per_turn_results.append(self._not_applicable_result(eval_input))
-
-        return self._aggregate_results(per_turn_results=per_turn_results)
-
-    def _is_applicable_tool(self, eval_input):
-        """Determine if a given tool should be evaluated, since we only evaluate tools that
-        have sufficient context available.
-
-        :type eval_input: Dict
-        :return: True if the tool call should be evaluated
-        :rtype: bool
-        """
-        tool_definition = eval_input.get("tool_definition")
-        if tool_definition is None or len(tool_definition) != 1:
-            return False
-        tool_type = tool_definition[0].get("type")
-        if tool_type is None or tool_type != "function":
-            return False
-        return True
-
-    def _not_applicable_result(self, eval_input):
+        eval_input = self._convert_kwargs_to_eval_input(**kwargs)
+        if isinstance(eval_input, dict) and eval_input.get("error_message"):
+            # If there is an error message, return not applicable result
+            return self._not_applicable_result(eval_input.get("error_message"))
+        # Do the evaluation
+        result = await self._do_eval(eval_input)
+        # Return the result
+        return result
+
+    def _not_applicable_result(self, error_message):
         """Return a result indicating that the tool call is not applicable for evaluation.
-
         :param eval_input: The input to the evaluator.
         :type eval_input: Dict
         :return: A dictionary containing the result of the evaluation.
         :rtype: Dict[str, Union[str, float]]
         """
+        # If no tool calls were made or tool call type is not supported, return not applicable result
         return {
-            …
-            f"{self._result_key}…
-            "…
+            self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_result": "pass",
+            f"{self._result_key}_threshold": self.threshold,
+            f"{self._result_key}_reason": error_message,
+            "details": {},
         }

-    def …
-        """…
-        …
-        :param per_turn_results: List of evaluation results for each turn in the conversation.
-        :type per_turn_results: List[Dict]
-        :return: A dictionary containing aggregated results, with numeric metrics having their
-            means as top-level values in the dictionary, and all original
-            values (including non-numerics) located in under the "evaluation_per_turn" key,
-            which each sub-key being a metric and each sub-value being a the list of that metric's
-            per-turn values.
-        :rtype: AggregateResult[T_EvalValue]
+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
         """
-        …
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
+    def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
+        """Extract the tool definitions that are needed for the provided tool calls.
+        :param tool_calls: List of tool calls to evaluate.
+        :type tool_calls: List[dict]
+        :param tool_definitions: List of tool definitions to use for evaluation.
+        :type tool_definitions: List[dict]
+        :return: List of tool definitions that are needed for the provided tool calls.
+        :rtype: List[dict]
+        """
+        needed_tool_definitions = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
+                tool_name = tool_call.get("name")
+                tool_definition = [
+                    tool
+                    for tool in tool_definitions
+                    if tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                ]
+                if len(tool_definition) > 0:
+                    needed_tool_definitions.extend(tool_definition)
+                else:
+                    raise EvaluationException(
+                        message=f"Tool definition for {tool_name} not found",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                    )
+        return needed_tool_definitions

     @override
     def __call__(  # pylint: disable=docstring-missing-param
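The new `_parse_tools_from_response` helper walks an agent response given as a list of messages and pairs each `tool_call` content item with the `tool_result` from the matching tool message. A small illustrative input and the pairing it would produce (message shapes inferred from the checks in the helper, values invented):

```python
# Illustrative response; role/content shapes follow the checks in _parse_tools_from_response.
response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "fetch_weather",
                "arguments": {"location": "Seattle"},
            },
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": {"weather": "Rainy, 14 C"}}],
    },
]

# The helper would return the tool_call item with the matching result attached:
# [{"type": "tool_call", "tool_call_id": "call_1", "name": "fetch_weather",
#   "arguments": {"location": "Seattle"}, "tool_result": {"weather": "Rainy, 14 C"}}]
```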
azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty

@@ -5,20 +5,20 @@ model:
   api: chat
   parameters:
     temperature: 0.0
-    max_tokens: …
+    max_tokens: 3000
     top_p: 1.0
     presence_penalty: 0
     frequency_penalty: 0
     response_format:
-      type: …
+      type: json_object

 inputs:
   query:
-    type: …
-  …
-    type: …
-  …
-    type: …
+    type: List
+  tool_calls:
+    type: List
+  tool_definitions:
+    type: Dict

 ---
 system:
@@ -27,7 +27,7 @@ system:
 ### Your are an expert in evaluating the accuracy of a tool call considering relevance and potential usefulness including syntactic and semantic correctness of a proposed tool call from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
 - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
 - **Data**: Your input data include CONVERSATION , TOOL CALL and TOOL DEFINITION.
-- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways, and you need to be very precise in your evaluation.

 user:
 # Definition
@@ -42,30 +42,108 @@ user:


 # Ratings
-## [Tool Call Accuracy: …
+## [Tool Call Accuracy: 1] (Irrelevant)
 **Definition:**
-…
-…
-3. TOOL CALL has parameters that is not present in TOOL DEFINITION.
+Tool calls were not relevant to the user's query, resulting in an irrelevant or unhelpful final output.
+This level is a 'fail'.

-…
+**Example:**
+The user's query is asking for most popular hotels in New York, but the agent calls a tool that does search in local files on a machine. This tool is not relevant to the user query, so this case is a Level 1 'fail'.
+
+
+## [Tool Call Accuracy: 2] (Partially Relevant - No correct output)
+**Definition:**
+Tool calls were somewhat related to the user's query, but the agent was not able to reach a final output that addresses the user query due to one or more of the following:
+• Tools returned errors, and no retrials for the tool call were successful.
+• Parameters passed to the tool were incorrect.
+• Not enough tools were called to fully address the query (missing tool calls).
+This level is a 'fail'.
+
+**Example:**
+The user asks for the coordinates of Chicago. The agent calls the correct tool that retrieves the coordinates -which is the relevant tool for the user query- but passes 'New York' instead of 'Chicago' as the parameter to the tool. So this is a Level 2 'fail'.
+
+**Example:**
+The user asks for the coordinates of Chicago. The agent calls the correct tool that retrieves the coordinates -which is the relevant tool for the user query- and passes 'Chicago' as the parameter to the tool which is also correct, but the tool returns an error so the agent can't reach the correct answer to the user's query. This is a Level 2 'fail'.
+
+**Example:**
+The user asks a question that needs 3 tool calls for it to be answered. The agent calls only one of the three required tool calls. So this case is a Level 2 'fail'.
+
+
+## [Tool Call Accuracy: 3] (Slightly Correct - Reached Output)
 **Definition:**
-…
-…
-…
+Tool calls were relevant, correct and grounded parameters were passed so that led to a correct output. However, multiple excessive, unnecessary tool calls were made.
+This level is a 'pass'.
+
+**Example:**
+The user asked to do a modification in the database. The agent called the tool multiple times, resulting in multiple modifications in the database instead of one. This is a level 3 'pass'.
+
+**Example:**
+The user asked for popular hotels in a certain place. The agent calls the same tool with the same parameters multiple times, even though a single tool call that returns an output is sufficient. So there were unnecessary tool calls. This is a Level 3 'pass'.
+
+
+## [Tool Call Accuracy: 4] (Mostly Correct - Reached output)
+**Definition:**
+Tool calls were fully relevant and efficient:
+• Correct tools were called with the correct and grounded parameters, whether they are extracted from the conversation history or the current user query.
+• A tool returned an error, but the agent retried calling the tool and successfully got an output.
+This level is a 'pass'.
+
+**Example:**
+The user asks for the weather forecast in a certain place. The agent calls the correct tool that retrieves the weather forecast with the correct parameters, but the tool returns an error. The agent re-calls the tool once again and it returns the correct output. This is a Level 4 'pass'.
+
+
+## [Tool Call Accuracy: 5] (Optimal Solution - Reached output)
+**Definition:**
+Tool calls were fully relevant and efficient:
+• Correct tools were called with the correct and grounded parameters, whether they are extracted from the conversation history or the current user query.
+• No unnecessary or excessive tool calls were made.
+• No errors occurred in any of the tools.
+• The agent was able to reach the final output that addresses the user's query without facing any issues.
+This level is a 'pass'.
+
+**Example:**
+The user asks for the distance between two places. The agent correctly calls the tools that retrieve the coordinates for the two places respectively, then calls the tool that calculates the distance between the two sets of coordinates, passing the correct arguments to all the tools, without calling other tools excessively or unnecessarily. This is the optimal solution for the user's query. This is a Level 5 'pass'.
+
+**Example:**
+The user asks for the distance between two places. The agent retrieves the needed coordinates from the outputs of the tool calls in the conversation history, and then correctly passes these coordinates to the tool that calculates the distance to output it to the user. This is also an optimal solution for the user's query. This is a Level 5 'pass'.
+
+
+
+# IMPORTANT NOTES
+- There is a clear distinction between 'pass' levels and 'fail' levels. The distinction is that the tools are called correctly in order to reach the required output. If the agent was not able to reach the final output that addresses the user query, it cannot be either of the 'pass' levels, and vice versa. It is crucial that you ensure you are rating the agent's response with the correct level based on the tool calls made to address the user's query.
+- "Correct output" means correct tool with the correct, grounded parameters. You are NOT concerned with the correctness of the result of the tool. As long as the parameters passed were correct and the tool did not return an error, then the tool output is correct and accurate.
+- Ensure that every single parameter that is passed to the tools is correct and grounded from the user query or the conversation history. If the agent passes incorrect parameters or completely makes them up, then this is a fail, even if somehow the agent reaches a correct result.

 # Data
 CONVERSATION : {{query}}
-TOOL …
+TOOL CALLS: {{tool_calls}}
 TOOL DEFINITION: {{tool_definition}}


 # Tasks
-## Please provide your …
-…
-- …
-- …
-…
+## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
+Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
+- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:', and think deeply and precisely about which level should be chosen based on the agent's tool calls and how they were able to address the user's query.
+- tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
+- details: a dictionary that contains the following keys:
+  - tool_calls_made_by_agent: total number of tool calls made by the agent
+  - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
+  - per_tool_call_details: a list of dictionaries, each containing:
+    - tool_name: name of the tool
+    - total_calls_required: total number of calls required for the tool
+    - correct_calls_made_by_agent: number of correct calls made by the agent
+    - correct_tool_percentage: percentage of correct calls made by the agent for this tool. It is a value between 0.0 and 1.0
+    - tool_call_errors: number of errors encountered during the tool call
+    - tool_success_result: 'pass' or 'fail' based on the evaluation of the tool call accuracy for this tool
+  - excess_tool_calls: a dictionary with the following keys:
+    - total: total number of excess, unnecessary tool calls made by the agent
+    - details: a list of dictionaries, each containing:
+      - tool_name: name of the tool
+      - excess_count: number of excess calls made for this query
+  - missing_tool_calls: a dictionary with the following keys:
+    - total: total number of missing tool calls that should have been made by the agent to be able to answer the query
+    - details: a list of dictionaries, each containing:
+      - tool_name: name of the tool
+      - missing_count: number of missing calls for this query

-## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
 # Output
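With `response_format` switched to `json_object`, the prompt now asks the model for a single JSON object with the keys listed above. A hypothetical output consistent with that description, shown here as a Python literal; all values are invented for illustration:

```python
# Hypothetical model output matching the requested JSON schema; all numbers are invented.
example_llm_output = {
    "chain_of_thought": "Let's think step by step: the agent called fetch_weather once with the "
    "location grounded in the user query and reached the final answer without errors.",
    "tool_calls_success_level": 5,
    "details": {
        "tool_calls_made_by_agent": 1,
        "correct_tool_calls_made_by_agent": 1,
        "per_tool_call_details": [
            {
                "tool_name": "fetch_weather",
                "total_calls_required": 1,
                "correct_calls_made_by_agent": 1,
                "correct_tool_percentage": 1.0,
                "tool_call_errors": 0,
                "tool_success_result": "pass",
            }
        ],
        "excess_tool_calls": {"total": 0, "details": []},
        "missing_tool_calls": {"total": 0, "details": []},
    },
}
```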
azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py

@@ -58,19 +58,26 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     for the ungrounded attributes will be "ungrounded_attributes_label".
     """

-    id = "ungrounded_attributes"
+    id = "azureai://built-in/evaluators/ungrounded_attributes"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )

     @overload
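The constructor change above makes `evaluate_query` default to True and forwards extra keyword arguments to the base class. A hedged construction sketch follows; it assumes the top-level `UngroundedAttributesEvaluator` export and uses placeholder project values:

```python
# Hypothetical construction sketch; project and credential values are placeholders.
from azure.ai.evaluation import UngroundedAttributesEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# evaluate_query is now enabled by default...
evaluator = UngroundedAttributesEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)

# ...and can still be turned off explicitly through the new **kwargs pass-through.
evaluator_without_query = UngroundedAttributesEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
    evaluate_query=False,
)
```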