azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +77 -33
- azure/ai/evaluation/_evaluate/_utils.py +4 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +697 -3067
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +32 -2
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +49 -41
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
CHANGED
@@ -1,11 +1,12 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from itertools import chain
 import math
 import os
 import logging
 import re
-from typing import Dict, List, Union, TypeVar,
+from typing import Dict, List, Union, TypeVar, Optional
 from typing_extensions import overload, override
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._exceptions import (
@@ -16,12 +17,46 @@ from azure.ai.evaluation._exceptions import (
 )
 from ..._common.utils import check_score_is_valid
 from azure.ai.evaluation._common._experimental import experimental
+from ..._converters._models import (
+    _BUILT_IN_DESCRIPTIONS,
+    _BUILT_IN_PARAMS,
+)

 logger = logging.getLogger(__name__)

 T_EvalValue = TypeVar("T_EvalValue")


+def _get_built_in_definition(tool_name: str):
+    """Get the definition for the built-in tool."""
+    if tool_name in _BUILT_IN_DESCRIPTIONS:
+        return {
+            "type": tool_name,
+            "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+            "name": tool_name,
+            "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+        }
+    return None
+
+
+def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
+    """Extract tool definitions needed for the given built-in tool calls."""
+    needed_definitions = []
+    for tool_call in tool_calls:
+        if isinstance(tool_call, dict):
+            tool_type = tool_call.get("type")
+
+            # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+            if tool_type == "tool_call":
+                tool_name = tool_call.get("name")
+                if tool_name in _BUILT_IN_DESCRIPTIONS:
+                    built_in_def = _get_built_in_definition(tool_name)
+                    if built_in_def and built_in_def not in needed_definitions:
+                        needed_definitions.append(built_in_def)
+
+    return needed_definitions
+
+
 @experimental
 class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
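The two helpers added above let the evaluator synthesize a definition for built-in tools that the converter reports only as tool calls. A minimal sketch of the input shape they expect, assuming a converter-format bing_custom_search call as in the comment above (the arguments shown are illustrative; the resulting description and parameter schema come from _BUILT_IN_DESCRIPTIONS and _BUILT_IN_PARAMS in azure/ai/evaluation/_converters/_models.py and are not reproduced here):

# Converter-format tool call for a built-in tool (illustrative arguments).
tool_calls = [
    {"type": "tool_call", "name": "bing_custom_search", "arguments": {"query": "site:learn.microsoft.com evaluators"}},
]

# _get_needed_built_in_definitions(tool_calls) would then return one synthesized definition shaped like:
# {"type": "bing_custom_search", "name": "bing_custom_search",
#  "description": _BUILT_IN_DESCRIPTIONS["bing_custom_search"],
#  "parameters": _BUILT_IN_PARAMS.get("bing_custom_search", {})}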
@@ -88,7 +123,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
@@ -96,6 +131,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            credential=credential,
             **kwargs,
         )

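With the change above, ToolCallAccuracyEvaluator now accepts an optional credential that is forwarded to the prompty-based base evaluator. A minimal usage sketch, assuming an Entra ID credential and placeholder Azure OpenAI endpoint/deployment values:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import AzureOpenAIModelConfiguration, ToolCallAccuracyEvaluator

# Placeholder endpoint and deployment names; replace with your own resource values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="<your-gpt-deployment>",
)

# credential is new in 1.11.0 and defaults to None; key-based auth via model_config still works.
tool_call_accuracy = ToolCallAccuracyEvaluator(
    model_config=model_config,
    credential=DefaultAzureCredential(),
)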
@@ -153,10 +189,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         # TODO add warning that only tool calls of type function are supported
         # Collect inputs
         tool_calls = kwargs.get("tool_calls")
-        tool_definitions = kwargs.get("tool_definitions")
+        tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
         query = kwargs.get("query")
         response = kwargs.get("response")
-
         # TODO : Support classes that represents tool calls, messages etc once client side definitions are available
         if response:
             parsed_tool_calls = self._parse_tools_from_response(response)
@@ -165,20 +200,23 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):

         if not tool_calls:
             return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
-        if not tool_definitions or len(tool_definitions) == 0:
-            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
         if not isinstance(tool_definitions, list):
-            tool_definitions = [tool_definitions]
+            tool_definitions = [tool_definitions] if tool_definitions else []

         try:
             needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
         except EvaluationException as e:
-
+            # Check if this is because no tool definitions were provided at all
+            if len(tool_definitions) == 0:
+                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+            else:
+                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
         if len(needed_tool_definitions) == 0:
-            return {"error_message": self.
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

         return {
             "query": query,
@@ -268,66 +306,72 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             "details": {},
         }

-    def _parse_tools_from_response(self, response):
-        """Parse the response to extract tool calls and results.
-        :param response: The response to parse.
-        :type response: Union[str, List[dict]]
-        :return: List of tool calls extracted from the response.
-        :rtype: List[dict]
-        """
-        tool_calls = []
-        tool_results_map = {}
-        if isinstance(response, list):
-            for message in response:
-                # Extract tool calls from assistant messages
-                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
-                    for content_item in message.get("content"):
-                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
-                            tool_calls.append(content_item)
-
-                # Extract tool results from tool messages
-                elif message.get("role") == "tool" and message.get("tool_call_id"):
-                    tool_call_id = message.get("tool_call_id")
-                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
-                        result_content = message.get("content")[0]
-                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
-                            tool_results_map[tool_call_id] = result_content
-
-        # Attach results to their corresponding calls
-        for tool_call in tool_calls:
-            tool_call_id = tool_call.get("tool_call_id")
-            if tool_call_id in tool_results_map:
-                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
-
-        return tool_calls
-
     def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
-        """Extract the tool definitions that are needed for the provided tool calls.
-        :param tool_calls: List of tool calls to evaluate.
-        :type tool_calls: List[dict]
-        :param tool_definitions: List of tool definitions to use for evaluation.
-        :type tool_definitions: List[dict]
-        :return: List of tool definitions that are needed for the provided tool calls.
-        :rtype: List[dict]
-        """
+        """Extract the tool definitions that are needed for the provided tool calls."""
         needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = _get_needed_built_in_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
         for tool_call in tool_calls:
-            if isinstance(tool_call, dict)
-
-
-
-
-                if
-
-
-
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and tool_name in _BUILT_IN_DESCRIPTIONS:
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        )
                 else:
+                    # Unsupported tool format - only converter format is supported
                     raise EvaluationException(
-                        message=f"
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
                         target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
                     )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                )
+
         return needed_tool_definitions

     @override
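Taken together, the rewritten validation above expects tool calls in the converter format and matches them against the supplied tool definitions plus any built-in definitions. A sketch of a call to the evaluator instance from the earlier sketch, with an illustrative weather tool; the tool name, parameters, and values are examples, not part of the package:

query = "What is the weather in Seattle?"

tool_calls = [
    {
        "type": "tool_call",
        "tool_call_id": "call_001",
        "name": "fetch_weather",
        "arguments": {"location": "Seattle"},
    }
]

tool_definitions = [
    {
        "name": "fetch_weather",
        "type": "function",
        "description": "Fetches the weather information for the specified location.",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string", "description": "City name."}},
        },
    }
]

# Returns the tool-call accuracy result; when a definition is missing the evaluator now
# returns an error_message (per the except-branch added above) instead of propagating the exception.
result = tool_call_accuracy(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)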
azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty
CHANGED
@@ -31,99 +31,116 @@ system:

 user:
 # Definition
-
-
+# Definition
+**Tool Call Accuracy** refers to the overall effectiveness of ALL TOOL CALLS made by an agent in response to a user's query within an ongoing CONVERSATION.
+
+# EVALUATION CRITERIA
+Evaluate based on these factors:
+
+1. **Collective Relevance**: Do the tool calls, taken together, appropriately address the user's query?
+2. **Parameter Correctness**: Are all parameter values extracted from or reasonably inferred from the CONVERSATION?
+   - *Fabricated parameters automatically result in Level 2*
+3. **Completeness**: Did the agent make all necessary tool calls available in the tool definitions?
+   - *Failed calls don't count as missing*
+4. **Efficiency**: Did the agent avoid unnecessary duplicate tool calls with identical parameters?
+   - *Don't penalize single tools returning multiple results (like file_search)*
+5. **Execution Success**: Were tool calls executed successfully or recovered from errors appropriately?
+6. **Scope Limitation**: ONLY evaluate tool calls in the "TOOL CALLS TO BE EVALUATED" section.
+   - Tool calls in the CONVERSATION section are for context only
+   - Focus exclusively on the agent's response to the user's LAST query
+   - Use conversation history only to verify parameter correctness and context
+
+**Success Criteria**: Tools should retrieve relevant data to help answer the query. Complete final answers are not required from individual tools.

-
-2. Parameter Appropriateness: Do the parameters used in the TOOL CALL match the TOOL DEFINITION and are the parameters relevant to the latest user's query?
-3. Parameter Value Correctness: Are the parameters values used in the TOOL CALL present or inferred by CONVERSATION and relevant to the latest user's query?
-4. Potential Value: Is the information this tool call might provide likely to be useful in advancing the conversation or addressing the user expressed or implied needs?
-5. Context Appropriateness: Does the tool call make sense at this point in the conversation, given what has been discussed so far?
+**Tool Assessment**: Focus solely on appropriate use of available tools, not on capabilities beyond what tools can provide.


 # Ratings
 ## [Tool Call Accuracy: 1] (Irrelevant)
 **Definition:**
 Tool calls were not relevant to the user's query, resulting in an irrelevant or unhelpful final output.
-This level is a 'fail'.

 **Example:**
-
+User asks for distance between two cities -> Agent calls a weather function to get the weather in the two cities.


-## [Tool Call Accuracy: 2] (Partially Relevant -
+## [Tool Call Accuracy: 2] (Partially Relevant - Wrong Execution)
 **Definition:**
-Tool calls were somewhat related to the user's query, but the agent was not able to reach
-•
-•
-•
-
+Tool calls were somewhat related to the user's query, but the agent was not able to reach information that helps address the user query due to one or more of the following:
+• Parameters passed to the tool were incorrect.
+• Not enough tools (available in the tool definitions) were called to fully help address the query (missing tool calls).
+• Tools returned errors, and no retrials for the tool call were successful.
+

 **Example:**
-The user asks for the coordinates of Chicago. The agent calls the
+The user asks for the coordinates of Chicago. The agent calls the tool that gets the coordinates but passes 'New York' instead of Chicago as parameter.

 **Example:**
-The user asks for the coordinates of Chicago. The agent calls the
+The user asks for the coordinates of Chicago. The agent calls the tool that gets the coordinates and passes 'Chicago' as the tool parameter, but the tool returns an error.

 **Example:**
-The user asks a question that needs 3 tool calls for it to be answered. The agent calls only one of the three required tool calls. So this case is a Level 2
+The user asks a question that needs 3 tool calls for it to be answered. The agent calls only one of the three required tool calls. So this case is a Level 2.


-## [Tool Call Accuracy: 3] (
+## [Tool Call Accuracy: 3] (Relevant but Inefficient)
 **Definition:**
 Tool calls were relevant, correct and grounded parameters were passed so that led to a correct output. However, multiple excessive, unnecessary tool calls were made.
-
+
+**Important**: Do NOT penalize built-in tools like file_search that naturally return multiple results in a single call. Only penalize when there are actually multiple separate tool call objects.

 **Example:**
-The user asked to do a modification in the database. The agent called the tool multiple times, resulting in multiple modifications in the database instead of one.
+The user asked to do a modification in the database. The agent called the tool multiple times, resulting in multiple modifications in the database instead of one.

 **Example:**
-The user asked for popular hotels in a certain place. The agent calls the same tool with the same parameters multiple times, even though a single tool call that returns an output is sufficient. So there were unnecessary tool calls.
+The user asked for popular hotels in a certain place. The agent calls the same tool with the same parameters multiple times, even though a single tool call that returns an output is sufficient. So there were unnecessary tool calls.


-## [Tool Call Accuracy: 4] (
+## [Tool Call Accuracy: 4] (Correct with Retrials)
 **Definition:**
 Tool calls were fully relevant and efficient:
 • Correct tools were called with the correct and grounded parameters, whether they are extracted from the conversation history or the current user query.
 • A tool returned an error, but the agent retried calling the tool and successfully got an output.
-This level is a 'pass'.

 **Example:**
-The user asks for the weather forecast in a certain place. The agent calls the correct tool that retrieves the weather forecast
+The user asks for the weather forecast in a certain place. The agent calls the correct tool that retrieves the weather forecast, but the tool returns an error. The agent re-calls the tool once again and it returns the correct output. This is a Level 4.


-## [Tool Call Accuracy: 5] (Optimal Solution
+## [Tool Call Accuracy: 5] (Optimal Solution)
 **Definition:**
 Tool calls were fully relevant and efficient:
-• Correct tools were called with the correct and grounded parameters, whether they are extracted from the conversation history or the current user query.
-• No unnecessary or excessive tool calls were made.
-• No errors occurred in any of the tools.
-• The
-This level is a 'pass'.
+• Correct tools were called with the correct and grounded parameters, whether they are extracted from the conversation history or the current user query.
+• No unnecessary or excessive tool calls were made.
+• No errors occurred in any of the tools.
+• The tool calls made helped the agent address the user's query without facing any issues.

 **Example:**
-The user asks for the distance between two places. The agent correctly calls the tools that retrieve the coordinates for the two places respectively, then calls the tool that calculates the distance between the two sets of coordinates, passing the correct arguments to all the tools, without calling other tools excessively or unnecessarily. This is the optimal solution for the user's query.
+The user asks for the distance between two places. The agent correctly calls the tools that retrieve the coordinates for the two places respectively, then calls the tool that calculates the distance between the two sets of coordinates, passing the correct arguments to all the tools, without calling other tools excessively or unnecessarily. This is the optimal solution for the user's query.

 **Example:**
-The user asks for the distance between two places. The agent retrieves the needed coordinates from the outputs of the tool calls in the conversation history, and then correctly passes these coordinates to the tool that calculates the distance to output it to the user. This is also an optimal solution for the user's query.
+The user asks for the distance between two places. The agent retrieves the needed coordinates from the outputs of the tool calls in the conversation history, and then correctly passes these coordinates to the tool that calculates the distance to output it to the user. This is also an optimal solution for the user's query.

+**Example:**
+The user asked to summarize a file on their SharePoint. The agent calls the sharepoint_grounding tool to retrieve the file. This retrieved file will help the agent fulfill the task of summarization. This is a Level 5.


-
-
-
-
+## Chain of Thought Structure
+Structure your reasoning as follows:
+1. **Start with the user's last query**: Understand well what the last message that is sent by the user is.
+2. **Identify relevant available tools**: Look into the TOOL DEFINITIONS and analyze which tools could help answer the user's last query in the conversation.
+3. **Analyze the actual tool calls made**: Compare what was done in the TOOL CALLS TO BE EVALUATED section vs. What should've been done by the agent.
+4. **Check parameter grounding** - Ensure all parameters are grounded from the CONVERSATION section and are not hallucinated.
+5. **Determine the appropriate level** - Be VERY precise and follow the level definitions exactly.

 # Data
 CONVERSATION : {{query}}
-TOOL CALLS: {{tool_calls}}
-TOOL
+TOOL CALLS TO BE EVALUATED: {{tool_calls}}
+TOOL DEFINITIONS: {{tool_definitions}}


 # Tasks
 ## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
 Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
-- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level. Start this string with 'Let's think step by step:'
+- chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'.
 - tool_calls_success_level: a integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
 - details: a dictionary that contains the following keys:
 - tool_calls_made_by_agent: total number of tool calls made by the agent
@@ -141,7 +158,7 @@ Your output should consist only of a JSON object, as provided in the examples, t
 - tool_name: name of the tool
 - excess_count: number of excess calls made for this query
 - missing_tool_calls: a dictionary with the following keys:
-- total: total number of missing tool calls that should have been made by the agent to be able to answer the query
+- total: total number of missing tool calls that should have been made by the agent to be able to answer the query, but were not made by the agent at all.
 - details: a list of dictionaries, each containing:
 - tool_name: name of the tool
 - missing_count: number of missing calls for this query
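For reference, the output contract described at the end of the prompty maps to a JSON object like the following sketch; the values are illustrative, and only the keys visible in this diff are shown (the hunk elides some of the per-call detail keys):

expected_judge_output = {
    "chain_of_thought": "Let's think step by step: the user asked for ...",
    "tool_calls_success_level": 4,
    "details": {
        "tool_calls_made_by_agent": 2,
        # ...other per-call detail keys are elided in this hunk...
        "missing_tool_calls": {
            "total": 0,
            "details": [],  # each entry: {"tool_name": ..., "missing_count": ...}
        },
    },
}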
azure/ai/evaluation/red_team/__init__.py
CHANGED
@@ -5,7 +5,7 @@
 try:
     from ._red_team import RedTeam
     from ._attack_strategy import AttackStrategy
-    from ._attack_objective_generator import RiskCategory
+    from ._attack_objective_generator import RiskCategory, SupportedLanguages
     from ._red_team_result import RedTeamResult
 except ImportError:
     raise ImportError(
@@ -18,4 +18,5 @@ __all__ = [
     "AttackStrategy",
     "RiskCategory",
     "RedTeamResult",
+    "SupportedLanguages",
 ]
azure/ai/evaluation/red_team/_attack_objective_generator.py
CHANGED
@@ -20,6 +20,23 @@ class RiskCategory(str, Enum):
     SelfHarm = "self_harm"
     ProtectedMaterial = "protected_material"
     CodeVulnerability = "code_vulnerability"
+    UngroundedAttributes = "ungrounded_attributes"
+    IndirectAttack = "indirect_attack"
+
+
+@experimental
+class SupportedLanguages(Enum):
+    """Supported languages for attack objectives, using ISO standard language codes."""
+
+    Spanish = "es"
+    Italian = "it"
+    French = "fr"
+    German = "de"
+    SimplifiedChinese = "zh-cn"
+    Portuguese = "pt"
+    Japanese = "ja"
+    English = "en"
+    Korean = "ko"


 @experimental
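The new SupportedLanguages enum is re-exported from azure.ai.evaluation.red_team (see the __init__.py hunk above). A small sketch of how its members resolve to ISO codes; how a language is passed into a red team scan is not shown in this diff, so only the enum itself is exercised here:

from azure.ai.evaluation.red_team import RiskCategory, SupportedLanguages

print(SupportedLanguages.SimplifiedChinese.value)  # "zh-cn"
print(SupportedLanguages.English.value)            # "en"
print(RiskCategory.IndirectAttack.value)           # "indirect_attack" (new in 1.11.0)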
azure/ai/evaluation/red_team/_callback_chat_target.py
CHANGED
@@ -19,6 +19,7 @@ class _CallbackChatTarget(PromptChatTarget):
         *,
         callback: Callable[[List[Dict], bool, Optional[str], Optional[Dict[str, Any]]], Dict],
         stream: bool = False,
+        prompt_to_context: Optional[Dict[str, str]] = None,
     ) -> None:
         """
         Initializes an instance of the _CallbackChatTarget class.
@@ -32,10 +33,12 @@ class _CallbackChatTarget(PromptChatTarget):
         Args:
             callback (Callable): The callback function that sends a prompt to a target and receives a response.
             stream (bool, optional): Indicates whether the target supports streaming. Defaults to False.
+            prompt_to_context (Optional[Dict[str, str]], optional): Mapping from prompt content to context. Defaults to None.
         """
         PromptChatTarget.__init__(self)
         self._callback = callback
         self._stream = stream
+        self._prompt_to_context = prompt_to_context or {}

     async def send_prompt_async(self, *, prompt_request: PromptRequestResponse) -> PromptRequestResponse:

@@ -48,8 +51,18 @@ class _CallbackChatTarget(PromptChatTarget):

         logger.info(f"Sending the following prompt to the prompt target: {request}")

+        # Get context for the current prompt if available
+        current_prompt_content = request.converted_value
+        context_data = self._prompt_to_context.get(current_prompt_content, "")
+        context_dict = {"context": context_data} if context_data else {}
+
+        # If context is not available via prompt_to_context, it can be fetched from the memory
+        if not context_dict:
+            memory_label_context = request.labels.get("context", None)
+            context_dict = {"context": memory_label_context} if memory_label_context else {}
+
         # response_context contains "messages", "stream", "session_state, "context"
-        response_context = await self._callback(messages=messages, stream=self._stream, session_state=None, context=
+        response_context = await self._callback(messages=messages, stream=self._stream, session_state=None, context=context_dict)  # type: ignore

         response_text = response_context["messages"][-1]["content"]
         response_entry = construct_response_from_request(request=request, response_text_pieces=[response_text])
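The new prompt_to_context mapping lets _CallbackChatTarget attach per-prompt context to the callback invocation, falling back to the request's memory labels when no mapping entry exists. A hedged sketch of a compatible callback and mapping, based only on the signature and call shown above; the echo logic and the prompt/context strings are illustrative placeholders:

from typing import Any, Dict, List, Optional

async def echo_callback(
    messages: List[Dict],
    stream: bool = False,
    session_state: Optional[str] = None,
    context: Optional[Dict[str, Any]] = None,
) -> Dict:
    # The target awaits this callback and reads response["messages"][-1]["content"].
    grounding = (context or {}).get("context", "")
    reply = "Echo: " + messages[-1]["content"] + (f" [context: {grounding}]" if grounding else "")
    messages.append({"role": "assistant", "content": reply})
    return {"messages": messages, "stream": stream, "session_state": session_state, "context": context}

# Keys are the exact prompt strings sent to the target; values are the context passed to the callback.
prompt_to_context = {"Summarize the attached incident report.": "incident report: ..."}

# Illustrative wiring of the internal class (not public API):
# target = _CallbackChatTarget(callback=echo_callback, stream=False, prompt_to_context=prompt_to_context)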