azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py (new file)
@@ -0,0 +1,301 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import math
+import os
+import logging
+from typing import Dict, Union, List, Optional
+from typing_extensions import overload, override
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._common._experimental import experimental
+
+
+logger = logging.getLogger(__name__)
+
+
+@experimental
+class _ToolSuccessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Success evaluator determines whether tool calls done by an AI agent includes failures or not.
+
+    This evaluator focuses solely on tool call results and tool definitions, disregarding user's query to
+    the agent, conversation history and agent's final response. Although tool definitions is optional,
+    providing them can help the evaluator better understand the context of the tool calls made by the
+    agent. Please note that this evaluator validates tool calls for potential technical failures like
+    errors, exceptions, timeouts and empty results (only in cases where empty results could indicate a
+    failure). It does not assess the correctness or the tool result itself, like mathematical errors and
+    unrealistic field values like name="668656".
+
+    Scoring is binary:
+    - TRUE: All tool calls were successful
+    - FALSE: At least one tool call failed
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START tool_success_evaluator]
+            :end-before: [END tool_success_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a _ToolSuccessEvaluator with a tool definitions and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START tool_success_evaluator]
+            :end-before: [END tool_success_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a _ToolSuccessEvaluator using Azure AI Project URL in the following
+                format https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    """
+
+    _PROMPTY_FILE = "tool_success.prompty"
+    _RESULT_KEY = "tool_success"
+    _OPTIONAL_PARAMS = ["tool_definitions"]
+
+    id = "azureai://built-in/evaluators/tool_success"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, credential=None, **kwargs):
+        """Initialize the Tool Success evaluator."""
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=1,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        response: Union[str, List[dict]],
+        tool_definitions: Optional[Union[dict, List[dict]]] = None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate tool call success for a given response, and optionally tool definitions.
+
+        Example with list of messages:
+        evaluator = _ToolSuccessEvaluator(model_config)
+        response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant',
+        'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
+
+        result = evaluator(response=response, )
+
+        :keyword response: The response being evaluated, either a string or a list of messages (full agent
+            response potentially including tool calls)
+        :paramtype response: Union[str, List[dict]]
+        :keyword tool_definitions: Optional tool definitions to use for evaluation.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :return: A dictionary with the tool success evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invoke the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # type: ignore[override]
+        """Do Tool Success evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are
+            needed for the _flow method
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        if "response" not in eval_input:
+            raise EvaluationException(
+                message="response is a required input to the Tool Success evaluator.",
+                internal_message="response is a required input to the Tool Success evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_SUCCESS_EVALUATOR,
+            )
+        if eval_input["response"] is None or eval_input["response"] == []:
+            raise EvaluationException(
+                message="response cannot be None or empty for the Tool Success evaluator.",
+                internal_message="response cannot be None or empty for the Tool Success evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.TOOL_SUCCESS_EVALUATOR,
+            )
+
+        eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
+
+        if "tool_definitions" in eval_input:
+            tool_definitions = eval_input["tool_definitions"]
+            filtered_tool_definitions = _filter_to_used_tools(
+                tool_definitions=tool_definitions,
+                msgs_list=eval_input["response"],
+                logger=logger,
+            )
+            eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", "")
+
+        if isinstance(llm_output, dict):
+            success = llm_output.get("success", False)
+            if isinstance(success, str):
+                success = success.upper() == "TRUE"
+
+            success_result = "pass" if success else "fail"
+            reason = llm_output.get("explanation", "")
+            return {
+                f"{self._result_key}": success * 1.0,
+                f"{self._result_key}_result": success_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": f"{reason} {llm_output.get('details', '')}",
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+            }
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        score = math.nan
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
+
+
+def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
+    """Filter the tool definitions to only include those that were actually used in the messages lists."""
+    try:
+        used_tool_names = set()
+        any_tools_used = False
+
+        for msg in msgs_list:
+            if msg.get("role") == "assistant" and "content" in msg:
+                for content in msg.get("content", []):
+                    if content.get("type") == "tool_call":
+                        any_tools_used = True
+                        if "tool_call" in content and "function" in content["tool_call"]:
+                            used_tool_names.add(content["tool_call"]["function"])
+                        elif "name" in content:
+                            used_tool_names.add(content["name"])
+
+        filtered_tools = [tool for tool in tool_definitions if tool.get("name") in used_tool_names]
+        if any_tools_used and not filtered_tools:
+            if logger:
+                logger.warning("No tool definitions matched the tools used in the messages. Returning original list.")
+            filtered_tools = tool_definitions
+
+        return filtered_tools
+    except Exception as e:
+        if logger:
+            logger.warning(f"Failed to filter tool definitions, returning original list. Error: {e}")
+        return tool_definitions
+
+
+def _get_tool_calls_results(agent_response_msgs):
+    """Extract formatted agent tool calls and results from response."""
+    agent_response_text = []
+    tool_results = {}
+
+    # First pass: collect tool results
+
+    for msg in agent_response_msgs:
+        if msg.get("role") == "tool" and "tool_call_id" in msg:
+            for content in msg.get("content", []):
+                if content.get("type") == "tool_result":
+                    result = content.get("tool_result")
+                    tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+
+    # Second pass: parse assistant messages and tool calls
+    for msg in agent_response_msgs:
+        if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
+
+            for content in msg.get("content", []):
+
+                if content.get("type") == "tool_call":
+                    if "tool_call" in content and "function" in content.get("tool_call", {}):
+                        tc = content.get("tool_call", {})
+                        func_name = tc.get("function", {}).get("name", "")
+                        args = tc.get("function", {}).get("arguments", {})
+                        tool_call_id = tc.get("id")
+                    else:
+                        tool_call_id = content.get("tool_call_id")
+                        func_name = content.get("name", "")
+                        args = content.get("arguments", {})
+                    args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
+                    call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                    agent_response_text.append(call_line)
+                    if tool_call_id in tool_results:
+                        agent_response_text.append(tool_results[tool_call_id])
+
+    return agent_response_text
+
+
+def _reformat_tool_calls_results(response, logger=None):
+    try:
+        if response is None or response == []:
+            return ""
+        agent_response = _get_tool_calls_results(response)
+        if agent_response == []:
+            # If no message could be extracted, likely the format changed,
+            # fallback to the original response in that case
+            if logger:
+                logger.warning(
+                    f"Empty agent response extracted, likely due to input schema change. "
+                    f"Falling back to using the original response: {response}"
+                )
+            return response
+        return "\n".join(agent_response)
+    except Exception:
+        # If the agent response cannot be parsed for whatever
+        # reason (e.g. the converter format changed), the original response is returned
+        # This is a fallback to ensure that the evaluation can still proceed.
+        # See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
+        return response
+
+
+def _reformat_tool_definitions(tool_definitions, logger=None):
+    try:
+        output_lines = ["TOOL_DEFINITIONS:"]
+        for tool in tool_definitions:
+            name = tool.get("name", "unnamed_tool")
+            desc = tool.get("description", "").strip()
+            params = tool.get("parameters", {}).get("properties", {})
+            param_names = ", ".join(params.keys()) if params else "no parameters"
+            output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
+        return "\n".join(output_lines)
+    except Exception:
+        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
+        # This is a fallback to ensure that the evaluation can still proceed.
+        # See comments on reformat_conversation_history for more details.
+        if logger:
+            logger.warning(
+                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
+            )
+        return tool_definitions
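For orientation, a minimal usage sketch of the new evaluator follows. It is assembled from the call signature and message parsing shown in the diff above; the import path, the placeholder configuration values, and the exact message shape are assumptions made for illustration. The evaluator is underscore-prefixed and marked `@experimental`, so it is not part of the stable public surface.

```python
# Hypothetical usage sketch for the new _ToolSuccessEvaluator (experimental, private API).
# The import path and the message shape are inferred from this diff, not documented API.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._tool_success._tool_success import _ToolSuccessEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_deployment="<your-deployment>",                       # placeholder
    api_key="<your-api-key>",                                   # placeholder
)

evaluator = _ToolSuccessEvaluator(model_config)

# One assistant tool call plus its matching tool-result message, shaped the way
# _get_tool_calls_results expects (type / tool_call_id / name / arguments).
response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "get_weather_info",
                "arguments": {"city": "London"},
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "the temperature is 23 C and it is cloudy"}],
    },
]

tool_definitions = [
    {
        "name": "get_weather_info",
        "description": "Returns today's weather for the specified city.",
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    }
]

result = evaluator(response=response, tool_definitions=tool_definitions)
print(result["tool_success"], result["tool_success_result"], result["tool_success_reason"])
```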
azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty (new file)
@@ -0,0 +1,321 @@
+---
+name: Tool Success
+description: Evaluates whether a Tool call was successful or resulted in a technical error
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 1500
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: json_object
+inputs:
+  tool_calls:
+    type: List
+  tool_definitions:
+    type: Dict
+---
+system:
+
+You are an expert evaluator with strong software development background. You are required to extract the tool result for every tool call then decide for each tool result whether it indicates that the tool succeeded or failed
+
+user:
+ROLE
+====
+You are a judge on tool call success who assesses **each tool call made by an AI agent and decide if the result of the tool call indicates a success or failure**. You only care about technical errors , failures and exceptions , not the business correctness of the tool implementation.
+
+You are NOT evaluating:
+- The parameters passed to the tool
+- The rationale behind choosing this tool
+- Whether the successfully returned result from the tool is correct or not business-wise given the tool name and definition
+
+You **ARE ONLY** evaluating:
+-Whether tool results indicate the presence of a technical error
+
+**INPUT**
+=====
+TOOL_DEFINITIONS: {{tool_definitions}}
+TOOL_CALLS: {{tool_calls}}
+
+
+
+TOOL_CALLS is a list of tool calls that were produced by the AI agent. It includes calls together with the result of every tool call.
+TOOL_DEFINITIONS is a list of definitions for the tools that were called. This definition can contain a description of functionality provided by the tool, the parameters that the tool accepts and the expected return of the tool. This definition can contribute to the assessment of whether a tool call succeeded or failed.
+
+
+EVALUATION FRAMEWORK
+====================
+
+A. Iterate on the list of tool calls
+B. Examine tool result and definition for the tool being called to check whether the call **succeeded** or **failed** as in the following steps explain:
+1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it:
+ERROR-CASES:
+===========
+- The tool call resulted in an error or exception
+- The tool call failed to run or failed to return
+- The tool call returned a result that indicates an error or failure
+- The tool call returned an object or JSON string that has one or more of its fields indicating an error
+- The tool timed-out or returned a result that indicate a time-out
+- The tool result does not make sense, from technical perspective, not business perspective, given the definition of that tool, if the definition is present
+2. If none of the error cases apply to the tool result , it is considered **succeeded** even if the tool result itself indicates a business mistake
+C. If one or more tool result are **failed** , then you the **evaluation process** has **failed**, otherwise , the **evaluation process** has **succeeded**
+D. You are required to return your **output** in the following format:
+{
+"explanation": "<15-60 words explaining the logic flow of your decision>",
+"details": {
+"failed_tools": "<comma-separated list for the tools that has failed results if any>",
+},
+"success": <True or False based on whether the **evaluation process** has **succeeded** or **failed**>
+}
+E. If no tool calls found at all , the TOOL_CALLS input is empty or the TOOL_CALLS input is not passed , the **evaluation process** has **succeeded**
+
+
+## Successful Evaluation Process Examples
+========================================
+
+### Example - Succeeded
+
+[TOOL_CALLS]
+[TOOL_CALL] get_account_balances(user_id="USER456")
+[TOOL_RESULT] {'accounts': [{'account_id': 'CHK001', 'type': 'checking', 'balance': 1250.75}, {'account_id': 'SAV001', 'type': 'savings', 'balance': 3400.20}]}
+[TOOL_CALL] get_weather_info()
+[TOOL_RESULT] "the temperature is 23 C and it is cloudy"
+
+EXPECTED OUTPUT
+{
+"explanation": "None of the results indicate an error",
+"details": {
+"failed_tools": "",
+},
+"success": True
+}
+
+### Example - Succeeded
+
+[TOOL_CALLS]
+[TOOL_CALL] get_employee_info(employee_id="EMP2568")
+[TOOL_RESULT] {"name":"David", "Age":32}
+
+EXPECTED OUTPUT
+{
+"explanation": "None of the results indicate an error",
+"details": {
+"failed_tools": "",
+},
+"success": True
+}
+
+### Example - Succeeded
+
+[TOOL_DEFINITIONS] [get_sqrt] gets the square root of the input parameter
+[TOOL_CALLS]
+[TOOL_CALL] get_sqrt(4)
+[TOOL_RESULT] {"value":7}
+
+EXPECTED OUTPUT
+{
+"explanation": "Although the returned value 7 is not the square root of 4, this is a business mistake in the tool. The tool did not return a result indicating a technical error",
+"details": {
+"failed_tools": "",
+},
+"success": True
+}
+
+
+### Example - Succeeded
+
+[TOOL_DEFINITIONS] [get_blocked_usernames] gets comma-separated list of usernames for blocked users
+[TOOL_CALLS]
+[TOOL_CALL] get_blocked_usernames()
+[TOOL_RESULT] "david33;amr_master;phantom5"
+
+EXPECTED OUTPUT
+{
+"explanation": "The tool returned a semicolon separated list of names. Although the description in the definition says it should return comma-separated list , this formatting mistake is a business mistake of the tool, not a technical failure. The tool did not return an error",
+"details": {
+"failed_tools": "",
+},
+"success": True
+}
+
+
+
+### Example - Succeeded
+
+[TOOL_DEFINITIONS] [update_user_email] Updates the email of the given user id to the new email specified in the parameters
+[TOOL_CALLS]
+[TOOL_CALL] update_user_email(userId:2251 , newEmail:"david235@mydomain.com")
+[TOOL_RESULT] {}
+
+EXPECTED OUTPUT
+{
+"explanation": "The tool returned empty response which is accepted given that this tool functionality does not include returning data to the caller",
+"details": {
+"failed_tools": "",
+},
+"success": True
+}
+
+
+## Failed Evaluation Process Examples
+========================================
+
+### Example - Failed
+
+[TOOL_DEFINITIONS] [get_weather_info] return today's the weather information of the specified city
+[TOOL_CALLS]
+[TOOL_CALL] get_weather_info(city:"London")
+[TOOL_RESULT] ""
+
+EXPECTED OUTPUT
+{
+"explanation": "The tool returned empty response , however , given the tool definition , it should never return empty response because there should be weather info at any given point in time. An empty response here is considered a technical failure. The conclusion is the get_weather_info failed",
+"details": {
+"failed_tools": "get_weather_info",
+},
+"success": False
+}
+
+
+### Example - Failed
+
+[TOOL_CALLS]
+[TOOL_CALL] get_current_user_Info()
+[TOOL_RESULT] "failed to get current user information"
+
+EXPECTED OUTPUT
+{
+"explanation": "The tool returned a string indicating that it failed",
+"details": {
+"failed_tools": "get_current_user_Info",
+},
+"success": False
+}
+
+
+### Example - Failed
+
+[TOOL_CALLS]
+[TOOL_CALL] get_current_user_Info()
+[TOOL_RESULT] {"UserName":"", "UserEmail":"", "Message":"failed to get current user information"}
+
+EXPECTED OUTPUT
+{
+"explanation": "The tool returned an object with empty fields and a string indicating that it failed",
+"details": {
+"failed_tools": "get_current_user_Info",
+},
+"success": False
+}
+
+
+### Example - Failed
+
+[TOOL_CALLS]
+[TOOL_CALL] GetWeatherInfo()
+[TOOL_RESULT] {temp:""}
+
+EXPECTED OUTPUT
+{
+"explanation": "The call for GetWeatherInfo returned an object containing single property 'temp' that is an empty string. This means the call to GetWeatherInfo returned empty result while weather info should be available at any time",
+"details": {
+"failed_tools": "GetWeatherInfo",
+},
+"success": False
+}
+
+
+### Example - Failed
+
+[TOOL_CALLS]
+[TOOL_CALL] get_day_of_week(date:"1/1/2023")
+[TOOL_RESULT] time out
+
+EXPECTED OUTPUT
+{
+"explanation": "the returned result indicates that the call to get_day_of_week timed out",
+"details": {
+"failed_tools": "get_day_of_week",
+},
+"success": False
+}
+
+
+
+### Example - Failed
+
+[TOOL_DEFINITIONS] [get_day_of_week] Takes date as an input and returns the day of week that this day represents
+[TOOL_CALLS]
+[TOOL_CALL] get_day_of_week(date:"1/1/2023")
+[TOOL_RESULT] null
+
+EXPECTED OUTPUT
+{
+"explanation": "null indicates an empty result which cannot be an accepted output of the tool given the tool definition since any given date represents a day of week",
+"details": {
+"failed_tools": "get_day_of_week",
+},
+"success": False
+}
+
+
+
+### Example - Failed
+
+[TOOL_DEFINITIONS] [get_day_of_week] Takes date as an input and returns the day of week that this day represents
+[TOOL_CALLS]
+[TOOL_CALL] get_day_of_week(date:"1/1/2023")
+[TOOL_RESULT] {}
+
+EXPECTED OUTPUT
+{
+"explanation": "Empty object cannot be an accepted output of the tool given the tool definition since any given date should represent a day of week",
+"details": {
+"failed_tools": "get_day_of_week",
+},
+"success": False
+}
+
+
+
+### Example - Failed
+
+[TOOL_CALLS]
+[TOOL_CALL] GetWeatherInfo()
+[TOOL_RESULT] {temp:"" }
+[TOOL_CALL] BookTicket(flightId:"FL23" , Seat:"A17")
+[TOOL_RESULT] "Failed to book the ticket"
+
+EXPECTED OUTPUT
+{
+"explanation": "GetWeatherInfo returned an empty response while it should return the weather info and BookTicket returned an error.Both tools failed.",
+"details": {
+"failed_tools": "GetWeatherInfo,BookTicket",
+},
+"success": False
+}
+
+
+### Example - Failed
+
+[TOOL_CALLS]
+[TOOL_CALL] GetWeatherInfo()
+[TOOL_RESULT] {temp:"23 C" }
+[TOOL_CALL] BookTicket(flightId:"FL23" , Seat:"A17")
+[TOOL_RESULT] "Failed to book the ticket"
+
+EXPECTED OUTPUT
+{
+"explanation": "Although GetWeatherInfo succeeded, BookTicket returned an error. The final result is failure because one of the tool calls has failed",
+"details": {
+"failed_tools": "BookTicket",
+},
+"success": False
+}
+
+
+
+Now given the **INPUT** you received generate the output
+# Output
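Before this prompty runs, `_do_eval` flattens the agent messages and tool definitions into the `{{tool_calls}}` and `{{tool_definitions}}` strings via the private helpers shown in the Python diff above. The snippet below is a self-contained sketch of that flattening on sample data; it restates the helper logic rather than importing the private functions, and the sample tool and messages are invented for illustration.

```python
# Standalone illustration of the flattened text the prompty receives as
# {{tool_calls}} and {{tool_definitions}}. Mirrors _get_tool_calls_results /
# _reformat_tool_definitions from the diff above; does not import the private helpers.
messages = [
    {
        "role": "assistant",
        "content": [
            {"type": "tool_call", "tool_call_id": "call_1",
             "name": "get_weather_info", "arguments": {"city": "London"}},
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "the temperature is 23 C and it is cloudy"}],
    },
]
tool_definitions = [
    {"name": "get_weather_info",
     "description": "Returns today's weather for the specified city.",
     "parameters": {"properties": {"city": {"type": "string"}}}},
]

# Pair each tool result with its call via tool_call_id, then flatten calls in order.
results = {
    msg["tool_call_id"]: f"[TOOL_RESULT] {part['tool_result']}"
    for msg in messages if msg.get("role") == "tool" and "tool_call_id" in msg
    for part in msg.get("content", []) if part.get("type") == "tool_result"
}
lines = []
for msg in messages:
    if msg.get("role") == "assistant":
        for part in msg.get("content", []):
            if part.get("type") == "tool_call":
                args = ", ".join(f'{k}="{v}"' for k, v in part.get("arguments", {}).items())
                lines.append(f"[TOOL_CALL] {part.get('name', '')}({args})")
                if part.get("tool_call_id") in results:
                    lines.append(results[part["tool_call_id"]])

tool_calls_text = "\n".join(lines)
tool_definitions_text = "\n".join(
    ["TOOL_DEFINITIONS:"]
    + [
        f"- {t.get('name', 'unnamed_tool')}: {t.get('description', '').strip()} "
        f"(inputs: {', '.join(t.get('parameters', {}).get('properties', {}).keys()) or 'no parameters'})"
        for t in tool_definitions
    ]
)
print(tool_calls_text)
# [TOOL_CALL] get_weather_info(city="London")
# [TOOL_RESULT] the temperature is 23 C and it is cloudy
print(tool_definitions_text)
# TOOL_DEFINITIONS:
# - get_weather_info: Returns today's weather for the specified city. (inputs: city)
```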
azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py
@@ -33,25 +33,6 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
 
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START ungrounded_attributes_evaluator]
-            :end-before: [END ungrounded_attributes_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call a UngroundedAttributesEvaluator with a query, response and context.
-
-    .. admonition:: Example using Azure AI Project URL:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START ungrounded_attributes_evaluator]
-            :end-before: [END ungrounded_attributes_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
-            https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
     .. note::
 
         If this evaluator is supplied to the `evaluate` function, the metric
azure/ai/evaluation/_exceptions.py
@@ -78,14 +78,16 @@ class ErrorTarget(Enum):
     ECI_EVALUATOR = "ECIEvaluator"
     F1_EVALUATOR = "F1Evaluator"
     GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
+    TASK_NAVIGATION_EFFICIENCY_EVALUATOR = "_TaskNavigationEfficiencyEvaluator"
     PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator"
     INTENT_RESOLUTION_EVALUATOR = "IntentResolutionEvaluator"
     RELEVANCE_EVALUATOR = "RelevanceEvaluator"
     SIMILARITY_EVALUATOR = "SimilarityEvaluator"
     FLUENCY_EVALUATOR = "FluencyEvaluator"
     RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
+    TOOL_SUCCESS_EVALUATOR = "_ToolSuccessEvaluator"
     TASK_ADHERENCE_EVALUATOR = "TaskAdherenceEvaluator"
-
+    TASK_COMPLETION_EVALUATOR = "_TaskCompletionEvaluator"
     INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
     INDIRECT_ATTACK_SIMULATOR = "IndirectAttackSimulator"
     ADVERSARIAL_SIMULATOR = "AdversarialSimulator"
@@ -96,9 +98,12 @@ class ErrorTarget(Enum):
     UNKNOWN = "Unknown"
     CONVERSATION = "Conversation"
     TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
+    TOOL_SELECTION_EVALUATOR = "_ToolSelectionEvaluator"
+    TOOL_INPUT_ACCURACY_EVALUATOR = "_ToolInputAccuracyEvaluator"
     RED_TEAM = "RedTeam"
     AOAI_GRADER = "AoaiGrader"
     CONVERSATION_HISTORY_PARSING = "_get_conversation_history"
+    TOOL_OUTPUT_UTILIZATION_EVALUATOR = "ToolOutputUtilizationEvaluator"
 
 
 class EvaluationException(AzureError):
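The new `ErrorTarget` members above are the `target` values the corresponding new evaluators attach to their validation errors. For reference, this is how `_ToolSuccessEvaluator._do_eval` (earlier in this diff) raises when `response` is missing; the snippet simply mirrors that call, and `azure.ai.evaluation._exceptions` is a private module shown here only for illustration.

```python
# Mirrors the validation error raised in _ToolSuccessEvaluator._do_eval (see the diff above);
# illustrates how the new ErrorTarget members are used as the `target` of EvaluationException.
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

raise EvaluationException(
    message="response is a required input to the Tool Success evaluator.",
    internal_message="response is a required input to the Tool Success evaluator.",
    blame=ErrorBlame.USER_ERROR,
    category=ErrorCategory.MISSING_FIELD,
    target=ErrorTarget.TOOL_SUCCESS_EVALUATOR,
)
```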