azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as published in those registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -0,0 +1,263 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import logging
+import math
+import json
+from typing import Dict, List, Union, TypeVar, Optional, cast
+from typing_extensions import override
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
+from ..._common.utils import reformat_conversation_history, _get_agent_response
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+
+@experimental
+class _ToolInputAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Input Accuracy evaluator performs a strict binary evaluation (PASS/FAIL) of parameters
+    passed to tool calls. It ensures that ALL parameters meet ALL criteria:
+
+    - Parameter grounding: All parameters must be derived from conversation history/query
+    - Type compliance: All parameters must match exact types specified in tool definitions
+    - Format compliance: All parameters must follow exact format and structure requirements
+    - Completeness: All required parameters must be provided
+    - No unexpected parameters: Only defined parameters are allowed
+
+    The evaluator uses strict binary evaluation:
+    - 1: Only when ALL criteria are satisfied perfectly for ALL parameters
+    - 0: When ANY criterion fails for ANY parameter
+
+    This evaluation focuses on ensuring tool call parameters are completely correct without any tolerance
+    for partial correctness.
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START tool_input_accuracy_evaluator]
+            :end-before: [END tool_input_accuracy_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a _ToolInputAccuracyEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START tool_input_accuracy_evaluator]
+            :end-before: [END tool_input_accuracy_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call _ToolInputAccuracyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. note::
+
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    """
+
+    _PROMPTY_FILE = "tool_input_accuracy.prompty"
+    _RESULT_KEY = "tool_input_accuracy"
+
+    _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
+    _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
+    _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
+
+    def __init__(
+        self,
+        model_config,
+        *,
+        credential=None,
+        **kwargs,
+    ):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=1,
+            credential=credential,
+            **kwargs,
+        )
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        """Convert kwargs to evaluation input format.
+
+        :keyword kwargs: The inputs to convert.
+        :type kwargs: Dict
+        :return: The formatted evaluation input.
+        :rtype: Dict
+        """
+        # Collect inputs
+        tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+
+        # Extract tool calls from response
+        if not response:
+            return {"error_message": "Response parameter is required to extract tool calls."}
+
+        tool_calls = self._parse_tools_from_response(response)
+        if not tool_calls:
+            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+
+        if not isinstance(tool_calls, list):
+            tool_calls = [tool_calls]
+        if not isinstance(tool_definitions, list):
+            tool_definitions = [tool_definitions] if tool_definitions else []
+
+        try:
+            # Type cast to satisfy static type checker
+            tool_calls_typed = cast(List[Dict], tool_calls)
+            needed_tool_definitions = self._extract_needed_tool_definitions(
+                tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
+            )
+        except EvaluationException as e:
+            # Check if this is because no tool definitions were provided at all
+            if len(tool_definitions) == 0:
+                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+            else:
+                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
+        if len(needed_tool_definitions) == 0:
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+
+        # Get agent response with tool calls and results using _get_agent_response
+        agent_response_with_tools = _get_agent_response(response, include_tool_messages=True)
+
+        return {
+            "query": query,
+            "tool_calls": agent_response_with_tools,
+            "tool_definitions": needed_tool_definitions,
+        }
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+        """Do Tool Input Accuracy evaluation.
+
+        :param eval_input: The input to the evaluator.
+        :type eval_input: Dict
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        # Format conversation history for cleaner evaluation
+        if "query" in eval_input:
+            eval_input["query"] = reformat_conversation_history(
+                eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
+            )
+
+        # Call the LLM to evaluate
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", {})
+
+        if isinstance(llm_output, dict):
+            result = llm_output.get("result", None)
+            if result not in [0, 1]:
+                raise EvaluationException(
+                    message=f"Invalid result value: {result}. Expected 0 or 1.",
+                    internal_message="Invalid result value.",
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    blame=ErrorBlame.SYSTEM_ERROR,
+                )
+
+            # Add parameter extraction accuracy post-processing
+            details = llm_output.get("details", {})
+            if details:
+                parameter_extraction_accuracy = self._calculate_parameter_extraction_accuracy(details)
+                details["parameter_extraction_accuracy"] = parameter_extraction_accuracy
+
+            # Format the output
+            explanation = llm_output.get("chain_of_thought", "")
+            score_result = "pass" if result == 1 else "fail"
+            response_dict = {
+                self._result_key: result,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": explanation,
+                f"{self._result_key}_details": details,
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+            }
+            return response_dict
+
+        else:
+            raise EvaluationException(
+                message="Tool input accuracy evaluator returned invalid output.",
+                blame=ErrorBlame.SYSTEM_ERROR,
+                category=ErrorCategory.FAILED_EXECUTION,
+                target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
+            )
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        eval_input = self._convert_kwargs_to_eval_input(**kwargs)
+        if isinstance(eval_input, dict) and eval_input.get("error_message"):
+            # If there is an error message, return not applicable result
+            error_message = eval_input.get("error_message", "Unknown error")
+            return self._not_applicable_result(error_message, 1)
+        # Do the evaluation
+        result = await self._do_eval(eval_input)
+        # Return the result
+        return result
+
+    def _calculate_parameter_extraction_accuracy(self, details):
+        """Calculate parameter extraction accuracy from the evaluation details.
+
+        :param details: The details dictionary from the LLM evaluation output
+        :type details: Dict
+        :return: Parameter extraction accuracy as a percentage
+        :rtype: float
+        """
+        total_parameters = details.get("total_parameters_passed", 0)
+        correct_parameters = details.get("correct_parameters_passed", 0)
+
+        if total_parameters == 0:
+            return 100.0  # If no parameters were passed, accuracy is 100%
+
+        accuracy = (correct_parameters / total_parameters) * 100
+        return round(accuracy, 2)
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate parameter correctness of tool calls.
+
+        :keyword query: Query or Chat history up to the message that has the tool call being evaluated.
+        :paramtype query: Union[str, List[dict]]
+        :keyword tool_definitions: List of tool definitions whose calls are being evaluated.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :keyword response: Response containing tool calls to be evaluated.
+        :paramtype response: Union[str, List[dict]]
+        :return: The tool input accuracy evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return super().__call__(*args, **kwargs)
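
For orientation, a minimal usage sketch of the new evaluator, assuming the import path implied by the package layout above and a placeholder Azure OpenAI deployment; the keyword arguments and output keys follow the `__call__` docstring and `_do_eval` shown in the hunk, while the endpoint, deployment, and sample messages are illustrative only.

```python
# Minimal sketch (not part of the package diff): exercising _ToolInputAccuracyEvaluator as documented above.
# Endpoint, deployment, key, and the sample messages are illustrative placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="<your-deployment>",
    api_key="<your-api-key>",
)

evaluator = _ToolInputAccuracyEvaluator(model_config)

# query: chat history up to the tool call; response: assistant turn containing the tool call.
query = [{"role": "user", "content": "What's the weather in Seattle?"}]
response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call": {
                    "id": "call_1",
                    "type": "function",
                    "function": {"name": "get_weather", "arguments": {"city": "Seattle"}},
                },
            }
        ],
    }
]
tool_definitions = [
    {
        "name": "get_weather",
        "description": "Get the weather for a city.",
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    }
]

result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
print(result["tool_input_accuracy"])          # 1 (pass) or 0 (fail), compared against threshold=1
print(result["tool_input_accuracy_result"])   # "pass" / "fail"
print(result["tool_input_accuracy_details"])  # grader details plus parameter_extraction_accuracy
```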
azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty
@@ -0,0 +1,76 @@
+---
+name: Tool Input Accuracy
+description: Evaluates the accuracy of all inputs/parameters passed to the tools by the agent
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 1000
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: json_object
+
+inputs:
+  query:
+    type: List
+  tool_calls:
+    type: List
+  tool_definitions:
+    type: Dict
+---
+
+# system:
+You are an AI system designed to evaluate the correctness of parameters passed to tool calls. Your task is to perform a strict binary evaluation (PASS/FAIL) based on whether ALL parameters are correct.
+
+The evaluation must check ALL of the following criteria. If ANY criterion fails, the overall result is FAIL:
+1. **Parameter Groundedness**: ALL parameters must be derived from or supported by information in the conversation history/query. NO fabricated or unsupported values.
+2. **Type Compliance**: ALL parameters must match the exact type specified in the tool definitions (string, number, boolean, array, object, etc.).
+3. **Format Compliance**: ALL parameters must follow the exact format, structure, and constraints specified in the tool definitions.
+4. **Required Parameters**: ALL required parameters must be provided. Missing any required parameter results in FAIL.
+5. **Unexpected Parameters**: NO parameters should be provided that are not defined in the tool definition. Any extra/unexpected parameters result in FAIL.
+6. **Value Appropriateness**: ALL parameter values must be contextually appropriate and meaningful for the tool's purpose.
+
+## Evaluation Rules
+
+**PASS**: Only when ALL criteria above are satisfied perfectly. Every single parameter must be:
+- Properly grounded in conversation history/query
+- Correct type according to tool definition
+- Proper format and structure
+- Required parameters all present
+- No unexpected/undefined parameters
+- Contextually appropriate values
+
+**FAIL**: When ANY of the above criteria fails, including:
+- Any parameter lacks grounding in conversation history
+- Any parameter has wrong type
+- Any parameter has wrong format/structure
+- Any required parameter is missing
+- Any unexpected parameter is present
+- Any parameter value is inappropriate for the context
+
+## Task
+Analyze each tool call and its parameters against the provided tool definitions and conversation context. Provide your evaluation in the following JSON format:
+
+{
+  "chain_of_thought": "Step-by-step analysis for all parameters passed to all the tools to check for the criteria mentioned above",
+  "details": {
+    "total_parameters_passed": <number of total parameters that were passed to all tools>,
+    "correct_parameters_passed": <number of correct parameters that were passed to all tools in the agent's response>,
+    "incorrect_parameters": ["list of incorrect parameters passed with reasons"]
+  },
+  "result": <0 for FAIL, 1 for PASS>
+}
+
+
+## Conversation History/Query:
+{{query}}
+
+## Tool Calls Made:
+{{tool_calls}}
+
+## Tool Definitions:
+{{tool_definitions}}
+
+# Output
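
The `details` object in the JSON schema above is what `_calculate_parameter_extraction_accuracy` in `_tool_input_accuracy.py` post-processes into a percentage. A small self-contained sketch of that arithmetic, with made-up sample values:

```python
# Standalone sketch of the post-processing applied to the grader's "details" payload
# (mirrors _calculate_parameter_extraction_accuracy above); the sample values are invented.
def parameter_extraction_accuracy(details: dict) -> float:
    total = details.get("total_parameters_passed", 0)
    correct = details.get("correct_parameters_passed", 0)
    if total == 0:
        return 100.0  # no parameters passed counts as fully accurate
    return round((correct / total) * 100, 2)

sample_details = {
    "total_parameters_passed": 3,
    "correct_parameters_passed": 2,
    "incorrect_parameters": ["order_id: value not grounded in the conversation"],
}
print(parameter_extraction_accuracy(sample_details))  # 66.67
```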
azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py
@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._tool_output_utilization import _ToolOutputUtilizationEvaluator
+
+__all__ = ["_ToolOutputUtilizationEvaluator"]
azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
@@ -0,0 +1,225 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import math
+import logging
+from typing import Dict, Union, List, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import (
+    EvaluationException,
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+)
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from ..._common.utils import (
+    reformat_conversation_history,
+    reformat_agent_response,
+    reformat_tool_definitions,
+    filter_to_used_tools,
+)
+from azure.ai.evaluation._model_configurations import Message
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+
+@experimental
+class _ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Output Utilization Evaluator assesses how effectively an AI agent utilizes the outputs from tools and whether it accurately incorporates this information into its responses.
+
+    Scoring is based on two levels:
+    1. Pass - The agent effectively utilizes tool outputs and accurately incorporates the information into its response.
+    2. Fail - The agent fails to properly utilize tool outputs or incorrectly incorporates the information into its response.
+
+    The evaluation includes the score, a brief explanation, and a final pass/fail result.
+
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START tool_output_utilization_evaluator]
+            :end-before: [END tool_output_utilization_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a _ToolOutputUtilizationEvaluator with a query and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START tool_output_utilization_evaluator]
+            :end-before: [END tool_output_utilization_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call _ToolOutputUtilizationEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    """
+
+    _PROMPTY_FILE = "tool_output_utilization.prompty"
+    _RESULT_KEY = "tool_output_utilization"
+    _OPTIONAL_PARAMS = ["tool_definitions"]
+
+    _DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE = 1
+
+    id = "azureai://built-in/evaluators/tool_output_utilization"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(
+        self,
+        model_config,
+        *,
+        credential=None,
+        **kwargs,
+    ):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=self._DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Union[dict, List[dict]],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate tool output utilization for a given query, response, and optional tool definitions.
+        The query and response can be either a string or a list of messages.
+
+
+        Example with string inputs and no tools:
+            evaluator = _ToolOutputUtilizationEvaluator(model_config)
+            query = "What is the weather today?"
+            response = "The weather is sunny."
+
+            result = evaluator(query=query, response=response)
+
+        Example with list of messages:
+            evaluator = _ToolOutputUtilizationEvaluator(model_config)
+            query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+            tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]
+
+            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        :keyword query: The query being evaluated, either a string or a list of messages.
+        :paramtype query: Union[str, List[dict]]
+        :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls)
+        :paramtype response: Union[str, List[dict]]
+        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :return: A dictionary with the tool output utilization evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do Tool Output Utilization evaluation.
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # we override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py
+        if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input):
+            raise EvaluationException(
+                message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.",
+                internal_message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
+            )
+
+        tool_definitions = eval_input["tool_definitions"]
+        filtered_tool_definitions = filter_to_used_tools(
+            tool_definitions=tool_definitions,
+            msgs_lists=[eval_input["query"], eval_input["response"]],
+            logger=logger,
+        )
+        eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
+
+        eval_input["query"] = reformat_conversation_history(
+            eval_input["query"],
+            logger,
+            include_system_messages=True,
+            include_tool_messages=True,
+        )
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", "")
+        if isinstance(llm_output, dict):
+            output_label = llm_output.get("label", None)
+            if output_label is None:
+                if logger:
+                    logger.warning("LLM output does not contain 'label' key, returning NaN for the score.")
+                output_label = "fail"
+
+            output_label = output_label.lower()
+            if output_label not in ["pass", "fail"]:
+                if logger:
+                    logger.warning(
+                        f"LLM output label is not 'pass' or 'fail' (got '{output_label}'), returning NaN for the score."
+                    )
+
+            score = 1.0 if output_label == "pass" else 0.0
+            score_result = output_label
+            reason = llm_output.get("reason", "")
+
+            faulty_details = llm_output.get("faulty_details", [])
+            if faulty_details:
+                reason += " Issues found: " + "; ".join(faulty_details)
+
+            return {
+                f"{self._result_key}": score,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+            }
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        score = math.nan
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
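
For reference, a condensed sketch based on the docstring example in the hunk above, showing the expected call shape and result keys; the model configuration values and message payloads are placeholders, and the result keys follow the `_RESULT_KEY` formatting in `_do_eval`.

```python
# Condensed sketch (not part of the package diff); configuration values are placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="<your-deployment>",
    api_key="<your-api-key>",
)

evaluator = _ToolOutputUtilizationEvaluator(model_config)

query = [{"role": "user", "content": "What's the status of order 123 on account #888?"}]
response = [
    {"role": "assistant", "content": [{"type": "tool_call", "tool_call": {
        "id": "call_1", "type": "function",
        "function": {"name": "get_order", "arguments": {"order_id": "123"}}}}]},
    {"role": "tool", "tool_call_id": "call_1", "content": [{
        "type": "tool_result",
        "tool_result": '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]},
    {"role": "assistant", "content": [{"type": "text",
        "text": "Order 123 has shipped and should arrive on March 15, 2025."}]},
]
tool_definitions = [{
    "name": "get_order",
    "description": "Get the details of a specific order.",
    "parameters": {"type": "object", "properties": {"order_id": {"type": "string"}}},
}]

result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
print(result["tool_output_utilization"])         # 1.0 for "pass", 0.0 for "fail"
print(result["tool_output_utilization_result"])  # "pass" / "fail"
print(result["tool_output_utilization_reason"])  # grader explanation, plus any faulty_details
```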