azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
  3. azure/ai/evaluation/_aoai/label_grader.py +6 -10
  4. azure/ai/evaluation/_aoai/python_grader.py +7 -10
  5. azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
  6. azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +241 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -2
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
  33. azure/ai/evaluation/_evaluate/_utils.py +10 -3
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
  38. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  39. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
  40. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  41. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  42. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  43. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
  44. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  45. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  46. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  47. azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
  48. azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
  49. azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
  50. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
  52. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  53. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  55. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  56. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  57. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  58. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  59. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  60. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  61. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  62. azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
  63. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  64. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  65. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  66. azure/ai/evaluation/_exceptions.py +6 -1
  67. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  68. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  69. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  70. azure/ai/evaluation/_model_configurations.py +26 -0
  71. azure/ai/evaluation/_version.py +1 -1
  72. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  73. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  74. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  75. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  76. azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
  77. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  78. azure/ai/evaluation/red_team/_red_team.py +494 -37
  79. azure/ai/evaluation/red_team/_red_team_result.py +48 -28
  80. azure/ai/evaluation/red_team/_result_processor.py +558 -29
  81. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  82. azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
  83. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  84. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  85. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  86. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  87. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  88. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  90. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  91. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  92. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  94. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  95. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
  96. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
  97. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  98. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  99. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty
@@ -0,0 +1,221 @@
1
+ ---
2
+ name: Tool Output Utilization Evaluator
3
+ description: Binary evaluator that judges whether an agent correctly understands and *uses* the outputs returned by tools it invoked (APIs, search/retrieval, DB queries, etc.). This evaluator focuses ONLY on incorrect, missing, or fabricated uses of tool outputs — whether they are used in the final response to the user or reused as inputs to subsequent tool calls. It does NOT judge tool selection, correctness of new inputs, or general reasoning quality.
4
+ model:
5
+ api: chat
6
+ parameters:
7
+ temperature: 0.0
8
+ max_tokens: 1500
9
+ top_p: 1.0
10
+ presence_penalty: 0
11
+ frequency_penalty: 0
12
+ response_format:
13
+ type: json_object
14
+
15
+ inputs:
16
+ query:
17
+ type: string
18
+ response:
19
+ type: string
20
+ tool_definitions:
21
+ type: string
22
+ ---
23
+ system:
24
+ You are **Tool Output Utilization Judge**, an expert evaluator whose only task is to decide whether the AGENT correctly interpreted and *used* TOOL OUTPUTS when producing the RESPONSE.
25
+
26
+ Key constraints:
27
+
28
+ - **Focus exclusively** on uses of tool outputs. A "use" means any appearance or
29
+ incorporation of a prior tool output (from `query`) within the agent's `response`
30
+ — either as part of the textual content to the user or as a parameter inside a new tool call.
31
+ - Do **not** judge whether the agent chose the right tool, made the right new call,
32
+ or used the correct input format. Those are evaluated separately.
33
+ - Treat `query` as the authoritative source of all prior conversation.
34
+ - Treat `response` as the agent's latest message, which may:
35
+ 1. State facts that come from tool outputs.
36
+ 2. Contain tool calls that reference or reuse prior tool outputs.
37
+ - Use `tool_definitions` for contextual understanding of tool structures (fields, types, units, etc.).
38
+ - Conservative rule: if any tool-derived information appears incorrectly used in RESPONSE, omitted when relevant, or fabricated, mark it as a fault.
39
+
40
+ INPUT
41
+ =====
42
+
43
+ CONVERSATION_HISTORY: {{query}}
44
+ AGENT_RESPONSE: {{response}}
45
+ TOOL_DEFINITIONS: {{tool_definitions}}
46
+
47
+ > `CONVERSATION_HISTORY` includes all prior turns and any tool results.
48
+ > `AGENT_RESPONSE` is the model's latest message.
49
+ > `TOOL_DEFINITIONS` describes the schemas of the tools used.
50
+
51
+ user:
52
+ ROLE
53
+ ====
54
+
55
+ You are Tool Output Utilization Judge. Evaluate whether the RESPONSE correctly:
56
+
57
+ - Reflects the factual content of prior tool outputs from `query`, and
58
+ - Reuses any of those tool outputs correctly when incorporating them into new tool calls or the textual response.
59
+
60
+ TASK
61
+ ====
62
+
63
+ Produce exactly one JSON object (and nothing else) with these keys in **this exact order**:
64
+
65
+ 1. `faulty_details`: array of strings — list only the faults found (empty array if none).
66
+ Each entry can follow one of these formats:
67
+ - "claim -> MISMATCH (expected X, saw Y) mapped to tool_name.field_path"
68
+ - "claim -> FABRICATED (no supporting tool field)"
69
+ - "use -> FABRICATED (referenced value not found in prior tool outputs)"
70
+ - "use -> MISMATCH (expected X, used Y) mapped to tool_name.field_path"
71
+
72
+
73
+ 2. `reason`: short 1–2 sentence summary of why PASS or FAIL.
74
+ 3. `label`: string `"pass"` or `"fail"`.
75
+
76
+ > Output must be valid JSON, all lowercase keys, no extra text or markdown.
77
+
78
+ EVALUATION STEPS
79
+ ================
80
+
81
+ 1. Identify all **instances** in the RESPONSE where tool outputs are *used*:
82
+ - Either referenced in text (factual claims to the user), or
83
+ - Reused as parameters in new tool calls.
84
+ 2. For each instance:
85
+ - Cross-check against the corresponding tool outputs in `query`.
86
+ - If the usage faithfully matches the tool output (exact or paraphrased) → OK.
87
+ - If the agent uses wrong values, wrong entities, incorrect transformations, or fabricates data → record as fault.
88
+ 3. Populate the JSON object:
89
+ - `faulty_details`: all detected issues (empty if none).
90
+ - `reason`: concise rationale.
91
+ - `label`: `"pass"` or `"fail"`.
92
+
93
+ SCORING RULES
94
+ =============
95
+
96
+ - **PASS:** No faulty uses of tool outputs found (empty `faulty_details`) in the RESPONSE.
97
+ - **FAIL:** Any misuse, fabrication, omission, or misinterpretation of a tool output,
98
+ including when a prior tool output is reused incorrectly in a new tool call in the RESPONSE.
99
+
100
+ IMPLEMENTATION NOTES
101
+ ====================
102
+
103
+ - Do NOT evaluate:
104
+ - The correctness of *which tool* was used.
105
+ - Whether new tool inputs are valid by themselves.
106
+ - Task success or completeness.
107
+ - Your judgment concerns *only* whether previously returned tool outputs are
108
+ correctly understood and reused where they appear.
109
+ - If multiple faulty uses exist, list all in `faulty_details`.
110
+ - When uncertain whether a value use is correct, treat it as a fault and explain why.
111
+ - If tool outputs are missing but the response claims to use them, that counts as a fabricated use.
112
+ - If a tool fails, that is outside your scope, unless the response misuses or misreports the failed output.
113
+
114
+ > [TOOL CALLS] and [TOOL RESULTS] are internal; the user does not see them.
115
+
116
+ EXAMPLES (few-shot, using the JSON schema and key order defined above)
117
+
118
+ ### Example 1 - PASS
119
+ QUERY:
120
+ User turn 1:
121
+ Can you transfer $500 from my checking to my savings account?
122
+
123
+ Agent turn 1:
124
+ [TOOL_CALL] get_account_balances(user_id="USER456")
125
+ [TOOL_RESULT] {'accounts': [{'account_id': 'CHK001', 'type': 'checking', 'balance': 1250.75}, {'account_id': 'SAV001', 'type': 'savings', 'balance': 3400.20}]}
126
+ You have $1,250.75 in checking and $3,400.20 in savings. You have enough for the transfer.
127
+
128
+ User turn 2:
129
+ Great, please go ahead.
130
+
131
+ RESPONSE:
132
+ [TOOL_CALL] transfer_funds(from_account="CHK001", to_account="SAV001", amount=500)
133
+ [TOOL_RESULT] {'transaction_id':'TXN789123','status':'completed','from_account':'CHK001','to_account':'SAV001','amount':500,'new_balances':{'CHK001':750.75,'SAV001':3900.20}}
134
+ Transfer completed successfully. Checking now has $750.75 and savings $3,900.20.
135
+
136
+ EXPECTED JSON:
137
+ {
138
+ "faulty_details": [],
139
+ "reason": "All tool-derived claims and uses in the response match the prior tool outputs correctly.",
140
+ "label": "pass",
141
+ }
142
+
143
+
144
+ ### Example 2 - FAIL (unit misinterpretation)
145
+ QUERY:
146
+ User turn 1:
147
+ What's the current temperature in Rome?
148
+
149
+ RESPONSE:
150
+ [TOOL_CALL] weather_api(city="Rome")
151
+ [TOOL_RESULT] {"city":"Rome","temp":28,"condition":"Sunny"}
152
+ It’s currently 28°F and sunny in Rome.
153
+
154
+ TOOL_DEFINITIONS:
155
+ - weather_api: Gets current weather for a city and returns temperature in Celsius (inputs: city) (outputs: city, temp, condition)
156
+
157
+ EXPECTED JSON:
158
+ {
159
+ "faulty_details": [
160
+ "claim -> MISMATCH (expected 28°C, saw 28°F) mapped to weather_api.temp"
161
+ ],
162
+ "reason": "Agent incorrectly reported the temperature in Fahrenheit instead of Celsius as provided by the tool output.",
163
+ "label": "fail",
164
+ }
165
+
166
+
167
+ ### Example 3 - FAIL (fabricated inventory claim)
168
+ QUERY:
169
+ User turn 1:
170
+ Can you check if item B123 is available?
171
+
172
+ RESPONSE:
173
+ [TOOL_CALL] inventory_api(item_id="B123")
174
+ [TOOL_RESULT] {"item_id":"B123","qty":0,"eta":"2025-10-07"}
175
+ The item is in stock till the 7th of October.
176
+
177
+ EXPECTED JSON:
178
+ {
179
+ "faulty_details": [
180
+ "claim -> FABRICATED (no supporting tool field; inventory_api.qty is 0, eta is future date)"
181
+ ],
182
+ "reason": "Agent fabricated a claim that the item is in stock, whereas the tool output indicates a quantity of 0 and misunderstood availability eta",
183
+ "label": "fail",
184
+ }
185
+
186
+
187
+ ### Example 4 - FAIL (misuse of prior tool output in new tool call)
188
+ QUERY:
189
+ User turn 1:
190
+ Show my latest account transactions.
191
+
192
+ Agent turn 1:
193
+ [TOOL_CALL] get_account_list(user_id="U123")
194
+ [TOOL_RESULT] {"accounts":[{"id":"CHK100","type":"checking"},{"id":"SAV200","type":"savings"}]}
195
+ You have two accounts: CHK100 and SAV200. Which one do you want transactions for?
196
+
197
+ User turn 2:
198
+ Please get the transaction history for my checking account.
199
+
200
+ RESPONSE:
201
+ [TOOL_CALL] get_transactions(account_id="SAV200")
202
+ [TOOL_RESULT] ...
203
+ The latest transactions for your checking account are: ...
204
+
205
+ EXPECTED JSON:
206
+ {
207
+ "faulty_details": [
208
+ "use -> MISMATCH (expected CHK100, used SAV200) mapped to get_transactions.account_id"
209
+ ],
210
+ "reason": "Agent incorrectly used the savings account ID instead of the requested checking account ID, leading to a mismatch in the transactions reported.",
211
+ "label": "fail",
212
+ }
213
+
214
+ --
215
+ END OF EXAMPLES
216
+
217
+ FINAL NOTES:
218
+
219
+ - Output must be exactly one JSON object and must follow the key order: `faulty_details`, `reason`, `label`.
220
+
221
+ # Output
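The prompty above pins down both the key set and the key order of the judge's verdict (`faulty_details`, `reason`, `label`) and requires strictly valid JSON. As a rough orientation only, and not code from the package, the sketch below shows one way such a verdict could be assembled and serialized in Python; the `build_verdict` helper and the sample fault string are hypothetical.

```python
import json

def build_verdict(faulty_details, reason):
    # Hypothetical helper: assemble the verdict with the exact key order the
    # prompty requires (faulty_details, reason, label). Python dicts preserve
    # insertion order, so json.dumps emits the keys in this order.
    return {
        "faulty_details": list(faulty_details),
        "reason": reason,
        "label": "pass" if not faulty_details else "fail",
    }

verdict = build_verdict(
    ["claim -> MISMATCH (expected 28°C, saw 28°F) mapped to weather_api.temp"],
    "Temperature unit was misreported relative to the tool output.",
)
print(json.dumps(verdict))  # valid JSON: lowercase keys, no trailing commas
```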
azure/ai/evaluation/_evaluators/_tool_selection/__init__.py
@@ -0,0 +1,9 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+
5
+ from ._tool_selection import _ToolSelectionEvaluator
6
+
7
+ __all__ = [
8
+ "_ToolSelectionEvaluator",
9
+ ]
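The module above exports `_ToolSelectionEvaluator`, implemented in the next file of this diff. Purely as an illustrative sketch (the class is experimental and underscore-prefixed, so it is not part of the public surface), a call might look roughly like the following; the endpoint, deployment, and input payloads are placeholders rather than values from the package.

```python
# Sketch only: wiring up the experimental evaluator exported above.
# Endpoint, key, deployment, and the query/response/tool_definitions payloads
# are placeholders; the class docstring in _tool_selection.py points at the
# documented samples.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

evaluator = _ToolSelectionEvaluator(model_config=model_config)
result = evaluator(
    query="Can you check the weather in Rome?",  # conversation or plain string
    response=[],            # placeholder: agent messages, including tool calls
    tool_definitions=[],    # placeholder: schemas available to the agent
)
print(result.get("tool_selection"), result.get("tool_selection_result"))
```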
azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
@@ -0,0 +1,266 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ import os
5
+ import logging
6
+ import math
7
+ from typing import Dict, List, Union, TypeVar, Optional
8
+ from typing_extensions import overload, override
9
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
10
+ from azure.ai.evaluation._exceptions import (
11
+ ErrorBlame,
12
+ ErrorCategory,
13
+ ErrorTarget,
14
+ EvaluationException,
15
+ )
16
+ from ..._common.utils import check_score_is_valid, reformat_conversation_history
17
+ from azure.ai.evaluation._common._experimental import experimental
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ @experimental
23
+ class _ToolSelectionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
24
+ """The Tool Selection evaluator assesses the appropriateness and efficiency of tool choices made by an AI agent by examining:
25
+ - Relevance of selected tools to the conversation.
26
+ - Completeness of tool selection according to task requirements.
27
+ - Efficiency in avoiding unnecessary or redundant tools.
28
+
29
+ The evaluator uses a binary scoring system:
30
+ - Score 0 (Fail): Selected tools are irrelevant or incorrect, or essential tools are missing
31
+ - Score 1 (Pass): All needed tools are selected, even if there are redundant tools
32
+
33
+ This evaluation focuses on measuring whether the right tools were chosen for the task,
34
+ regardless of how those tools were executed or their parameter correctness.
35
+
36
+ :param model_config: Configuration for the Azure OpenAI model.
37
+ :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
38
+ ~azure.ai.evaluation.OpenAIModelConfiguration]
39
+
40
+ .. admonition:: Example:
41
+
42
+ .. literalinclude:: ../samples/evaluation_samples_evaluate.py
43
+ :start-after: [START tool_selection_evaluator]
44
+ :end-before: [END tool_selection_evaluator]
45
+ :language: python
46
+ :dedent: 8
47
+ :caption: Initialize and call a _ToolSelectionEvaluator.
48
+
49
+ .. admonition:: Example using Azure AI Project URL:
50
+
51
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
52
+ :start-after: [START tool_selection_evaluator]
53
+ :end-before: [END tool_selection_evaluator]
54
+ :language: python
55
+ :dedent: 8
56
+ :caption: Initialize and call _ToolSelectionEvaluator using Azure AI Project URL in the following format
57
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
58
+
59
+ .. note::
60
+
61
+ To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
62
+ To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
63
+ however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
64
+ """
65
+
66
+ _PROMPTY_FILE = "tool_selection.prompty"
67
+ _RESULT_KEY = "tool_selection"
68
+
69
+ _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
70
+ _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
71
+ _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
72
+ _INVALID_SCORE_MESSAGE = "Tool selection score must be 0 or 1."
73
+
74
+ id = "azureai://built-in/evaluators/tool_selection"
75
+ """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
76
+
77
+ @override
78
+ def __init__(self, model_config, *, threshold=1, credential=None, **kwargs):
79
+ current_dir = os.path.dirname(__file__)
80
+ prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
81
+ self.threshold = threshold
82
+ super().__init__(
83
+ model_config=model_config,
84
+ prompty_file=prompty_path,
85
+ result_key=self._RESULT_KEY,
86
+ threshold=1,
87
+ credential=credential,
88
+ **kwargs,
89
+ )
90
+
91
+ @override
92
+ def __call__( # pylint: disable=docstring-missing-param
93
+ self,
94
+ *args,
95
+ **kwargs,
96
+ ):
97
+ """
98
+ Evaluate tool selection quality for a given query, tool definitions, and tool calls.
99
+
100
+ For detailed parameter types and return value documentation, see the class documentation.
101
+ """
102
+ return super().__call__(*args, **kwargs)
103
+
104
+ def _convert_kwargs_to_eval_input(self, **kwargs):
105
+ """Convert an arbitrary input into a list of inputs for evaluators.
106
+ It is assumed that evaluators generally make use of their inputs in one of two ways.
107
+ Either they receive a collection of keyname inputs that are all single values
108
+ (like a query and response), or they receive a conversation that is a list of dictionary
109
+ values.
110
+
111
+ The self._singleton_inputs list assigned during initialization is used to find and extract
112
+ singleton keywords, and self._allow_conversation_input is used to determine if a conversation
113
+ is a valid input.
114
+
115
+ If both conversations and singletons are allowed, the function will raise an exception if both
116
+ are inputted.
117
+
118
+ This function must be overridden by child classes IF they need both a conversation and
119
+ other inputs to be passed in.
120
+
121
+ :keyword kwargs: The inputs to convert.
122
+ :type kwargs: Dict
123
+ :return: A dictionary of inputs for this evaluator's _do_eval function, or a dictionary containing an error message.
124
+ :rtype: Dict
125
+ """
126
+ # Collect inputs
127
+ tool_calls = kwargs.get("tool_calls")
128
+ tool_definitions = kwargs.get("tool_definitions", []) # Default to empty list
129
+ query = kwargs.get("query")
130
+ response = kwargs.get("response")
131
+
132
+ # Extract tool calls from response if not provided directly
133
+ if response:
134
+ parsed_tool_calls = self._parse_tools_from_response(response)
135
+ if parsed_tool_calls:
136
+ tool_calls = parsed_tool_calls
137
+
138
+ if not tool_calls:
139
+ return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
140
+
141
+ if not isinstance(tool_calls, list):
142
+ tool_calls = [tool_calls]
143
+ if not isinstance(tool_definitions, list):
144
+ tool_definitions = [tool_definitions] if tool_definitions else []
145
+
146
+ try:
147
+ needed_tool_definitions = self._extract_needed_tool_definitions(
148
+ tool_calls, tool_definitions, ErrorTarget.TOOL_SELECTION_EVALUATOR
149
+ )
150
+ except EvaluationException as e:
151
+ # Check if this is because no tool definitions were provided at all
152
+ if len(tool_definitions) == 0:
153
+ return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
154
+ else:
155
+ return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
156
+
157
+ if len(needed_tool_definitions) == 0:
158
+ return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
159
+
160
+ # Extract only tool names from tool calls, removing parameters and results
161
+ tool_names = self._extract_tool_names_from_calls(tool_calls)
162
+
163
+ return {
164
+ "query": query,
165
+ "tool_calls": tool_names, # Only tool names, no parameters
166
+ "tool_definitions": needed_tool_definitions,
167
+ }
168
+
169
+ @override
170
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
171
+ """Do Tool Selection evaluation.
172
+
173
+ :param eval_input: The input to the evaluator.
174
+ :type eval_input: Dict
175
+ :return: A dictionary containing the result of the evaluation.
176
+ :rtype: Dict[str, Union[str, float]]
177
+ """
178
+ # Format conversation history for cleaner evaluation
179
+ if "query" in eval_input:
180
+ eval_input["query"] = reformat_conversation_history(
181
+ eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
182
+ )
183
+
184
+ # Call the LLM to evaluate
185
+ prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
186
+ llm_output = prompty_output_dict.get("llm_output", {})
187
+
188
+ if isinstance(llm_output, dict):
189
+ score = llm_output.get("score", None)
190
+ if score not in [0, 1]:
191
+ raise EvaluationException(
192
+ message=f"Invalid score value: {score}. Expected 0 or 1.",
193
+ internal_message="Invalid score value.",
194
+ category=ErrorCategory.FAILED_EXECUTION,
195
+ blame=ErrorBlame.SYSTEM_ERROR,
196
+ )
197
+
198
+ # Format the output
199
+ explanation = llm_output.get("explanation", "")
200
+ score = int(score) # Keep as int since it's binary (0 or 1)
201
+ score_result = "pass" if score == 1 else "fail"
202
+
203
+ # Add tool selection accuracy post-processing
204
+ details = llm_output.get("details", {})
205
+ if details:
206
+ tool_selection_accuracy = self._calculate_tool_selection_accuracy(details)
207
+ details["tool_selection_accuracy"] = tool_selection_accuracy
208
+
209
+ response_dict = {
210
+ self._result_key: score,
211
+ f"{self._result_key}_result": score_result,
212
+ f"{self._result_key}_threshold": self._threshold,
213
+ f"{self._result_key}_reason": explanation,
214
+ f"{self._result_key}_details": details,
215
+ f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
216
+ f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
217
+ f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
218
+ f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
219
+ f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
220
+ f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
221
+ f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
222
+ }
223
+ return response_dict
224
+
225
+ else:
226
+ raise EvaluationException(
227
+ message="Tool selection evaluator returned invalid output.",
228
+ blame=ErrorBlame.SYSTEM_ERROR,
229
+ category=ErrorCategory.FAILED_EXECUTION,
230
+ target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
231
+ )
232
+
233
+ async def _real_call(self, **kwargs):
234
+ """The asynchronous call where real end-to-end evaluation logic is performed.
235
+
236
+ :keyword kwargs: The inputs to evaluate.
237
+ :type kwargs: Dict
238
+ :return: The evaluation result.
239
+ :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
240
+ """
241
+ # Convert inputs into list of evaluable inputs.
242
+ eval_input = self._convert_kwargs_to_eval_input(**kwargs)
243
+ if isinstance(eval_input, dict) and eval_input.get("error_message"):
244
+ return self._not_applicable_result(eval_input.get("error_message"), 1)
245
+
246
+ result = await self._do_eval(eval_input)
247
+
248
+ return result
249
+
250
+ def _calculate_tool_selection_accuracy(self, details):
251
+ """Calculate tool selection accuracy from the evaluation details.
252
+
253
+ :param details: The details dictionary from the LLM evaluation output
254
+ :type details: Dict
255
+ :return: Tool selection accuracy as a percentage
256
+ :rtype: float
257
+ """
258
+ correct_tool_selections = details.get("correct_tool_selections", 0)
259
+ wrong_tool_selections = details.get("wrong_tool_selections", 0)
260
+ total_tools_called = correct_tool_selections + wrong_tool_selections
261
+
262
+ if total_tools_called > 0:
263
+ accuracy = (correct_tool_selections / total_tools_called) * 100
264
+ return round(accuracy, 2)
265
+ else:
266
+ return 100.0
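For orientation, here is a small worked example (not from the package) of the post-processing performed by `_calculate_tool_selection_accuracy` together with the binary pass/fail mapping in `_do_eval`; the `details` payload is a hypothetical LLM output.

```python
# Hypothetical `details` dictionary as the judge model might return it.
details = {"correct_tool_selections": 3, "wrong_tool_selections": 1}

correct = details.get("correct_tool_selections", 0)
wrong = details.get("wrong_tool_selections", 0)
total = correct + wrong

# Mirrors _calculate_tool_selection_accuracy: 3 of 4 selections correct -> 75.0;
# if no tool selections were counted at all, the method falls back to 100.0.
accuracy = round((correct / total) * 100, 2) if total > 0 else 100.0

score = 1  # binary score from the LLM (0 = fail, 1 = pass)
result = "pass" if score == 1 else "fail"

print(accuracy, result)  # 75.0 pass
```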
azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty
@@ -0,0 +1,104 @@
1
+ ---
2
+ name: Tool Selection
3
+ description: Evaluates Tool Selection Accuracy and Efficiency for tools chosen by the agent
4
+ model:
5
+ api: chat
6
+ parameters:
7
+ temperature: 0.0
8
+ max_tokens: 3000
9
+ top_p: 1.0
10
+ presence_penalty: 0
11
+ frequency_penalty: 0
12
+ response_format:
13
+ type: json_object
14
+
15
+ inputs:
16
+ query:
17
+ type: List
18
+ tool_calls:
19
+ type: List
20
+ tool_definitions:
21
+ type: Dict
22
+
23
+ ---
24
+ system:
25
+ # Instruction
26
+ ## Goal
27
+ You are an expert in evaluating the quality of tool selection made by an AI agent. Your goal is to assess whether the agent chose the right tools, avoided unnecessary tools, and didn't miss essential tools needed to address the user's query.
28
+
29
+ - **Definition**: You are given a definition of tool selection quality that is being evaluated to guide your score.
30
+ - **Data**: Your input data include CONVERSATION, TOOL CALLS (names only), and TOOL DEFINITIONS.
31
+ - **Tasks**: You will evaluate the quality of the tool selection based on the selected tools' relevance, efficiency, and potential to help the agent resolve the user's query.
32
+
33
+ user:
34
+ # Definition
35
+ **Tool Selection** refers to the appropriateness and efficiency of the TOOLS CHOSEN by an agent in response to a user's query within an ongoing CONVERSATION.
36
+
37
+ # EVALUATION CRITERIA
38
+ Evaluate based on these factors:
39
+
40
+ 1. **Tool Selection**: Are the selected tools appropriate and useful for addressing the user's query?
41
+ 2. **Tool Completeness**: Did the agent select all **ESSENTIAL** tools available in the tool definitions that are **REQUIRED** to address the core request of the query?
42
+ 3. **Tool Efficiency**: Did the agent avoid selecting unnecessary or redundant tools?
43
+ 4. **Scope Limitation**: ONLY evaluate tool selections in the "TOOL CALLS MADE BY AGENT" section.
44
+ - Tool calls in the CONVERSATION section are for context only
45
+ - Focus exclusively on the agent's response to the user's LAST query
46
+ - Use conversation history only to understand context and requirements
47
+
48
+ **Success Criteria**: Tools should be relevant to help answer the query. The focus is on appropriate tool choice and the selected tools' ability to address the user's query.
49
+
50
+ # Ratings
51
+ ## [Tool Selection: 0] (Fail)
52
+ **Definition:**
53
+ The tool selection fails if any of the following conditions are met:
54
+ - Tools selected are irrelevant or inappropriate for the user's query
55
+ - Essential tools that are available in the TOOL DEFINITIONS and are needed to complete the task are not selected by the agent **AND** haven't been called previously in the conversation
56
+ - The agent selected tools that show a fundamental misunderstanding of the task requirements
57
+
58
+ **Examples:**
59
+ - User asks for weather information -> Agent selects a file search tool, instead of the weather-fetching tool available in the tool definitions.
60
+ - User asks to analyze sales data and generate a report -> Agent selects only a data retrieval tool but misses the report generation tool that is available **AND** no data retrieval was done previously in the conversation.
61
+ - User asks for stock price analysis -> Agent selects multiple unrelated weather, news, and calendar tools but not the stock price tool.
62
+
63
+ ## [Tool Selection: 1] (Pass)
64
+ **Definition:**
65
+ The tool selection passes when all necessary tools have been selected to address the user's query (considering both current tool calls and previous calls in the conversation history), even if redundant or unnecessary tools were also selected. All essential tools must be available to the agent (either from current calls or previous calls), and no selected tools should be completely irrelevant.
66
+
67
+ **Examples:**
68
+ - User asks for weather forecast -> Agent selects weather tool (necessary) plus calendar and reminder tools (unnecessary but not harmful).
69
+ - User asks to book a flight -> Agent selects flight search and booking tools (necessary) plus hotel booking tools (unnecessary, but the essential tool was called).
70
+ - User asks to summarize a document -> Agent selects document reader, text summarizer, and language detector (all useful tools, even if some are not strictly necessary).
71
+ - User asks for "projected spend next season" -> Agent selects `project_budget_over_days` (appropriate for projection) and has access to spending data from previous `get_spending_by_category` calls in the conversation. No need to re-call data retrieval tools.
72
+
73
+ ## Chain of Thought Structure
74
+ Structure your reasoning as follows:
75
+ 1. **Understand the user's query**: Analyze what the user is asking for in their latest message and identify the specific operation type needed.
76
+ 2. **Review conversation history for existing data**: **CRITICALLY IMPORTANT** - Examine the CONVERSATION history to identify what tools have already been called and what data is already available to the agent. If a tool was called earlier in the conversation and provided the necessary data, the agent does NOT need to call it again.
77
+ 3. **Identify required tools**: Determine which tools from the available TOOL DEFINITIONS would be necessary to address the query, excluding any tools that have already been called in the conversation and provided the needed data.
78
+ 4. **Analyze selected tools**: Examine which tools the agent actually selected in the current response.
79
+ 5. **Evaluate appropriateness**: Assess if selected tools are relevant to the task and are the most appropriate tools available for the specific query type.
80
+ 6. **Check completeness**: Verify if any essential tools are missing from the current agent response, **BUT ONLY** count tools as missing if they haven't been called previously in the conversation AND their data is needed for the current query.
81
+ 7. **Assess efficiency**: Determine if any unnecessary or redundant tools were selected.
82
+ 8. **Determine the appropriate level**: Assign 0 (Fail) if the selected tools are irrelevant or incorrect, or if essential tools that weren't previously called are missing. Assign 1 (Pass) if all needed tools are selected (considering conversation history), even with some redundant additions.
83
+
84
+ Note: You are only passed the names of the tools that have been called by the agent. You do not see the parameters passed to the tools or the results of those tools, because your only concern is the appropriateness and completeness of the tools selected by the agent, not parameter correctness or result correctness.
85
+
86
+ # Data
87
+ CONVERSATION: {{query}}
88
+ TOOL DEFINITIONS: {{tool_definitions}}
89
+ TOOL CALLS MADE BY AGENT: {{tool_calls}}
90
+
91
+ # Tasks
92
+ ## Please provide your evaluation for the tool selection in relation to the user's query based on the definitions and examples above.
93
+ Your output should consist only of a JSON object with the following fields:
94
+ - explanation: an explanation of the score focusing on tool selection appropriateness, based on the Chain of Thought Structure.
95
+ - details: a dictionary that contains the following fields:
96
+ - correct_tool_selections: number of appropriate/relevant tools selected
97
+ - wrong_tool_selections: number of inappropriate/irrelevant tools selected
98
+ - excessive_tools_used: number of tools that were unnecessary or redundant
99
+ - excessive_tools_list: list of the tool names that were excessive
100
+ - missing_tools: number of essential tools that should have been called but weren't
101
+ - missing_tools_list: list of the tool names that should have been called but weren't
102
+ - score: an integer value of 0 or 1 that represents the tool selection quality (0 = Fail, 1 = Pass).
103
+
104
+ # Output
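To make the output contract above concrete, the snippet below prints one example of a well-formed judge response; the tool names and counts are invented for illustration and a real response would come from the model configured in the prompty front matter.

```python
import json

# Invented example of the JSON object the tool_selection prompty asks for:
# the agent called the one essential tool plus a redundant one, so it passes.
example_output = {
    "explanation": "The weather tool needed for the forecast request was "
                   "selected; the extra calendar lookup was redundant but not "
                   "harmful, so the selection passes.",
    "details": {
        "correct_tool_selections": 1,
        "wrong_tool_selections": 0,
        "excessive_tools_used": 1,
        "excessive_tools_list": ["calendar_lookup"],
        "missing_tools": 0,
        "missing_tools_list": [],
    },
    "score": 1,
}
print(json.dumps(example_output, indent=2))
```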
azure/ai/evaluation/_evaluators/_tool_success/__init__.py
@@ -2,6 +2,6 @@
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
4
 
5
- from ._task_success import TaskSuccessEvaluator
5
+ from ._tool_success import _ToolSuccessEvaluator
6
6
 
7
- __all__ = ["TaskSuccessEvaluator"]
7
+ __all__ = ["_ToolSuccessEvaluator"]