azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.
Files changed (85)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  5. azure/ai/evaluation/_common/rai_service.py +3 -3
  6. azure/ai/evaluation/_common/utils.py +74 -17
  7. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  8. azure/ai/evaluation/_converters/_models.py +75 -26
  9. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  10. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  11. azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
  12. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
  13. azure/ai/evaluation/_evaluate/_utils.py +5 -2
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  15. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  21. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  22. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  23. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  24. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  25. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  29. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  30. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
  31. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  37. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
  38. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
  39. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  40. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
  42. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
  43. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  44. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
  45. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
  46. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  48. azure/ai/evaluation/_exceptions.py +1 -0
  49. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  50. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  51. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  52. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  53. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  54. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  55. azure/ai/evaluation/_version.py +1 -1
  56. azure/ai/evaluation/red_team/__init__.py +4 -3
  57. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  58. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  59. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  60. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  61. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  62. azure/ai/evaluation/red_team/_red_team.py +655 -2665
  63. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  64. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  65. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  66. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
  67. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  68. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  69. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  70. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  71. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  72. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  73. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  74. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  75. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  76. azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  78. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
  80. azure/ai/evaluation/simulator/_simulator.py +12 -0
  81. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
  82. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
  83. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  84. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  85. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
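
The most visible behavioural change in this diff is the rewrite of the tool call accuracy evaluator (file 44, shown in full below): it now issues a single LLM call per turn, scores on a 1-5 rubric with a default threshold of 3, accepts an optional credential, and returns tool_call_accuracy* keys plus a details field instead of the old per-tool-call binary results. The following is a minimal sketch of how the updated evaluator might be invoked; the endpoint, deployment, and tool-call payload are illustrative placeholders, not values taken from the package.

    # Sketch only: illustrates the 1.11.0 ToolCallAccuracyEvaluator surface described in this diff.
    # Endpoint, key, deployment, and the tool-call payload below are placeholders.
    from azure.ai.evaluation import AzureOpenAIModelConfiguration, ToolCallAccuracyEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
        api_key="<api-key>",                                        # placeholder
        azure_deployment="<chat-deployment>",                       # placeholder
    )

    # threshold now defaults to 3 on the new 1-5 rubric; credential is a new optional argument.
    evaluator = ToolCallAccuracyEvaluator(model_config=model_config, threshold=3)

    result = evaluator(
        query="What's the weather in Seattle?",
        tool_calls=[
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "fetch_weather",
                "arguments": {"location": "Seattle"},
            }
        ],
        tool_definitions=[
            {
                "name": "fetch_weather",
                "type": "function",
                "description": "Fetches the weather for a location.",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                },
            }
        ],
    )

    # Per this diff, the result contains: tool_call_accuracy (1-5, or "not applicable"),
    # tool_call_accuracy_result ("pass"/"fail"), tool_call_accuracy_threshold,
    # tool_call_accuracy_reason, and details.
    print(result)

Note that when no tool calls or no tool definitions can be resolved, the new code returns a "not applicable" result with the error message as the reason rather than raising, which changes how missing-input cases surface in batch runs.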
azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -1,33 +1,75 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ from itertools import chain
  import math
  import os
  import logging
  import re
- from typing import Dict, List, Union, TypeVar, cast
+ from typing import Dict, List, Union, TypeVar, Optional
  from typing_extensions import overload, override
  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
- from azure.ai.evaluation._common.utils import remove_optional_singletons, parse_quality_evaluator_reason_score
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
- from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+ from azure.ai.evaluation._exceptions import (
+     ErrorBlame,
+     ErrorCategory,
+     ErrorTarget,
+     EvaluationException,
+ )
+ from ..._common.utils import check_score_is_valid
  from azure.ai.evaluation._common._experimental import experimental
+ from ..._converters._models import (
+     _BUILT_IN_DESCRIPTIONS,
+     _BUILT_IN_PARAMS,
+ )

  logger = logging.getLogger(__name__)

  T_EvalValue = TypeVar("T_EvalValue")


+ def _get_built_in_definition(tool_name: str):
+     """Get the definition for the built-in tool."""
+     if tool_name in _BUILT_IN_DESCRIPTIONS:
+         return {
+             "type": tool_name,
+             "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+             "name": tool_name,
+             "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+         }
+     return None
+
+
+ def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
+     """Extract tool definitions needed for the given built-in tool calls."""
+     needed_definitions = []
+     for tool_call in tool_calls:
+         if isinstance(tool_call, dict):
+             tool_type = tool_call.get("type")
+
+             # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+             if tool_type == "tool_call":
+                 tool_name = tool_call.get("name")
+                 if tool_name in _BUILT_IN_DESCRIPTIONS:
+                     built_in_def = _get_built_in_definition(tool_name)
+                     if built_in_def and built_in_def not in needed_definitions:
+                         needed_definitions.append(built_in_def)
+
+     return needed_definitions
+
+
  @experimental
  class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
-         - Relevance to the conversation
-         - Parameter correctness according to tool definitions
-         - Parameter value extraction from the conversation
+         - Relevance to the conversation.
+         - Parameter correctness according to tool definitions.
+         - Parameter value extraction from the conversation.

-     The evaluator uses a binary scoring system (0 or 1):
-         - Score 0: The tool call is irrelevant or contains information not in the conversation/definition
-         - Score 1: The tool call is relevant with properly extracted parameters from the conversation
+     The evaluator uses a scoring rubric of 1 to 5:
+         - Score 1: The tool calls are irrelevant.
+         - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed.
+         - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made.
+         - Score 4: The tool calls are relevant, but some tools returned errors and the agent retried calling them again and succeeded.
+         - Score 5: The tool calls are relevant, and all parameters were correctly passed.

      This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing
      user needs while properly following tool definitions and using information present in the
@@ -64,22 +106,34 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """

      _PROMPTY_FILE = "tool_call_accuracy.prompty"
-     _RESULT_KEY = "tool_call_accurate"
-     _AGGREGATE_RESULT_KEY = "tool_call_accuracy"
+     _RESULT_KEY = "tool_call_accuracy"
+
+     _MAX_TOOL_CALL_ACCURACY_SCORE = 5
+     _MIN_TOOL_CALL_ACCURACY_SCORE = 1
+     _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3

-     _MAX_TOOL_CALL_ACCURACY_SCORE = 1.0
-     _MIN_TOOL_CALL_ACCURACY_SCORE = 0.0
-     _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 0.8
+     _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
+     _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
+     _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
+     _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."

-     id = "id"
+     _LLM_SCORE_KEY = "tool_calls_success_level"
+
+     id = "azureai://built-in/evaluators/tool_call_accuracy"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
+     def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
          self.threshold = threshold
-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             credential=credential,
+             **kwargs,
+         )

      @overload
      def __call__(
@@ -134,84 +188,45 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          """
          # TODO add warning that only tool calls of type function are supported
          # Collect inputs
-         tool_calls = kwargs.get("tool_calls", None)
-         tool_definitions = kwargs.get("tool_definitions")
-         query = kwargs.get("query", None)
-         response = kwargs.get("response", None)
-
-         if response is None and tool_calls is None:
-             raise EvaluationException(
-                 message="Either response or tool_calls must be provided.",
-                 blame=ErrorBlame.USER_ERROR,
-                 category=ErrorCategory.MISSING_FIELD,
-                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-             )
-
-         if tool_definitions is None:
-             raise EvaluationException(
-                 message="Tool definitions must be provided.",
-                 blame=ErrorBlame.USER_ERROR,
-                 category=ErrorCategory.MISSING_FIELD,
-                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-             )
-
+         tool_calls = kwargs.get("tool_calls")
+         tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
+         query = kwargs.get("query")
+         response = kwargs.get("response")
          # TODO : Support classes that represents tool calls, messages etc once client side definitions are available
-         if tool_calls is None:
-             # Extract tool calls from response if not provided
-             tool_calls = []
-             if isinstance(response, list):
-                 for message in response:
-                     if message.get("role") == "assistant":
-                         tool_calls.extend(
-                             [content for content in message.get("content") if content.get("type") == "tool_call"]
-                         )
-             if len(tool_calls) == 0:
-                 raise EvaluationException(
-                     message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
-                     blame=ErrorBlame.USER_ERROR,
-                     category=ErrorCategory.MISSING_FIELD,
-                     target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                 )
+         if response:
+             parsed_tool_calls = self._parse_tools_from_response(response)
+             if parsed_tool_calls:
+                 tool_calls = parsed_tool_calls
+
+         if not tool_calls:
+             return {"error_message": self._NO_TOOL_CALLS_MESSAGE}

          if not isinstance(tool_calls, list):
              tool_calls = [tool_calls]
-
          if not isinstance(tool_definitions, list):
-             tool_definitions = [tool_definitions]
-
-         eval_inputs = []
-         # TODO : When evaluating an agent tool that depends on the output of a previous tool call,
-         # we need to provide the output of the previous tool call as part of messages.
-         for tool_call in tool_calls:
-             if (
-                 isinstance(tool_call, dict) and tool_call.get("type") == "tool_call"
-             ):  # TODO assuming dict here but it can be a class
-                 function_name = tool_call.get("name")
-                 tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name]
-                 if len(tool_definition) > 0:
-                     tool_definition = tool_definition
-                 else:
-                     raise EvaluationException(
-                         message="Tool definition not found",
-                         blame=ErrorBlame.USER_ERROR,
-                         category=ErrorCategory.INVALID_VALUE,
-                         target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                     )
-                 eval_inputs.append({"query": query, "tool_call": tool_call, "tool_definition": tool_definition})
+             tool_definitions = [tool_definitions] if tool_definitions else []
+
+         try:
+             needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
+         except EvaluationException as e:
+             # Check if this is because no tool definitions were provided at all
+             if len(tool_definitions) == 0:
+                 return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
              else:
-                 raise EvaluationException(
-                     message="Tool definition not found",
-                     blame=ErrorBlame.USER_ERROR,
-                     category=ErrorCategory.INVALID_VALUE,
-                     target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-                 )
+                 return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}

-         return eval_inputs
+         if len(needed_tool_definitions) == 0:
+             return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+
+         return {
+             "query": query,
+             "tool_calls": tool_calls,
+             "tool_definitions": needed_tool_definitions,
+         }

      @override
      async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
-         """Do a relevance evaluation.
-
+         """Do a tool call accuracy evaluation.
          :param eval_input: The input to the evaluator. Expected to contain
              whatever inputs are needed for the _flow method, including context
              and other fields depending on the child class.
@@ -219,23 +234,43 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          :return: The evaluation result.
          :rtype: Dict
          """
+         # Single LLM call for all tool calls
          llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

-         score = math.nan
-         if llm_output:
-             score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
-             if score >= 0 and score <= 1:
-                 return {
-                     self._result_key: bool(float(score)),
-                     f"{self._result_key}_reason": reason,
-                     "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
-                 }
-         raise EvaluationException(
-             message="Tool call accuracy evaluator: Invalid score returned from LLM.",
-             blame=ErrorBlame.SYSTEM_ERROR,
-             category=ErrorCategory.INVALID_VALUE,
-             target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-         )
+         if isinstance(llm_output, dict):
+             score = llm_output.get(self._LLM_SCORE_KEY, None)
+             if not score or not check_score_is_valid(
+                 score,
+                 ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE,
+                 ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE,
+             ):
+                 raise EvaluationException(
+                     message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
+                     internal_message="Invalid score value.",
+                     category=ErrorCategory.FAILED_EXECUTION,
+                     blame=ErrorBlame.SYSTEM_ERROR,
+                 )
+
+             # Format the output
+             reason = llm_output.get("chain_of_thought", "")
+             score = float(score)
+             score_result = "pass" if score >= self.threshold else "fail"
+             response_dict = {
+                 self._result_key: score,
+                 f"{self._result_key}_result": score_result,
+                 f"{self._result_key}_threshold": self.threshold,
+                 f"{self._result_key}_reason": reason,
+                 "details": llm_output.get("details", {}),
+             }
+             return response_dict
+
+         else:
+             raise EvaluationException(
+                 message="Tool call accuracy evaluator returned invalid output.",
+                 blame=ErrorBlame.SYSTEM_ERROR,
+                 category=ErrorCategory.FAILED_EXECUTION,
+                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+             )

      async def _real_call(self, **kwargs):
          """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -246,106 +281,98 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
          """
          # Convert inputs into list of evaluable inputs.
-         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
-         if len(eval_input_list) == 0:
-             return {
-                 self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
-                 f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
-                 f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
-                 f"{self._AGGREGATE_RESULT_KEY}_reason": "No tool calls were made.",
-                 "per_tool_call_details": [],
-             }
-
-         per_turn_results = []
-         # Evaluate all inputs.
-         for eval_input in eval_input_list:
-             if self._is_applicable_tool(eval_input):
-                 per_turn_results.append(await self._do_eval(eval_input))
-             else:
-                 per_turn_results.append(self._not_applicable_result(eval_input))
-
-         return self._aggregate_results(per_turn_results=per_turn_results)
-
-     def _is_applicable_tool(self, eval_input):
-         """Determine if a given tool should be evaluated, since we only evaluate tools that
-         have sufficient context available.
-
-         :type eval_input: Dict
-         :return: True if the tool call should be evaluated
-         :rtype: bool
-         """
-         tool_definition = eval_input.get("tool_definition")
-         if tool_definition is None or len(tool_definition) != 1:
-             return False
-         tool_type = tool_definition[0].get("type")
-         if tool_type is None or tool_type != "function":
-             return False
-         return True
-
-     def _not_applicable_result(self, eval_input):
+         eval_input = self._convert_kwargs_to_eval_input(**kwargs)
+         if isinstance(eval_input, dict) and eval_input.get("error_message"):
+             # If there is an error message, return not applicable result
+             return self._not_applicable_result(eval_input.get("error_message"))
+         # Do the evaluation
+         result = await self._do_eval(eval_input)
+         # Return the result
+         return result
+
+     def _not_applicable_result(self, error_message):
          """Return a result indicating that the tool call is not applicable for evaluation.
-
          :param eval_input: The input to the evaluator.
          :type eval_input: Dict
          :return: A dictionary containing the result of the evaluation.
          :rtype: Dict[str, Union[str, float]]
          """
+         # If no tool calls were made or tool call type is not supported, return not applicable result
          return {
-             f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
-             f"{self._result_key}_reason": "Tool call not supported for evaluation",
-             "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
+             self._result_key: self._NOT_APPLICABLE_RESULT,
+             f"{self._result_key}_result": "pass",
+             f"{self._result_key}_threshold": self.threshold,
+             f"{self._result_key}_reason": error_message,
+             "details": {},
          }

-     def _aggregate_results(self, per_turn_results):
-         """Aggregate the evaluation results of each conversation turn into a single result.
+     def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
+         """Extract the tool definitions that are needed for the provided tool calls."""
+         needed_tool_definitions = []

-         Exact implementation might need to vary slightly depending on the results produced.
-         Default behavior is to average the all number-based outputs.
+         # Add all user-provided tool definitions
+         needed_tool_definitions.extend(tool_definitions)

-         :param per_turn_results: List of evaluation results for each turn in the conversation.
-         :type per_turn_results: List[Dict]
-         :return: A dictionary containing aggregated results, with numeric metrics having their
-             means as top-level values in the dictionary, and all original
-             values (including non-numerics) located in under the "evaluation_per_turn" key,
-             which each sub-key being a metric and each sub-value being a the list of that metric's
-             per-turn values.
-         :rtype: AggregateResult[T_EvalValue]
-         """
+         # Add the needed built-in tool definitions (if they are called)
+         built_in_definitions = _get_needed_built_in_definitions(tool_calls)
+         needed_tool_definitions.extend(built_in_definitions)

-         aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
-         evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
+         # OpenAPI tool is a collection of functions, so we need to expand it
+         tool_definitions_expanded = list(
+             chain.from_iterable(
+                 tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                 for tool in needed_tool_definitions
+             )
+         )

-         # Go over each turn, and rotate the results into a
-         # metric: List[values] format for the evals_per_turn dictionary.
+         # Validate that all tool calls have corresponding definitions
+         for tool_call in tool_calls:
+             if isinstance(tool_call, dict):
+                 tool_type = tool_call.get("type")
+
+                 if tool_type == "tool_call":
+                     tool_name = tool_call.get("name")
+                     if tool_name and tool_name in _BUILT_IN_DESCRIPTIONS:
+                         # This is a built-in tool from converter, already handled above
+                         continue
+                     elif tool_name:
+                         # This is a regular function tool from converter
+                         tool_definition_exists = any(
+                             tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                             for tool in tool_definitions_expanded
+                         )
+                         if not tool_definition_exists:
+                             raise EvaluationException(
+                                 message=f"Tool definition for {tool_name} not found",
+                                 blame=ErrorBlame.USER_ERROR,
+                                 category=ErrorCategory.INVALID_VALUE,
+                                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                             )
+                     else:
+                         raise EvaluationException(
+                             message=f"Tool call missing name: {tool_call}",
+                             blame=ErrorBlame.USER_ERROR,
+                             category=ErrorCategory.INVALID_VALUE,
+                             target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                         )
+                 else:
+                     # Unsupported tool format - only converter format is supported
+                     raise EvaluationException(
+                         message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                         blame=ErrorBlame.USER_ERROR,
+                         category=ErrorCategory.INVALID_VALUE,
+                         target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                     )
+             else:
+                 # Tool call is not a dictionary
+                 raise EvaluationException(
+                     message=f"Tool call is not a dictionary: {tool_call}",
+                     blame=ErrorBlame.USER_ERROR,
+                     category=ErrorCategory.INVALID_VALUE,
+                     target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                 )

-         num_evaluated = len(
-             [
-                 per_turn_result
-                 for per_turn_result in per_turn_results
-                 if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT
-             ]
-         )
-         if num_evaluated == 0:
-             # None of the invoked tools were applicable, return not applicable result
-             # (If a tool fails evaluation, we'll throw an exception)
-             return {
-                 self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
-                 f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
-                 f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
-                 f"{self._AGGREGATE_RESULT_KEY}_reason": "Tool call accuracy evaluation is not yet supported for the invoked tools.",
-                 "per_tool_call_details": [],
-             }
-         # ignore not_applicable results, where the _result_key will be "not applicable"
-         score = (
-             sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results]) / num_evaluated
-         )
-         aggregated[self._AGGREGATE_RESULT_KEY] = score
-         aggregated[f"{self._AGGREGATE_RESULT_KEY}_result"] = (
-             self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
-         )
-         aggregated[f"{self._AGGREGATE_RESULT_KEY}_threshold"] = self.threshold
-         aggregated["per_tool_call_details"] = per_turn_results
-         return aggregated
+         return needed_tool_definitions

      @override
      def __call__(  # pylint: disable=docstring-missing-param