azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.
Files changed (49)
  1. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  2. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  3. azure/ai/evaluation/_converters/_models.py +75 -26
  4. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  5. azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
  6. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +104 -35
  7. azure/ai/evaluation/_evaluate/_utils.py +4 -0
  8. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
  9. azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
  10. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  11. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
  12. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
  13. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
  14. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
  15. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
  16. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
  17. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
  18. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
  19. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
  20. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
  21. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
  22. azure/ai/evaluation/_exceptions.py +1 -0
  23. azure/ai/evaluation/_version.py +1 -1
  24. azure/ai/evaluation/red_team/__init__.py +2 -1
  25. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  26. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  27. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  28. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  29. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  30. azure/ai/evaluation/red_team/_red_team.py +697 -3067
  31. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  32. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  33. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
  34. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  35. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  36. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  37. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  38. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  39. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  40. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  41. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  42. azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
  43. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
  44. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
  45. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/METADATA +39 -3
  46. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/RECORD +49 -41
  47. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/WHEEL +1 -1
  48. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info/licenses}/NOTICE.txt +0 -0
  49. {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.1.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import os
+import os, logging
 from typing import Dict, List, Optional, Union

 from typing_extensions import overload, override
@@ -9,7 +9,14 @@ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty

 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
-from ..._common.utils import construct_prompty_model_config, validate_model_config
+from ..._common.utils import (
+    ErrorBlame,
+    ErrorTarget,
+    EvaluationException,
+    ErrorCategory,
+    construct_prompty_model_config,
+    validate_model_config,
+)

 try:
     from ..._user_agent import UserAgentSingleton
@@ -21,6 +28,9 @@ except ImportError:
         return "None"


+logger = logging.getLogger(__name__)
+
+
 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
     Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
@@ -78,12 +88,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
     _RESULT_KEY = "groundedness"
     _OPTIONAL_PARAMS = ["query"]
+    _SUPPORTED_TOOLS = ["file_search"]

     id = "azureai://built-in/evaluators/groundedness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=3, **kwargs):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query

@@ -93,6 +104,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
         self._model_config = model_config
@@ -120,6 +132,26 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :rtype: Dict[str, float]
         """

+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: List[dict],
+        tool_definitions: List[dict],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response from the agent to be evaluated.
+        :paramtype response: List[dict]
+        :keyword tool_definitions: The tool definitions used by the agent.
+        :paramtype tool_definitions: List[dict]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
     @overload
     def __call__(
         self,
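The overload above adds an agent mode to GroundednessEvaluator; its supporting implementation appears in the next hunk. A minimal usage sketch, assuming the converter-style message shapes that the new parsing code expects (endpoint, deployment, file contents, and tool IDs below are placeholders, not values taken from this diff):

# Hedged sketch: message shapes follow the converter format the new overload parses.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-deployment>",                       # placeholder
    "api_key": "<your-api-key>",                                   # placeholder
}
groundedness = GroundednessEvaluator(model_config=model_config)

query = "What is the refund window for online orders?"
response = [
    {   # assistant turn that calls the supported file_search tool
        "role": "assistant",
        "content": [{
            "type": "tool_call",
            "tool_call_id": "call_1",
            "name": "file_search",
            "arguments": {"queries": ["refund window online orders"]},
        }],
    },
    {   # tool turn carrying the file_search results used as grounding context
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{
            "type": "tool_result",
            "tool_result": [{"content": [{"text": "Orders can be refunded within 30 days of purchase."}]}],
        }],
    },
    {   # final assistant answer that is judged against the extracted context
        "role": "assistant",
        "content": [{"type": "text", "text": "You can request a refund within 30 days of purchase."}],
    },
]
tool_definitions = [
    {"name": "file_search", "type": "file_search", "description": "Search the attached vector store."}
]

result = groundedness(query=query, response=response, tool_definitions=tool_definitions)
# Expected keys, per the result_key used above: "groundedness", "groundedness_result",
# "groundedness_threshold", "groundedness_reason".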
@@ -174,3 +206,81 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)

         return super().__call__(*args, **kwargs)
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        try:
+            return await super()._real_call(**kwargs)
+        except EvaluationException as ex:
+            if ex.category == ErrorCategory.NOT_APPLICABLE:
+                return {
+                    self._result_key: self._NOT_APPLICABLE_RESULT,
+                    f"{self._result_key}_result": "pass",
+                    f"{self._result_key}_threshold": self.threshold,
+                    f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+                }
+            else:
+                raise ex
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        if "context" in kwargs or "conversation" in kwargs:
+            return super()._convert_kwargs_to_eval_input(**kwargs)
+
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+        tool_definitions = kwargs.get("tool_definitions")
+
+        if not query or not response or not tool_definitions:
+            msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query', 'response' and 'tool_definitions' are required."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        context = self._get_context_from_agent_response(response, tool_definitions)
+        if not context:
+            raise EvaluationException(
+                message=f"Context could not be extracted from agent response. Supported tools for groundedness are {self._SUPPORTED_TOOLS}. If supported tools are not used groundedness is not calculated.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        return super()._convert_kwargs_to_eval_input(response=response[-1], context=context, query=query)
+
+    def _get_context_from_agent_response(self, response, tool_definitions):
+        context = ""
+        try:
+            logger.debug("Extracting context from response")
+            tool_calls = self._parse_tools_from_response(response=response)
+            logger.debug(f"Tool Calls parsed successfully : {tool_calls}")
+            if tool_calls:
+                for tool_call in tool_calls:
+                    if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
+                        tool_name = tool_call.get("name")
+                        for tool in tool_definitions:
+                            if tool.get("name") == tool_name and tool.get("type") in self._SUPPORTED_TOOLS:
+                                if tool_name == "file_search":
+                                    tool_result = tool_call.get("tool_result")
+                                    if tool_result:
+                                        for result in tool_result:
+                                            content_list = result.get("content")
+                                            if content_list:
+                                                for content in content_list:
+                                                    text = content.get("text")
+                                                    if text:
+                                                        context = context + "\n" + str(text)
+        except Exception as ex:
+            logger.debug(f"Error extracting context from agent response : {str(ex)}")
+            context = ""
+
+        return context if context else None
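As _real_call above shows, a NOT_APPLICABLE error raised during context extraction is converted into a passing result with an explanatory reason rather than being surfaced as a failure. A hedged sketch of that fallback, reusing the evaluator and model_config from the previous sketch (the unsupported tool name is illustrative):

# If the agent never calls a supported tool (only file_search is supported), context
# extraction returns None, _convert_kwargs_to_eval_input raises an EvaluationException
# with category NOT_APPLICABLE, and _real_call turns that into a "pass" with a reason.
response_without_file_search = [
    {
        "role": "assistant",
        "content": [{"type": "tool_call", "tool_call_id": "call_1",
                     "name": "fetch_weather", "arguments": {"city": "Seattle"}}],
    },
    {"role": "assistant", "content": [{"type": "text", "text": "It is 18 degrees C in Seattle."}]},
]

result = groundedness(
    query="What is the weather in Seattle?",
    response=response_without_file_search,
    tool_definitions=[{"name": "fetch_weather", "type": "function", "description": "Get the weather.", "parameters": {}}],
)
# result["groundedness_result"] == "pass"; result["groundedness_reason"] explains that
# supported tools were not called; result["groundedness"] is the not-applicable sentinel
# defined on the base evaluator (its value is not part of this diff).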
@@ -61,11 +61,17 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )

     @overload
     def __call__(
@@ -79,7 +79,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -89,6 +89,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )

@@ -73,11 +73,19 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, **kwargs):
+    def __init__(
+        self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
+    ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )

     @overload
     def __call__(
@@ -78,7 +78,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold: float = 3):  # pylint: disable=super-init-not-called
+    def __init__(self, model_config, *, threshold: float = 3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -88,6 +88,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
        )

@@ -75,7 +75,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -85,6 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )

@@ -69,11 +69,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )

     @overload
     def __call__(
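The six hunks above (IntentResolution, Relevance, ResponseCompleteness, Retrieval, Similarity, TaskAdherence) all add the same credential keyword and forward it to PromptyEvaluatorBase, matching the Groundedness change earlier; the actual credential handling lives in the base-class changes listed in the files-changed table (_base_prompty_eval.py). A hedged sketch of the new keyword, assuming a token credential is used in place of an API key (endpoint and deployment values are placeholders):

# Sketch only: credential is simply forwarded to the base class.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import IntentResolutionEvaluator, TaskAdherenceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<your-deployment>",                       # placeholder
}
credential = DefaultAzureCredential()

intent_resolution = IntentResolutionEvaluator(model_config=model_config, credential=credential)
task_adherence = TaskAdherenceEvaluator(model_config=model_config, credential=credential)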
@@ -1,11 +1,12 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from itertools import chain
 import math
 import os
 import logging
 import re
-from typing import Dict, List, Union, TypeVar, cast
+from typing import Dict, List, Union, TypeVar, Optional
 from typing_extensions import overload, override
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._exceptions import (
@@ -16,12 +17,46 @@ from azure.ai.evaluation._exceptions import (
 )
 from ..._common.utils import check_score_is_valid
 from azure.ai.evaluation._common._experimental import experimental
+from ..._converters._models import (
+    _BUILT_IN_DESCRIPTIONS,
+    _BUILT_IN_PARAMS,
+)

 logger = logging.getLogger(__name__)

 T_EvalValue = TypeVar("T_EvalValue")


+def _get_built_in_definition(tool_name: str):
+    """Get the definition for the built-in tool."""
+    if tool_name in _BUILT_IN_DESCRIPTIONS:
+        return {
+            "type": tool_name,
+            "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+            "name": tool_name,
+            "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+        }
+    return None
+
+
+def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
+    """Extract tool definitions needed for the given built-in tool calls."""
+    needed_definitions = []
+    for tool_call in tool_calls:
+        if isinstance(tool_call, dict):
+            tool_type = tool_call.get("type")
+
+            # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+            if tool_type == "tool_call":
+                tool_name = tool_call.get("name")
+                if tool_name in _BUILT_IN_DESCRIPTIONS:
+                    built_in_def = _get_built_in_definition(tool_name)
+                    if built_in_def and built_in_def not in needed_definitions:
+                        needed_definitions.append(built_in_def)
+
+    return needed_definitions
+
+
 @experimental
 class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
@@ -88,7 +123,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
@@ -96,6 +131,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            credential=credential,
             **kwargs,
         )

@@ -153,10 +189,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         # TODO add warning that only tool calls of type function are supported
         # Collect inputs
         tool_calls = kwargs.get("tool_calls")
-        tool_definitions = kwargs.get("tool_definitions")
+        tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
         query = kwargs.get("query")
         response = kwargs.get("response")
-
         # TODO : Support classes that represents tool calls, messages etc once client side definitions are available
         if response:
             parsed_tool_calls = self._parse_tools_from_response(response)
@@ -165,20 +200,23 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):

         if not tool_calls:
             return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
-        if not tool_definitions or len(tool_definitions) == 0:
-            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
         if not isinstance(tool_definitions, list):
-            tool_definitions = [tool_definitions]
+            tool_definitions = [tool_definitions] if tool_definitions else []

         try:
             needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
         except EvaluationException as e:
-            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+            # Check if this is because no tool definitions were provided at all
+            if len(tool_definitions) == 0:
+                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+            else:
+                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
         if len(needed_tool_definitions) == 0:
-            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}

         return {
             "query": query,
@@ -268,66 +306,72 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             "details": {},
         }

-    def _parse_tools_from_response(self, response):
-        """Parse the response to extract tool calls and results.
-        :param response: The response to parse.
-        :type response: Union[str, List[dict]]
-        :return: List of tool calls extracted from the response.
-        :rtype: List[dict]
-        """
-        tool_calls = []
-        tool_results_map = {}
-        if isinstance(response, list):
-            for message in response:
-                # Extract tool calls from assistant messages
-                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
-                    for content_item in message.get("content"):
-                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
-                            tool_calls.append(content_item)
-
-                # Extract tool results from tool messages
-                elif message.get("role") == "tool" and message.get("tool_call_id"):
-                    tool_call_id = message.get("tool_call_id")
-                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
-                        result_content = message.get("content")[0]
-                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
-                            tool_results_map[tool_call_id] = result_content
-
-        # Attach results to their corresponding calls
-        for tool_call in tool_calls:
-            tool_call_id = tool_call.get("tool_call_id")
-            if tool_call_id in tool_results_map:
-                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
-
-        return tool_calls
-
     def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
-        """Extract the tool definitions that are needed for the provided tool calls.
-        :param tool_calls: List of tool calls to evaluate.
-        :type tool_calls: List[dict]
-        :param tool_definitions: List of tool definitions to use for evaluation.
-        :type tool_definitions: List[dict]
-        :return: List of tool definitions that are needed for the provided tool calls.
-        :rtype: List[dict]
-        """
+        """Extract the tool definitions that are needed for the provided tool calls."""
         needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = _get_needed_built_in_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
         for tool_call in tool_calls:
-            if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
-                tool_name = tool_call.get("name")
-                tool_definition = [
-                    tool
-                    for tool in tool_definitions
-                    if tool.get("name") == tool_name and tool.get("type", "function") == "function"
-                ]
-                if len(tool_definition) > 0:
-                    needed_tool_definitions.extend(tool_definition)
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and tool_name in _BUILT_IN_DESCRIPTIONS:
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                        )
                 else:
+                    # Unsupported tool format - only converter format is supported
                     raise EvaluationException(
-                        message=f"Tool definition for {tool_name} not found",
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
                         blame=ErrorBlame.USER_ERROR,
                         category=ErrorCategory.INVALID_VALUE,
                         target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
                     )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                )
+
         return needed_tool_definitions

     @override
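The rewritten _extract_needed_tool_definitions also expands "openapi" tool definitions, which bundle several functions under one entry, before validating that every non-built-in call has a matching definition. A hedged sketch of a definition shape the expansion above would accept (names are hypothetical; tool_call_accuracy is the evaluator constructed in the previous sketch):

# Illustrative shapes only: the nested "functions" of an openapi tool are flattened by
# chain.from_iterable(...) so that a converter-format call to one of them validates.
openapi_tool = {
    "type": "openapi",
    "name": "issue_tracker",  # hypothetical
    "functions": [
        {"type": "function", "name": "list_issues", "description": "List open issues.", "parameters": {}},
        {"type": "function", "name": "create_issue", "description": "Create an issue.", "parameters": {}},
    ],
}

result = tool_call_accuracy(
    query="Which issues are still open?",
    tool_calls=[{"type": "tool_call", "tool_call_id": "call_1", "name": "list_issues", "arguments": {}}],
    tool_definitions=[openapi_tool],
)
# Calls that are not dictionaries, lack a name, or do not use the converter format now
# raise EvaluationException with messages that identify the specific problem, rather
# than the generic missing-definition error used in 1.10.0.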