azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
  3. azure/ai/evaluation/_aoai/label_grader.py +6 -10
  4. azure/ai/evaluation/_aoai/python_grader.py +7 -10
  5. azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
  6. azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +241 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -2
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
  33. azure/ai/evaluation/_evaluate/_utils.py +10 -3
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
  38. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  39. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
  40. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  41. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  42. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  43. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
  44. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  45. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  46. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  47. azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
  48. azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
  49. azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
  50. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
  52. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  53. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  55. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  56. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  57. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  58. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  59. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  60. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  61. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  62. azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
  63. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  64. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  65. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  66. azure/ai/evaluation/_exceptions.py +6 -1
  67. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  68. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  69. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  70. azure/ai/evaluation/_model_configurations.py +26 -0
  71. azure/ai/evaluation/_version.py +1 -1
  72. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  73. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  74. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  75. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  76. azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
  77. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  78. azure/ai/evaluation/red_team/_red_team.py +494 -37
  79. azure/ai/evaluation/red_team/_red_team_result.py +48 -28
  80. azure/ai/evaluation/red_team/_result_processor.py +558 -29
  81. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  82. azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
  83. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  84. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  85. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  86. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  87. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  88. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  90. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  91. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  92. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  94. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  95. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
  96. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
  97. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  98. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  99. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,7 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+
5
+ from ._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator, _TaskNavigationEfficiencyMatchingMode
6
+
7
+ __all__ = ["_TaskNavigationEfficiencyEvaluator", "_TaskNavigationEfficiencyMatchingMode"]
@@ -1,40 +1,73 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
- import json
4
+ from enum import Enum
5
5
  from collections import Counter
6
+ import json
6
7
  from typing import Dict, List, Union, Any, Tuple
7
8
  from typing_extensions import overload, override
8
9
 
9
- from azure.ai.evaluation._evaluators._common import EvaluatorBase
10
10
  from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
11
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
12
+ from azure.ai.evaluation._exceptions import (
13
+ ErrorCategory,
14
+ ErrorTarget,
15
+ EvaluationException,
16
+ )
17
+
18
+
19
+ class _TaskNavigationEfficiencyMatchingMode(str, Enum):
20
+ """
21
+ Enumeration of task navigation efficiency matching mode.
22
+
23
+ This enum allows you to specify which single matching technique should be used when evaluating
24
+ the efficiency of an agent's tool calls sequence against a ground truth path.
25
+ """
26
+
27
+ EXACT_MATCH = "exact_match"
28
+ """
29
+ Binary metric indicating whether the agent's tool calls exactly match the ground truth.
30
+
31
+ Returns True only if the agent's tool calls sequence is identical to the expected sequence
32
+ in both order and content (no extra steps, no missing steps, correct order).
33
+ """
34
+
35
+ IN_ORDER_MATCH = "in_order_match"
36
+ """
37
+ Binary metric allowing extra steps but requiring correct order of required tool calls.
38
+
39
+ Returns True if all ground truth steps appear in the agent's sequence in the correct
40
+ order, even if there are additional steps interspersed.
41
+ """
42
+
43
+ ANY_ORDER_MATCH = "any_order_match"
44
+ """
45
+ Binary metric allowing both extra steps and different ordering.
46
+
47
+ Returns True if all ground truth steps appear in the agent's sequence with sufficient
48
+ frequency, regardless of order. Most lenient matching criterion.
49
+ """
11
50
 
12
51
 
13
- class PathEfficiencyEvaluator(EvaluatorBase):
52
+ class _TaskNavigationEfficiencyEvaluator(EvaluatorBase):
14
53
  """
15
54
  Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.
16
55
 
17
- The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison
18
- between the agent's tool usage trajectory and the ground truth expected steps. It also provides
19
- three binary match metrics: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
56
+ The Task Navigation Efficiency Evaluator returns binary matching results between the agent's tool usage trajectory and the ground truth expected steps.
57
+ It has three matching techniques: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
58
+ It also returns precision, recall, and F1 scores in properties bag.
20
59
 
21
- :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
22
- :type precision_threshold: float
23
- :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
24
- :type recall_threshold: float
25
- :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
26
- :type f1_score_threshold: float
60
+ :param matching_mode: The matching mode to use. Default is "exact_match".
61
+ :type matching_mode: enum[str, _TaskNavigationEfficiencyMatchingMode]
27
62
 
28
63
  .. admonition:: Example:
29
64
 
30
65
  .. code-block:: python
31
66
 
32
- from azure.ai.evaluation import PathEfficiencyEvaluator
67
+ from azure.ai.evaluation._evaluators._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator
33
68
 
34
- path_efficiency_eval = PathEfficiencyEvaluator(
35
- precision_threshold=0.7,
36
- recall_threshold=0.8,
37
- f1_score_threshold=0.75
69
+ task_navigation_efficiency_eval = _TaskNavigationEfficiencyEvaluator(
70
+ matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
38
71
  )
39
72
 
40
73
  # Example 1: Using simple tool names list
@@ -64,36 +97,39 @@ class PathEfficiencyEvaluator(EvaluatorBase):
64
97
  )
65
98
  """
66
99
 
67
- _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5
68
-
69
- id = "azureai://built-in/evaluators/path_efficiency"
100
+ id = "azureai://built-in/evaluators/task_navigation_efficiency"
70
101
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
71
102
 
103
+ matching_mode: _TaskNavigationEfficiencyMatchingMode
104
+ """The matching mode to use."""
105
+
72
106
  @override
73
107
  def __init__(
74
108
  self,
75
109
  *,
76
- precision_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
77
- recall_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
78
- f1_score_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
110
+ matching_mode: Union[
111
+ str, _TaskNavigationEfficiencyMatchingMode
112
+ ] = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH,
79
113
  ):
80
- self._higher_is_better = True
81
- super().__init__()
114
+ # Type checking for the matching_mode parameter
115
+ if isinstance(matching_mode, str):
116
+ try:
117
+ self.matching_mode = _TaskNavigationEfficiencyMatchingMode(matching_mode)
118
+ except ValueError:
119
+ raise ValueError(
120
+ f"matching_mode must be one of {[m.value for m in _TaskNavigationEfficiencyMatchingMode]}, got '{matching_mode}'"
121
+ )
122
+ elif isinstance(matching_mode, _TaskNavigationEfficiencyMatchingMode):
123
+ self.matching_mode = matching_mode
124
+ else:
125
+ raise EvaluationException(
126
+ f"matching_mode must be a string with one of {[m.value for m in _TaskNavigationEfficiencyMatchingMode]} or _TaskNavigationEfficiencyMatchingMode enum, got {type(matching_mode)}",
127
+ internal_message=str(matching_mode),
128
+ target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
129
+ category=ErrorCategory.INVALID_VALUE,
130
+ )
82
131
 
83
- # Type checking for threshold parameters
84
- for name, value in [
85
- ("precision_threshold", precision_threshold),
86
- ("recall_threshold", recall_threshold),
87
- ("f1_score_threshold", f1_score_threshold),
88
- ]:
89
- if not isinstance(value, float):
90
- raise TypeError(f"{name} must be a float, got {type(value)}")
91
-
92
- self._threshold = {
93
- "path_efficiency_precision": precision_threshold,
94
- "path_efficiency_recall": recall_threshold,
95
- "path_efficiency_f1": f1_score_threshold,
96
- }
132
+ super().__init__()
97
133
 
98
134
  def _prepare_steps_for_comparison(
99
135
  self,
@@ -192,14 +228,20 @@ class PathEfficiencyEvaluator(EvaluatorBase):
192
228
  # Check if agent has at least as many occurrences of each ground truth step
193
229
  return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)
194
230
 
231
+ _TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = {
232
+ _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH: _calculate_exact_match,
233
+ _TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH: _calculate_in_order_match,
234
+ _TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH: _calculate_any_order_match,
235
+ }
236
+
195
237
  @override
196
- async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
238
+ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[str, float]]]:
197
239
  """Produce a task navigation efficiency evaluation result.
198
240
 
199
241
  :param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
200
242
  :type eval_input: Dict
201
243
  :return: The evaluation result.
202
- :rtype: Dict[str, Union[float, str]]
244
+ :rtype: Dict[str, Union[float, str, Dict[str, float]]]
203
245
  """
204
246
  response = eval_input["response"]
205
247
  ground_truth = eval_input["ground_truth"]
@@ -244,12 +286,10 @@ class PathEfficiencyEvaluator(EvaluatorBase):
244
286
  ground_truth_names = [name.strip() for name in tool_names_list]
245
287
  ground_truth_params_dict = params_dict
246
288
  use_parameter_matching = True
247
-
248
289
  elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth):
249
290
  # List format: just tool names
250
291
  ground_truth_names = [step.strip() for step in ground_truth]
251
292
  use_parameter_matching = False
252
-
253
293
  else:
254
294
  raise TypeError(
255
295
  "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
@@ -267,42 +307,44 @@ class PathEfficiencyEvaluator(EvaluatorBase):
267
307
  )
268
308
 
269
309
  # Calculate precision, recall, and F1 scores
270
- metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
271
-
272
- # Calculate binary match metrics
273
- exact_match = self._calculate_exact_match(agent_steps, ground_truth_steps)
274
- in_order_match = self._calculate_in_order_match(agent_steps, ground_truth_steps)
275
- any_order_match = self._calculate_any_order_match(agent_steps, ground_truth_steps)
310
+ additional_properties_metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
276
311
 
277
312
  # Convert metrics to floats, using nan for None or non-convertible values
278
- path_efficiency_precision = (
279
- float(metrics["precision_score"]) if metrics["precision_score"] is not None else float("nan")
280
- )
281
- path_efficiency_recall = float(metrics["recall_score"]) if metrics["recall_score"] is not None else float("nan")
282
- path_efficiency_f1_score = float(metrics["f1_score"]) if metrics["f1_score"] is not None else float("nan")
313
+ for metric, score in additional_properties_metrics.items():
314
+ additional_properties_metrics[metric] = float(score) if score is not None else float("nan")
283
315
 
284
- return {
285
- "path_efficiency_precision_score": path_efficiency_precision,
286
- "path_efficiency_recall_score": path_efficiency_recall,
287
- "path_efficiency_f1_score": path_efficiency_f1_score,
288
- "path_efficiency_exact_match_result": EVALUATION_PASS_FAIL_MAPPING[exact_match],
289
- "path_efficiency_in_order_match_result": EVALUATION_PASS_FAIL_MAPPING[in_order_match],
290
- "path_efficiency_any_order_match_result": EVALUATION_PASS_FAIL_MAPPING[any_order_match],
291
- }
316
+ if self.matching_mode in self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS:
317
+ # Calculate binary match metrics
318
+ match_result = self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS[self.matching_mode](
319
+ self, agent_steps, ground_truth_steps
320
+ )
321
+
322
+ return {
323
+ "task_navigation_efficiency_label": match_result,
324
+ "task_navigation_efficiency_result": EVALUATION_PASS_FAIL_MAPPING[match_result],
325
+ "task_navigation_efficiency_details": additional_properties_metrics,
326
+ }
327
+ else:
328
+ raise EvaluationException(
329
+ f"Unsupported matching_mode '{self.matching_mode}'",
330
+ internal_message=str(self.matching_mode),
331
+ target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
332
+ category=ErrorCategory.INVALID_VALUE,
333
+ )
292
334
 
293
335
  @overload
294
336
  def __call__( # type: ignore
295
337
  self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
296
- ) -> Dict[str, Union[float, str]]:
338
+ ) -> Dict[str, Union[float, str, Dict[str, float]]]:
297
339
  """
298
- Evaluate the path efficiency of an agent's action sequence.
340
+ Evaluate the task navigation efficiency of an agent's action sequence.
299
341
 
300
342
  :keyword response: The agent's response containing tool calls.
301
343
  :paramtype response: Union[str, List[Dict[str, Any]]]
302
344
  :keyword ground_truth: List of expected tool/action steps.
303
345
  :paramtype ground_truth: List[str]
304
- :return: The path efficiency scores and results.
305
- :rtype: Dict[str, Union[float, str]]
346
+ :return: The task navigation efficiency scores and results.
347
+ :rtype: Dict[str, Union[float, str, Dict[str, float]]]
306
348
  """
307
349
 
308
350
  @overload
@@ -311,16 +353,16 @@ class PathEfficiencyEvaluator(EvaluatorBase):
311
353
  *,
312
354
  response: Union[str, List[Dict[str, Any]]],
313
355
  ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
314
- ) -> Dict[str, Union[float, str]]:
356
+ ) -> Dict[str, Union[float, str, Dict[str, float]]]:
315
357
  """
316
- Evaluate the path efficiency of an agent's action sequence with tool parameters.
358
+ Evaluate the task navigation efficiency of an agent's action sequence with tool parameters.
317
359
 
318
360
  :keyword response: The agent's response containing tool calls.
319
361
  :paramtype response: Union[str, List[Dict[str, Any]]]
320
362
  :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
321
363
  :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
322
- :return: The path efficiency scores and results.
323
- :rtype: Dict[str, Union[float, str]]
364
+ :return: The task navigation efficiency scores and results.
365
+ :rtype: Dict[str, Union[float, str, Dict[str, float]]]
324
366
  """
325
367
 
326
368
  @override
@@ -330,13 +372,13 @@ class PathEfficiencyEvaluator(EvaluatorBase):
330
372
  **kwargs,
331
373
  ):
332
374
  """
333
- Evaluate path efficiency.
375
+ Evaluate task navigation efficiency.
334
376
 
335
377
  :keyword response: The agent's response containing tool calls.
336
378
  :paramtype response: Union[str, List[Dict[str, Any]]]
337
379
  :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
338
380
  :paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
339
- :return: The path efficiency scores and results.
340
- :rtype: Dict[str, Union[float, str]]
381
+ :return: The task navigation efficiency scores and results.
382
+ :rtype: Dict[str, Union[float, str, Dict[str, float]]]
341
383
  """
342
384
  return super().__call__(*args, **kwargs)
@@ -1,7 +1,6 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
- from itertools import chain
5
4
  import math
6
5
  import os
7
6
  import logging
@@ -17,46 +16,12 @@ from azure.ai.evaluation._exceptions import (
17
16
  )
18
17
  from ..._common.utils import check_score_is_valid
19
18
  from azure.ai.evaluation._common._experimental import experimental
20
- from ..._converters._models import (
21
- _BUILT_IN_DESCRIPTIONS,
22
- _BUILT_IN_PARAMS,
23
- )
24
19
 
25
20
  logger = logging.getLogger(__name__)
26
21
 
27
22
  T_EvalValue = TypeVar("T_EvalValue")
28
23
 
29
24
 
30
- def _get_built_in_definition(tool_name: str):
31
- """Get the definition for the built-in tool."""
32
- if tool_name in _BUILT_IN_DESCRIPTIONS:
33
- return {
34
- "type": tool_name,
35
- "description": _BUILT_IN_DESCRIPTIONS[tool_name],
36
- "name": tool_name,
37
- "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
38
- }
39
- return None
40
-
41
-
42
- def _get_needed_built_in_definitions(tool_calls: List[Dict]) -> List[Dict]:
43
- """Extract tool definitions needed for the given built-in tool calls."""
44
- needed_definitions = []
45
- for tool_call in tool_calls:
46
- if isinstance(tool_call, dict):
47
- tool_type = tool_call.get("type")
48
-
49
- # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
50
- if tool_type == "tool_call":
51
- tool_name = tool_call.get("name")
52
- if tool_name in _BUILT_IN_DESCRIPTIONS:
53
- built_in_def = _get_built_in_definition(tool_name)
54
- if built_in_def and built_in_def not in needed_definitions:
55
- needed_definitions.append(built_in_def)
56
-
57
- return needed_definitions
58
-
59
-
60
25
  @experimental
61
26
  class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
62
27
  """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
@@ -100,9 +65,12 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
100
65
 
101
66
  .. note::
102
67
 
68
+ The output field "details" has been renamed to "tool_call_accuracy_details" for clarity.
69
+
103
70
  To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
104
71
  To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
105
72
  however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
73
+
106
74
  """
107
75
 
108
76
  _PROMPTY_FILE = "tool_call_accuracy.prompty"
@@ -132,6 +100,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
132
100
  prompty_file=prompty_path,
133
101
  result_key=self._RESULT_KEY,
134
102
  credential=credential,
103
+ threshold=threshold,
135
104
  **kwargs,
136
105
  )
137
106
 
@@ -207,7 +176,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
207
176
  tool_definitions = [tool_definitions] if tool_definitions else []
208
177
 
209
178
  try:
210
- needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
179
+ needed_tool_definitions = self._extract_needed_tool_definitions(
180
+ tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
181
+ )
211
182
  except EvaluationException as e:
212
183
  # Check if this is because no tool definitions were provided at all
213
184
  if len(tool_definitions) == 0:
@@ -235,8 +206,8 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
235
206
  :rtype: Dict
236
207
  """
237
208
  # Single LLM call for all tool calls
238
- llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
239
-
209
+ prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
210
+ llm_output = prompty_output_dict.get("llm_output", {})
240
211
  if isinstance(llm_output, dict):
241
212
  score = llm_output.get(self._LLM_SCORE_KEY, None)
242
213
  if not score or not check_score_is_valid(
@@ -248,6 +219,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
248
219
  message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
249
220
  internal_message="Invalid score value.",
250
221
  category=ErrorCategory.FAILED_EXECUTION,
222
+ target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
251
223
  blame=ErrorBlame.SYSTEM_ERROR,
252
224
  )
253
225
 
@@ -257,10 +229,18 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
257
229
  score_result = "pass" if score >= self.threshold else "fail"
258
230
  response_dict = {
259
231
  self._result_key: score,
232
+ f"gpt_{self._result_key}": score,
260
233
  f"{self._result_key}_result": score_result,
261
- f"{self._result_key}_threshold": self.threshold,
234
+ f"{self._result_key}_threshold": self._threshold,
262
235
  f"{self._result_key}_reason": reason,
263
- "details": llm_output.get("details", {}),
236
+ f"{self._result_key}_details": llm_output.get("details", {}),
237
+ f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
238
+ f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
239
+ f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
240
+ f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
241
+ f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
242
+ f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
243
+ f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
264
244
  }
265
245
  return response_dict
266
246
 
@@ -275,105 +255,21 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
275
255
  async def _real_call(self, **kwargs):
276
256
  """The asynchronous call where real end-to-end evaluation logic is performed.
277
257
 
278
- :keyword kwargs: The inputs to evaluate.
258
+ :keyword kwargs: The inputs to evaluate
279
259
  :type kwargs: Dict
280
- :return: The evaluation result.
260
+ :return: The evaluation result
281
261
  :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
282
262
  """
283
263
  # Convert inputs into list of evaluable inputs.
284
264
  eval_input = self._convert_kwargs_to_eval_input(**kwargs)
285
265
  if isinstance(eval_input, dict) and eval_input.get("error_message"):
286
266
  # If there is an error message, return not applicable result
287
- return self._not_applicable_result(eval_input.get("error_message"))
267
+ return self._not_applicable_result(eval_input.get("error_message"), self.threshold)
288
268
  # Do the evaluation
289
269
  result = await self._do_eval(eval_input)
290
270
  # Return the result
291
271
  return result
292
272
 
293
- def _not_applicable_result(self, error_message):
294
- """Return a result indicating that the tool call is not applicable for evaluation.
295
- :param eval_input: The input to the evaluator.
296
- :type eval_input: Dict
297
- :return: A dictionary containing the result of the evaluation.
298
- :rtype: Dict[str, Union[str, float]]
299
- """
300
- # If no tool calls were made or tool call type is not supported, return not applicable result
301
- return {
302
- self._result_key: self._NOT_APPLICABLE_RESULT,
303
- f"{self._result_key}_result": "pass",
304
- f"{self._result_key}_threshold": self.threshold,
305
- f"{self._result_key}_reason": error_message,
306
- "details": {},
307
- }
308
-
309
- def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
310
- """Extract the tool definitions that are needed for the provided tool calls."""
311
- needed_tool_definitions = []
312
-
313
- # Add all user-provided tool definitions
314
- needed_tool_definitions.extend(tool_definitions)
315
-
316
- # Add the needed built-in tool definitions (if they are called)
317
- built_in_definitions = _get_needed_built_in_definitions(tool_calls)
318
- needed_tool_definitions.extend(built_in_definitions)
319
-
320
- # OpenAPI tool is a collection of functions, so we need to expand it
321
- tool_definitions_expanded = list(
322
- chain.from_iterable(
323
- tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
324
- for tool in needed_tool_definitions
325
- )
326
- )
327
-
328
- # Validate that all tool calls have corresponding definitions
329
- for tool_call in tool_calls:
330
- if isinstance(tool_call, dict):
331
- tool_type = tool_call.get("type")
332
-
333
- if tool_type == "tool_call":
334
- tool_name = tool_call.get("name")
335
- if tool_name and tool_name in _BUILT_IN_DESCRIPTIONS:
336
- # This is a built-in tool from converter, already handled above
337
- continue
338
- elif tool_name:
339
- # This is a regular function tool from converter
340
- tool_definition_exists = any(
341
- tool.get("name") == tool_name and tool.get("type", "function") == "function"
342
- for tool in tool_definitions_expanded
343
- )
344
- if not tool_definition_exists:
345
- raise EvaluationException(
346
- message=f"Tool definition for {tool_name} not found",
347
- blame=ErrorBlame.USER_ERROR,
348
- category=ErrorCategory.INVALID_VALUE,
349
- target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
350
- )
351
- else:
352
- raise EvaluationException(
353
- message=f"Tool call missing name: {tool_call}",
354
- blame=ErrorBlame.USER_ERROR,
355
- category=ErrorCategory.INVALID_VALUE,
356
- target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
357
- )
358
- else:
359
- # Unsupported tool format - only converter format is supported
360
- raise EvaluationException(
361
- message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
362
- blame=ErrorBlame.USER_ERROR,
363
- category=ErrorCategory.INVALID_VALUE,
364
- target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
365
- )
366
- else:
367
- # Tool call is not a dictionary
368
- raise EvaluationException(
369
- message=f"Tool call is not a dictionary: {tool_call}",
370
- blame=ErrorBlame.USER_ERROR,
371
- category=ErrorCategory.INVALID_VALUE,
372
- target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
373
- )
374
-
375
- return needed_tool_definitions
376
-
377
273
  @override
378
274
  def __call__( # pylint: disable=docstring-missing-param
379
275
  self,
@@ -0,0 +1,9 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+
5
+ from ._tool_input_accuracy import _ToolInputAccuracyEvaluator
6
+
7
+ __all__ = [
8
+ "_ToolInputAccuracyEvaluator",
9
+ ]