azure-ai-evaluation 1.11.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (35)
  1. azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
  2. azure/ai/evaluation/_aoai/label_grader.py +8 -3
  3. azure/ai/evaluation/_aoai/python_grader.py +8 -3
  4. azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
  5. azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
  7. azure/ai/evaluation/_eval_mapping.py +2 -0
  8. azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
  9. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +252 -48
  10. azure/ai/evaluation/_evaluate/_utils.py +7 -3
  11. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  12. azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
  13. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  14. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
  15. azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
  16. azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
  17. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
  18. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  19. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  20. azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
  21. azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
  22. azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
  23. azure/ai/evaluation/_exceptions.py +1 -0
  24. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
  25. azure/ai/evaluation/_version.py +1 -1
  26. azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
  27. azure/ai/evaluation/red_team/_red_team.py +9 -0
  28. azure/ai/evaluation/red_team/_red_team_result.py +230 -1
  29. azure/ai/evaluation/red_team/_result_processor.py +416 -23
  30. azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
  31. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +13 -3
  32. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
  33. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
  34. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
  35. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py
@@ -0,0 +1,342 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import json
+ from collections import Counter
+ from typing import Dict, List, Union, Any, Tuple
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+
+
+ class PathEfficiencyEvaluator(EvaluatorBase):
+     """
+     Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.
+
+     The Path Efficiency Evaluator calculates precision, recall, and F1 scores based on the comparison
+     between the agent's tool usage trajectory and the ground truth expected steps. It also provides
+     three binary match metrics: exact match, in-order match (allows extra steps), and any-order match (allows extra steps and ignores order).
+
+     :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
+     :type precision_threshold: float
+     :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
+     :type recall_threshold: float
+     :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
+     :type f1_score_threshold: float
+
+     .. admonition:: Example:
+
+         .. code-block:: python
+
+             from azure.ai.evaluation import PathEfficiencyEvaluator
+
+             path_efficiency_eval = PathEfficiencyEvaluator(
+                 precision_threshold=0.7,
+                 recall_threshold=0.8,
+                 f1_score_threshold=0.75
+             )
+
+             # Example 1: Using simple tool names list
+             result = path_efficiency_eval(
+                 response=[
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "identify_tools_to_call", "arguments": {}}]},
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "call_tool_A", "arguments": {}}]},
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}}]},
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}}]}
+                 ],
+                 ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
+             )
+
+             # Example 2: Using tool names with parameters (exact parameter matching required)
+             result = path_efficiency_eval(
+                 response=[
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {"query": "weather", "location": "NYC"}}]},
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "format_result", "arguments": {"format": "json"}}]}
+                 ],
+                 ground_truth=(
+                     ["search", "format_result"],
+                     {
+                         "search": {"query": "weather", "location": "NYC"},
+                         "format_result": {"format": "json"}
+                     }
+                 )
+             )
+     """
+
+     _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD = 0.5
+
+     id = "azureai://built-in/evaluators/path_efficiency"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(
+         self,
+         *,
+         precision_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
+         recall_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
+         f1_score_threshold: float = _DEFAULT_PATH_EFFICIENCY_SCORE_THRESHOLD,
+     ):
+         self._higher_is_better = True
+         super().__init__()
+
+         # Type checking for threshold parameters
+         for name, value in [
+             ("precision_threshold", precision_threshold),
+             ("recall_threshold", recall_threshold),
+             ("f1_score_threshold", f1_score_threshold),
+         ]:
+             if not isinstance(value, float):
+                 raise TypeError(f"{name} must be a float, got {type(value)}")
+
+         self._threshold = {
+             "path_efficiency_precision": precision_threshold,
+             "path_efficiency_recall": recall_threshold,
+             "path_efficiency_f1": f1_score_threshold,
+         }
+
+     def _prepare_steps_for_comparison(
+         self,
+         agent_tool_pairs: List[Tuple[str, Dict[str, Any]]],
+         ground_truth: List[str],
+         ground_truth_params: Dict[str, Dict[str, Any]],
+         use_parameter_matching: bool,
+     ) -> Tuple[
+         List[Union[str, Tuple[str, Tuple]]],
+         List[Union[str, Tuple[str, Tuple]]],
+     ]:
+         """Prepare agent and ground truth steps for comparison based on parameter matching mode."""
+         agent_steps: List[Union[str, Tuple[str, Tuple]]] = []
+         ground_truth_steps: List[Union[str, Tuple[str, Tuple]]] = []
+         if use_parameter_matching:
+             # When parameter matching is enabled, we need to match both tool name and parameters
+             agent_steps = [(pair[0], tuple(sorted(pair[1].items()))) for pair in agent_tool_pairs]
+             ground_truth_steps = [
+                 (name, tuple(sorted(ground_truth_params.get(name, {}).items()))) for name in ground_truth
+             ]
+         else:
+             # When parameter matching is disabled, only compare tool names
+             agent_steps = [name for name, _ in agent_tool_pairs]
+             ground_truth_steps = [step for step in ground_truth]
+
+         return agent_steps, ground_truth_steps
+
+     def _calculate_precision_recall_f1_scores(self, agent_steps: List, ground_truth_steps: List) -> Dict[str, float]:
+         """Calculate precision, recall, and F1 scores."""
+         if not agent_steps:
+             return {"precision_score": 0.0, "recall_score": 0.0, "f1_score": 0.0}
+
+         # Count occurrences of each step in both lists to handle duplicates
+         agent_steps_counts = Counter(agent_steps)
+         ground_truth_counts = Counter(ground_truth_steps)
+
+         # Calculate true positives by taking the minimum count for each common element
+         # For each step, count the intersection (min count) of agent and ground truth steps
+         true_positives = sum(
+             min(agent_steps_counts[step], ground_truth_counts[step])
+             for step in agent_steps_counts
+             if step in ground_truth_counts
+         )
+
+         # Calculate false positives (agent steps not in ground truth or excess occurrences)
+         # For each step, count the excess occurrences of agent steps not in (minus) ground truth
+         # or zero (agent steps minus agent steps) if agent steps is less than ground truth
+         false_positives = sum(
+             agent_steps_counts[step] - min(agent_steps_counts[step], ground_truth_counts.get(step, 0))
+             for step in agent_steps_counts
+         )
+
+         # Calculate false negatives (ground truth steps not in agent or missing occurrences)
+         # For each step, count the excess occurrences of ground truth steps not in (minus) agent steps
+         # or zero (ground truth steps minus ground truth steps) if ground truth steps is less than agent steps
+         false_negatives = sum(
+             ground_truth_counts[step] - min(ground_truth_counts[step], agent_steps_counts.get(step, 0))
+             for step in ground_truth_counts
+         )
+
+         # Calculate precision, recall, F1
+         precision = (
+             true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
+         )
+         recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
+         f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+
+         return {
+             "precision_score": precision,
+             "recall_score": recall,
+             "f1_score": f1_score,
+         }
+
+     def _calculate_exact_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
+         """Check if agent steps exactly match ground truth (order and content)."""
+         return agent_steps == ground_truth_steps
+
+     def _calculate_in_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
+         """Check if all ground truth steps appear in agent steps in correct order (extra steps allowed)."""
+         if not ground_truth_steps:
+             return True
+
+         gt_index = 0
+         for step in agent_steps:
+             if gt_index < len(ground_truth_steps) and step == ground_truth_steps[gt_index]:
+                 gt_index += 1
+
+         return gt_index == len(ground_truth_steps)
+
+     def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
+         """Check if all ground truth steps appear in agent steps with sufficient frequency (any order, extra steps allowed)."""
+         # Count occurrences of each step in both lists to handle duplicates
+         agent_counts = Counter(agent_steps)
+         ground_truth_counts = Counter(ground_truth_steps)
+
+         # Check if agent has at least as many occurrences of each ground truth step
+         return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+         """Produce a path efficiency evaluation result.
+
+         :param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict[str, Union[float, str]]
+         """
+         response = eval_input["response"]
+         ground_truth = eval_input["ground_truth"]
+
+         # Value and type checking for ground truth steps
+         if not ground_truth:
+             raise ValueError("ground_truth cannot be empty")
+
+         # Check if ground_truth is a tuple (tool names + parameters) or list (tool names only)
+         use_parameter_matching = False
+         ground_truth_names = []
+         ground_truth_params_dict: Dict[str, Dict[str, Any]] = {}
+
+         if isinstance(ground_truth, tuple) and len(ground_truth) == 2:
+             # Tuple format: (tool_names, parameters_dict)
+             tool_names_list, params_dict = ground_truth
+
+             if not isinstance(tool_names_list, list) or not all(isinstance(name, str) for name in tool_names_list):
+                 raise TypeError("ground_truth tuple first element must be a list of strings (tool names)")
+
+             if not isinstance(params_dict, dict):
+                 raise TypeError(
+                     "ground_truth tuple second element must be a dictionary mapping tool names to parameters"
+                 )
+
+             # Validate that all values in params_dict are dictionaries with string keys and values
+             for tool_name, params in params_dict.items():
+                 if not isinstance(tool_name, str):
+                     raise TypeError("ground_truth parameters dictionary keys must be strings (tool names)")
+                 if not isinstance(params, dict):
+                     raise TypeError(f"ground_truth parameters for tool '{tool_name}' must be a dictionary")
+                 for k, v in params.items():
+                     if not isinstance(k, str):
+                         raise TypeError(f"ground_truth parameters for tool '{tool_name}' must have string keys")
+                     try:
+                         json.dumps(v)
+                     except (TypeError, ValueError):
+                         raise TypeError(
+                             f"ground_truth parameters for tool '{tool_name}' must have JSON-serializable values (got type {type(v)} for key '{k}')"
+                         )
+
+             ground_truth_names = [name.strip() for name in tool_names_list]
+             ground_truth_params_dict = params_dict
+             use_parameter_matching = True
+
+         elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth):
+             # List format: just tool names
+             ground_truth_names = [step.strip() for step in ground_truth]
+             use_parameter_matching = False
+
+         else:
+             raise TypeError(
+                 "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
+             )
+
+         # Extract tool information from the response
+         agent_tool_pairs = self._extract_tool_names_and_params_from_response(response)
+
+         # Prepare steps for comparison
+         agent_steps, ground_truth_steps = self._prepare_steps_for_comparison(
+             agent_tool_pairs,
+             ground_truth_names,
+             ground_truth_params_dict,
+             use_parameter_matching,
+         )
+
+         # Calculate precision, recall, and F1 scores
+         metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
+
+         # Calculate binary match metrics
+         exact_match = self._calculate_exact_match(agent_steps, ground_truth_steps)
+         in_order_match = self._calculate_in_order_match(agent_steps, ground_truth_steps)
+         any_order_match = self._calculate_any_order_match(agent_steps, ground_truth_steps)
+
+         # Convert metrics to floats, using nan for None or non-convertible values
+         path_efficiency_precision = (
+             float(metrics["precision_score"]) if metrics["precision_score"] is not None else float("nan")
+         )
+         path_efficiency_recall = float(metrics["recall_score"]) if metrics["recall_score"] is not None else float("nan")
+         path_efficiency_f1_score = float(metrics["f1_score"]) if metrics["f1_score"] is not None else float("nan")
+
+         return {
+             "path_efficiency_precision_score": path_efficiency_precision,
+             "path_efficiency_recall_score": path_efficiency_recall,
+             "path_efficiency_f1_score": path_efficiency_f1_score,
+             "path_efficiency_exact_match_result": EVALUATION_PASS_FAIL_MAPPING[exact_match],
+             "path_efficiency_in_order_match_result": EVALUATION_PASS_FAIL_MAPPING[in_order_match],
+             "path_efficiency_any_order_match_result": EVALUATION_PASS_FAIL_MAPPING[any_order_match],
+         }
+
+     @overload
+     def __call__( # type: ignore
+         self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
+     ) -> Dict[str, Union[float, str]]:
+         """
+         Evaluate the path efficiency of an agent's action sequence.
+
+         :keyword response: The agent's response containing tool calls.
+         :paramtype response: Union[str, List[Dict[str, Any]]]
+         :keyword ground_truth: List of expected tool/action steps.
+         :paramtype ground_truth: List[str]
+         :return: The path efficiency scores and results.
+         :rtype: Dict[str, Union[float, str]]
+         """
+
+     @overload
+     def __call__( # type: ignore
+         self,
+         *,
+         response: Union[str, List[Dict[str, Any]]],
+         ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
+     ) -> Dict[str, Union[float, str]]:
+         """
+         Evaluate the path efficiency of an agent's action sequence with tool parameters.
+
+         :keyword response: The agent's response containing tool calls.
+         :paramtype response: Union[str, List[Dict[str, Any]]]
+         :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
+         :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
+         :return: The path efficiency scores and results.
+         :rtype: Dict[str, Union[float, str]]
+         """
+
+     @override
+     def __call__(
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Evaluate path efficiency.
+
+         :keyword response: The agent's response containing tool calls.
+         :paramtype response: Union[str, List[Dict[str, Any]]]
+         :keyword ground_truth: List of expected tool/action steps or tuple of (tool names, parameters dict).
+         :paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
+         :return: The path efficiency scores and results.
+         :rtype: Dict[str, Union[float, str]]
+         """
+         return super().__call__(*args, **kwargs)
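
To make the scoring above concrete, here is a small worked example of the same Counter-based true-positive/false-positive/false-negative computation used by _calculate_precision_recall_f1_scores. It is a standalone sketch with made-up step names, not code from the package:

from collections import Counter

# Hypothetical agent trajectory and ground truth; the agent calls "search" twice
# and performs an extra "send_email" step that is not expected.
agent_steps = ["search", "search", "format_result", "send_email"]
ground_truth_steps = ["search", "format_result"]

agent_counts = Counter(agent_steps)
gt_counts = Counter(ground_truth_steps)

# True positives: per-step overlap, capped at the smaller count.
tp = sum(min(agent_counts[s], gt_counts[s]) for s in agent_counts if s in gt_counts)
# False positives: agent steps with no matching ground-truth occurrence.
fp = sum(agent_counts[s] - min(agent_counts[s], gt_counts.get(s, 0)) for s in agent_counts)
# False negatives: ground-truth steps the agent never performed (or not often enough).
fn = sum(gt_counts[s] - min(gt_counts[s], agent_counts.get(s, 0)) for s in gt_counts)

precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0  # 2 / 4 = 0.5
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0      # 2 / 2 = 1.0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0  # ~0.667

print(precision, recall, f1)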

azure/ai/evaluation/_evaluators/_relevance/_relevance.py
@@ -35,6 +35,11 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
          ~azure.ai.evaluation.OpenAIModelConfiguration]
      :param threshold: The threshold for the relevance evaluator. Default is 3.
      :type threshold: int
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool

      .. admonition:: Example:

@@ -79,7 +84,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config, *, credential=None, threshold=3):
+     def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
          self._threshold = threshold
@@ -91,6 +96,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
              threshold=threshold,
              credential=credential,
              _higher_is_better=self._higher_is_better,
+             **kwargs,
          )

      @overload

azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py
@@ -33,6 +33,11 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          ~azure.ai.evaluation.OpenAIModelConfiguration]
      :param threshold: The threshold for the evaluation. Default is 3.
      :type threshold: float
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool
      :return: A function that evaluates and generates metrics for "chat" scenario.
      :rtype: Callable

@@ -78,7 +83,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config, *, threshold: float = 3, credential=None):
+     def __init__(self, model_config, *, threshold: float = 3, credential=None, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
          self._threshold = threshold
@@ -90,6 +95,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
              threshold=threshold,
              credential=credential,
              _higher_is_better=self._higher_is_better,
+             **kwargs,
          )

      @overload

azure/ai/evaluation/_evaluators/_similarity/_similarity.py
@@ -30,6 +30,11 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
          ~azure.ai.evaluation.OpenAIModelConfiguration]
      :param threshold: The threshold for the similarity evaluator. Default is 3.
      :type threshold: int
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool

      .. admonition:: Example:

@@ -75,7 +80,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config, *, threshold=3, credential=None):
+     def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
          self._threshold = threshold
@@ -87,6 +92,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
              threshold=threshold,
              credential=credential,
              _higher_is_better=self._higher_is_better,
+             **kwargs,
          )

      # Ignoring a mypy error about having only 1 overload function.
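
The **kwargs plumbed through RelevanceEvaluator, RetrievalEvaluator, and SimilarityEvaluator above is what carries the new is_reasoning_model flag to the prompty-based base class. A minimal usage sketch, assuming an Azure OpenAI reasoning deployment; the endpoint, deployment name, and API version below are placeholders, not values from this package:

from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator

# Placeholder configuration values; substitute your own resource details.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="o3-mini",
    api_version="2024-12-01-preview",
)

# is_reasoning_model is forwarded via **kwargs; per the docstrings above it switches the
# evaluator to reasoning-model parameters (e.g. max_completion_tokens) and drops unsupported ones.
relevance_eval = RelevanceEvaluator(model_config, threshold=3, is_reasoning_model=True)
result = relevance_eval(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)
print(result)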

azure/ai/evaluation/_evaluators/_task_success/__init__.py
@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._task_success import TaskSuccessEvaluator
+
+ __all__ = ["TaskSuccessEvaluator"]

azure/ai/evaluation/_evaluators/_task_success/_task_success.py
@@ -0,0 +1,168 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+ import math
+ import logging
+ from typing import Dict, Union, List, Optional
+
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
+ from azure.ai.evaluation._model_configurations import Message
+ from azure.ai.evaluation._common._experimental import experimental
+
+ logger = logging.getLogger(__name__)
+
+
+ @experimental
+ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
+     """The Task Success evaluator determines whether an AI agent successfully completed the requested task based on:
+
+     - Final outcome and deliverable of the task
+     - Completeness of task requirements
+
+     This evaluator focuses solely on task completion and success, not on task adherence or intent understanding.
+
+     Scoring is binary:
+     - TRUE: Task fully completed with usable deliverable that meets all user requirements
+     - FALSE: Task incomplete, partially completed, or deliverable does not meet requirements
+
+     The evaluation includes task requirement analysis, outcome assessment, and completion gap identification.
+
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example:
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START task_success_evaluator]
+             :end-before: [END task_success_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a TaskSuccessEvaluator with a query and response.
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START task_success_evaluator]
+             :end-before: [END task_success_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call TaskSuccessEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     """
+
+     _PROMPTY_FILE = "task_success.prompty"
+     _RESULT_KEY = "task_success"
+     _OPTIONAL_PARAMS = ["tool_definitions"]
+
+     id = "azureai://built-in/evaluators/task_success"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(self, model_config, *, credential=None, **kwargs):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             credential=credential,
+             **kwargs,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query: Union[str, List[dict]],
+         response: Union[str, List[dict]],
+         tool_definitions: Optional[Union[dict, List[dict]]] = None,
+     ) -> Dict[str, Union[str, bool]]:
+         """Evaluate task success for a given query, response, and optionally tool definitions.
+         The query and response can be either a string or a list of messages.
+
+
+         Example with string inputs and no tools:
+             evaluator = TaskSuccessEvaluator(model_config)
+             query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
+             response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."
+
+             result = evaluator(query=query, response=response)
+
+         Example with list of messages:
+             evaluator = TaskSuccessEvaluator(model_config)
+             query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
+             response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
+             tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]
+
+             result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+         :keyword query: The query being evaluated, either a string or a list of messages.
+         :paramtype query: Union[str, List[dict]]
+         :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls)
+         :paramtype response: Union[str, List[dict]]
+         :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+         :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
+         :return: A dictionary with the task success evaluation results.
+         :rtype: Dict[str, Union[str, bool]]
+         """
+
+     @override
+     def __call__( # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Invokes the instance using the overloaded __call__ signature.
+
+         For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # type: ignore[override]
+         """Do Task Success evaluation.
+         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         # we override the _do_eval method as we want the output to be a dictionary,
+         # which is a different schema than _base_prompty_eval.py
+         if "query" not in eval_input and "response" not in eval_input:
+             raise EvaluationException(
+                 message=f"Both query and response must be provided as input to the Task Success evaluator.",
+                 internal_message=f"Both query and response must be provided as input to the Task Success evaluator.",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.MISSING_FIELD,
+                 target=ErrorTarget.TASK_SUCCESS_EVALUATOR,
+             )
+         eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
+         eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+         if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
+             eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
+
+         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+         if isinstance(llm_output, dict):
+             success = llm_output.get("success", False)
+             if isinstance(success, str):
+                 success = success.upper() == "TRUE"
+
+             success_result = "pass" if success == True else "fail"
+             reason = llm_output.get("explanation", "")
+             return {
+                 f"{self._result_key}": success,
+                 f"{self._result_key}_result": success_result,
+                 f"{self._result_key}_reason": reason,
+                 f"{self._result_key}_details": llm_output.get("details", ""),
+             }
+         if logger:
+             logger.warning("LLM output is not a dictionary, returning False for the success.")
+         return {self._result_key: False}
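
For reference, a minimal sketch of calling the new TaskSuccessEvaluator and reading the keys produced by _do_eval above. The import path, configuration values, and example query/response are assumptions based on this diff (the evaluator is marked experimental, so the public export and exact behavior may differ):

from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._task_success import TaskSuccessEvaluator  # experimental; import path assumed from this diff

# Placeholder configuration values; substitute your own resource details.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    azure_deployment="gpt-4o",
    api_version="2024-06-01",
)

task_success_eval = TaskSuccessEvaluator(model_config)
result = task_success_eval(
    query="Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.",
    response="**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais...",
)

# Keys mirror the return value of _do_eval above.
print(result["task_success"])         # True or False
print(result["task_success_result"])  # "pass" or "fail"
print(result["task_success_reason"])  # model-provided explanation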