azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1622 -765
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
  114. azure/ai/evaluation/red_team/_utils/constants.py +6 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
  132. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@
  # ---------------------------------------------------------
  from enum import Enum

- from typing import Dict
+ from typing import Dict, Union
  from typing_extensions import overload, override

  from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
@@ -12,7 +12,7 @@ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
  import math


- class RougeType(Enum):
+ class RougeType(str, Enum):
  """
  Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
  """
@@ -71,13 +71,13 @@ class RougeScoreEvaluator(EvaluatorBase):
  :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START rouge_score_evaluator]
  :end-before: [END rouge_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. admonition:: Example with threshold:
@@ -95,17 +95,17 @@ class RougeScoreEvaluator(EvaluatorBase):

  @override
  def __init__(
- self,
+ self,
  rouge_type: RougeType,
  *,
  precision_threshold: float = 0.5,
  recall_threshold: float = 0.5,
- f1_score_threshold: float = 0.5
+ f1_score_threshold: float = 0.5,
  ):
  self._rouge_type = rouge_type
  self._higher_is_better = True
  super().__init__()
-
+
  # Type checking for threshold parameters
  for name, value in [
  ("precision_threshold", precision_threshold),
@@ -114,7 +114,7 @@ class RougeScoreEvaluator(EvaluatorBase):
  ]:
  if not isinstance(value, float):
  raise TypeError(f"{name} must be a float, got {type(value)}")
-
+
  self._threshold = {
  "precision": precision_threshold,
  "recall": recall_threshold,
@@ -122,10 +122,10 @@ class RougeScoreEvaluator(EvaluatorBase):
  }

  def _get_binary_result(
- self,
- rouge_precision: float,
- rouge_recall: float,
- rouge_f1_score: float,
+ self,
+ rouge_precision: float,
+ rouge_recall: float,
+ rouge_f1_score: float,
  ) -> Dict[str, bool]:
  """
  Get binary result based on the threshold.
@@ -150,22 +150,22 @@ class RougeScoreEvaluator(EvaluatorBase):
  precision_valid = not math.isnan(rouge_precision)
  recall_valid = not math.isnan(rouge_recall)
  f1_valid = not math.isnan(rouge_f1_score)
-
+
  if self._higher_is_better:
  if precision_valid:
- results["rouge_precision_result"] = (rouge_precision >= self._threshold["precision"])
+ results["rouge_precision_result"] = rouge_precision >= self._threshold["precision"]
  if recall_valid:
- results["rouge_recall_result"] = (rouge_recall >= self._threshold["recall"])
+ results["rouge_recall_result"] = rouge_recall >= self._threshold["recall"]
  if f1_valid:
- results["rouge_f1_score_result"] = (rouge_f1_score >= self._threshold["f1_score"])
+ results["rouge_f1_score_result"] = rouge_f1_score >= self._threshold["f1_score"]
  else:
  if precision_valid:
- results["rouge_precision_result"] = (rouge_precision <= self._threshold["precision"])
+ results["rouge_precision_result"] = rouge_precision <= self._threshold["precision"]
  if recall_valid:
- results["rouge_recall_result"] = (rouge_recall <= self._threshold["recall"])
+ results["rouge_recall_result"] = rouge_recall <= self._threshold["recall"]
  if f1_valid:
- results["rouge_f1_score_result"] = (rouge_f1_score <= self._threshold["f1_score"])
-
+ results["rouge_f1_score_result"] = rouge_f1_score <= self._threshold["f1_score"]
+
  return results

  @override
@@ -179,17 +179,17 @@ class RougeScoreEvaluator(EvaluatorBase):
  """
  ground_truth = eval_input["ground_truth"]
  response = eval_input["response"]
- scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type.value])
- metrics = scorer.score(ground_truth, response)[self._rouge_type.value]
+ scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+ metrics = scorer.score(ground_truth, response)[self._rouge_type]
  binary_results = {
  "rouge_precision_result": False,
  "rouge_recall_result": False,
  "rouge_f1_score_result": False,
  }
  # Convert metrics to floats, using nan for None or non-convertible values
- rouge_precision = float(metrics.precision) if metrics.precision is not None else float('nan')
- rouge_recall = float(metrics.recall) if metrics.recall is not None else float('nan')
- rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float('nan')
+ rouge_precision = float(metrics.precision) if metrics.precision is not None else float("nan")
+ rouge_recall = float(metrics.recall) if metrics.recall is not None else float("nan")
+ rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float("nan")
  binary_results = self._get_binary_result(
  rouge_precision=rouge_precision,
  rouge_recall=rouge_recall,
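
Note on the hunks above: RougeType now subclasses str, so enum members can be passed straight to rouge_scorer.RougeScorer without taking .value, and RougeScoreEvaluator gains keyword-only precision, recall, and F1 thresholds. A minimal usage sketch based on the signature shown in this diff; the member name RougeType.ROUGE_4 and the result key names are assumptions rather than something the diff confirms:

    from azure.ai.evaluation import RougeScoreEvaluator, RougeType

    # Thresholds mirror the new keyword-only parameters (each defaults to 0.5).
    evaluator = RougeScoreEvaluator(
        rouge_type=RougeType.ROUGE_4,  # assumed member name for a four-gram ROUGE type
        precision_threshold=0.6,
        recall_threshold=0.4,
        f1_score_threshold=0.5,
    )

    result = evaluator(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # Expected keys (assumed): rouge_precision, rouge_recall, rouge_f1_score,
    # plus the pass/fail fields produced by _get_binary_result.
    print(result)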
@@ -24,9 +24,9 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project.
- It contains subscription id, resource group, and project name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :param threshold: The threshold for the groundedness pro evaluator. Default is 5.
  :type threshold: int
  :param kwargs: Additional arguments to pass to the evaluator.
@@ -42,13 +42,13 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START groundedness_pro_evaluator]
  :end-before: [END groundedness_pro_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. admonition:: Example with threshold:
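
Note on the docstring change above: azure_ai_project now accepts either the project endpoint URL or an AzureAIProject mapping. A hedged sketch of both forms; the endpoint is a placeholder and the mapping keys follow the AzureAIProject type as commonly documented:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import GroundednessProEvaluator

    credential = DefaultAzureCredential()

    # New form: pass the project endpoint string directly.
    evaluator = GroundednessProEvaluator(
        credential=credential,
        azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
    )

    # Existing form: an AzureAIProject mapping with subscription id, resource group, and project name.
    evaluator = GroundednessProEvaluator(
        credential=credential,
        azure_ai_project={
            "subscription_id": "<subscription_id>",
            "resource_group_name": "<resource_group>",
            "project_name": "<project_name>",
        },
    )

The same Union[str, AzureAIProject] signature change appears again in the UngroundedAttributesEvaluator and IndirectAttackEvaluator hunks later in this diff.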
@@ -41,13 +41,13 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
  :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START similarity_evaluator]
  :end-before: [END similarity_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. admonition:: Example:
@@ -85,7 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
- _higher_is_better=self._higher_is_better
+ _higher_is_better=self._higher_is_better,
  )

  # Ignoring a mypy error about having only 1 overload function.
@@ -13,6 +13,7 @@ from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_sco
  from azure.ai.evaluation._model_configurations import Message
  from azure.ai.evaluation._common._experimental import experimental

+
  @experimental
  class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """The Task Adherence evaluator assesses how well an AI-generated response follows the assigned task based on:
@@ -42,15 +43,15 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :language: python
  :dedent: 8
  :caption: Initialize and call an TaskAdherenceEvaluator with a query and response.
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START task_adherence_evaluator]
  :end-before: [END task_adherence_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  """
@@ -65,14 +66,11 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE,
- **kwargs):
+ def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self.threshold = threshold
- super().__init__(model_config=model_config, prompty_file=prompty_path,
- result_key=self._RESULT_KEY,
- **kwargs)
+ super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

  @overload
  def __call__(
@@ -85,7 +83,7 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluate task adherence for a given query, response, and optional tool defintions.
  The query and response can be either a string or a list of messages.

-
+
  Example with string inputs and no tools:
  evaluator = TaskAdherenceEvaluator(model_config)
  query = "What is the weather today?"
@@ -113,9 +111,9 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):

  @override
  def __call__( # pylint: disable=docstring-missing-param
- self,
- *args,
- **kwargs,
+ self,
+ *args,
+ **kwargs,
  ):
  """
  Invokes the instance using the overloaded __call__ signature.
@@ -149,7 +147,7 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  if llm_output:
  score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")

- score_result = 'pass' if score >= self.threshold else 'fail'
+ score_result = "pass" if score >= self.threshold else "fail"

  return {
  f"{self._result_key}": score,
@@ -159,4 +157,3 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  }

  return {self._result_key: math.nan}
-
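
Note on the hunks above: TaskAdherenceEvaluator keeps its constructor signature (model_config, *, threshold=..., **kwargs), and _do_eval now returns a "pass"/"fail" string alongside the 1-5 score. A short sketch in the spirit of the docstring example; the model configuration values and the task_adherence result-key name are assumptions:

    from azure.ai.evaluation import AzureOpenAIModelConfiguration, TaskAdherenceEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<aoai-resource>.openai.azure.com",
        azure_deployment="<deployment-name>",
    )

    evaluator = TaskAdherenceEvaluator(model_config, threshold=3)
    result = evaluator(
        query="What is the weather today?",
        response="The weather is sunny.",
    )
    # Expected keys (assumed): task_adherence (score), task_adherence_result ("pass"/"fail"),
    # task_adherence_threshold, and a reason string parsed from the LLM output.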
@@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)

  T_EvalValue = TypeVar("T_EvalValue")

+
  @experimental
  class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
@@ -46,13 +47,13 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize and call a ToolCallAccuracyEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START tool_call_accuracy_evaluator]
  :end-before: [END tool_call_accuracy_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. note::
@@ -74,15 +75,11 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *,
- threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
- **kwargs):
+ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, **kwargs):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self.threshold = threshold
- super().__init__(model_config=model_config, prompty_file=prompty_path,
- result_key=self._RESULT_KEY,
- **kwargs)
+ super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

  @overload
  def __call__(
@@ -90,8 +87,8 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  *,
  query: Union[str, List[dict]],
  tool_definitions: Union[dict, List[dict]],
- tool_calls: Union[dict, List[dict]] = None,
- response: Union[str, List[dict]] = None
+ tool_calls: Union[dict, List[dict]] = None,
+ response: Union[str, List[dict]] = None,
  ) -> Dict[str, Union[str, float]]:
  """
  Evaluate tool call accuracy. Accepts a query, tool definitions, and tool calls for evaluation.
@@ -165,8 +162,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  if isinstance(response, list):
  for message in response:
  if message.get("role") == "assistant":
- tool_calls.extend([content for content in message.get("content")
- if content.get("type") == "tool_call"])
+ tool_calls.extend(
+ [content for content in message.get("content") if content.get("type") == "tool_call"]
+ )
  if len(tool_calls) == 0:
  raise EvaluationException(
  message="response does not have tool calls. Either provide tool_calls or response with tool calls.",
@@ -185,7 +183,9 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  # TODO : When evaluating an agent tool that depends on the output of a previous tool call,
  # we need to provide the output of the previous tool call as part of messages.
  for tool_call in tool_calls:
- if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call": # TODO assuming dict here but it can be a class
+ if (
+ isinstance(tool_call, dict) and tool_call.get("type") == "tool_call"
+ ): # TODO assuming dict here but it can be a class
  function_name = tool_call.get("name")
  tool_definition = [tool for tool in tool_definitions if tool.get("name") == function_name]
  if len(tool_definition) > 0:
@@ -228,7 +228,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  return {
  self._result_key: bool(float(score)),
  f"{self._result_key}_reason": reason,
- "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+ "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
  }
  raise EvaluationException(
  message="Tool call accuracy evaluator: Invalid score returned from LLM.",
@@ -248,13 +248,13 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  # Convert inputs into list of evaluable inputs.
  eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
  if len(eval_input_list) == 0:
- return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
- f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
- f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
- f"{self._AGGREGATE_RESULT_KEY}_reason":
- "No tool calls were made.",
- "per_tool_call_details": []
- }
+ return {
+ self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+ f"{self._AGGREGATE_RESULT_KEY}_reason": "No tool calls were made.",
+ "per_tool_call_details": [],
+ }

  per_turn_results = []
  # Evaluate all inputs.
@@ -293,7 +293,7 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  return {
  f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
  f"{self._result_key}_reason": "Tool call not supported for evaluation",
- "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+ "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
  }

  def _aggregate_results(self, per_turn_results):
@@ -318,23 +318,32 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  # Go over each turn, and rotate the results into a
  # metric: List[values] format for the evals_per_turn dictionary.

- num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
- if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
+ num_evaluated = len(
+ [
+ per_turn_result
+ for per_turn_result in per_turn_results
+ if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT
+ ]
+ )
  if num_evaluated == 0:
  # None of the invoked tools were applicable, return not applicable result
  # (If a tool fails evaluation, we'll throw an exception)
- return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
- f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
- f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
- f"{self._AGGREGATE_RESULT_KEY}_reason":
- "Tool call accuracy evaluation is not yet supported for the invoked tools.",
- "per_tool_call_details": []
- }
+ return {
+ self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+ f"{self._AGGREGATE_RESULT_KEY}_reason": "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+ "per_tool_call_details": [],
+ }
  # ignore not_applicable results, where the _result_key will be "not applicable"
- score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated
+ score = (
+ sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results]) / num_evaluated
+ )
  aggregated[self._AGGREGATE_RESULT_KEY] = score
- aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
- aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
+ aggregated[f"{self._AGGREGATE_RESULT_KEY}_result"] = (
+ self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
+ )
+ aggregated[f"{self._AGGREGATE_RESULT_KEY}_threshold"] = self.threshold
  aggregated["per_tool_call_details"] = per_turn_results
  return aggregated

@@ -14,11 +14,11 @@ model:

  inputs:
  query:
- type: List
+ type: array
  tool_call:
- type: Dict
+ type: object
  tool_definition:
- type: Dict
+ type: object

  ---
  system:
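
Note on the tool call accuracy hunks above (the evaluator module and its prompty): the evaluator now returns a structured "not applicable" aggregate when no tool calls can be evaluated, and otherwise averages per-call pass results against the threshold. A usage sketch built from the __call__ overload shown in this diff; the tool and tool-call payload shapes are illustrative assumptions:

    from azure.ai.evaluation import AzureOpenAIModelConfiguration, ToolCallAccuracyEvaluator

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<aoai-resource>.openai.azure.com",
        azure_deployment="<deployment-name>",
    )
    evaluator = ToolCallAccuracyEvaluator(model_config)

    tool_definitions = [
        {
            "name": "get_weather",
            "description": "Gets the weather for a city.",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
        }
    ]
    tool_calls = [
        {
            "type": "tool_call",
            "tool_call_id": "call_1",
            "name": "get_weather",
            "arguments": {"city": "Tokyo"},
        }
    ]

    result = evaluator(
        query="What's the weather in Tokyo?",
        tool_definitions=tool_definitions,
        tool_calls=tool_calls,
    )
    # The aggregate includes a pass/fail result, the threshold, and
    # "per_tool_call_details" with one entry per evaluated tool call.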
@@ -8,27 +8,28 @@ from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._common.constants import EvaluationMetrics
  from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

+
  @experimental
  class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  """
- Evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
- where query represents the user query and response represents the AI system response given the provided context.
-
- Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class or
+ Evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only,
+ where query represents the user query and response represents the AI system response given the provided context.
+
+ Ungrounded Attributes checks for whether a response is first, ungrounded, and checks if it contains information about protected class or
  emotional state of a person.


  It identifies the following attributes:
-
+
  - emotional_state
  - protected_class
  - groundedness

  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project.
- It contains subscription id, resource group, and project name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :param kwargs: Additional arguments to pass to the evaluator.
  :type kwargs: Any

@@ -42,13 +43,13 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  :caption: Initialize and call a UngroundedAttributesEvaluator with a query, response and context.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START ungrounded_attributes_evaluator]
  :end-before: [END ungrounded_attributes_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. note::
@@ -109,5 +110,5 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  :return: The ungrounded attributes label.
  :rtype: Dict[str, Union[str, bool]]
  """
-
+
  return super().__call__(*args, **kwargs)
@@ -40,9 +40,9 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):

  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project
- name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :param threshold: The threshold for the IndirectAttack evaluator. Default is 0.
  :type threshold: int

@@ -54,15 +54,15 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
  :language: python
  :dedent: 8
  :caption: Initialize and call an IndirectAttackEvaluator.
-
+
  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START indirect_attack_evaluator]
  :end-before: [END indirect_attack_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  """
@@ -9,6 +9,15 @@ from typing import Optional
  from azure.core.exceptions import AzureError


+ class ErrorMessage(Enum):
+ """Error messages to be used when raising EvaluationException.
+
+ These messages are used to provide a consistent error message format across the SDK.
+ """
+
+ MALFORMED_CONVERSATION_HISTORY = "Malformed Conversation History: Query parameter representing conversation history should have exactly one more user query than agent responses"
+
+
  class ErrorCategory(Enum):
  """Error category to be specified when using EvaluationException class.

@@ -87,6 +96,7 @@ class ErrorTarget(Enum):
  TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
  RED_TEAM = "RedTeam"
  AOAI_GRADER = "AoaiGrader"
+ CONVERSATION_HISTORY_PARSING = "_get_conversation_history"


  class EvaluationException(AzureError):
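
Note on the hunks above: the new ErrorMessage enum and the ErrorTarget.CONVERSATION_HISTORY_PARSING member standardize the error raised when a conversation history is malformed. A hedged sketch of how they might be raised together; the ErrorBlame and ErrorCategory members and the EvaluationException keywords follow the SDK's existing pattern and are assumptions here:

    from azure.ai.evaluation._exceptions import (
        ErrorBlame,
        ErrorCategory,
        ErrorMessage,
        ErrorTarget,
        EvaluationException,
    )

    def _get_conversation_history(query: list) -> None:
        user_turns = [m for m in query if m.get("role") == "user"]
        agent_turns = [m for m in query if m.get("role") == "assistant"]
        # The new message expects exactly one more user query than agent responses.
        if len(user_turns) != len(agent_turns) + 1:
            raise EvaluationException(
                message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY.value,
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.INVALID_VALUE,
                target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
            )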
@@ -7,7 +7,7 @@ from typing import Any, Dict, MutableMapping, Optional, TypedDict, cast

  from typing_extensions import Self, Unpack

- from azure.ai.evaluation._user_agent import USER_AGENT
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
  from azure.core.configuration import Configuration
  from azure.core.pipeline import AsyncPipeline, Pipeline
  from azure.core.pipeline.policies import (
@@ -454,7 +454,7 @@ def get_http_client(**kwargs: Any) -> HttpPipeline:
  :returns: An HttpPipeline with a set of applied policies:
  :rtype: HttpPipeline
  """
- kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+ kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=UserAgentSingleton().value))
  return HttpPipeline(**kwargs)


@@ -464,5 +464,5 @@ def get_async_http_client(**kwargs: Any) -> AsyncHttpPipeline:
  :returns: An AsyncHttpPipeline with a set of applied policies:
  :rtype: AsyncHttpPipeline
  """
- kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=USER_AGENT))
+ kwargs.setdefault("user_agent_policy", UserAgentPolicy(base_user_agent=UserAgentSingleton().value))
  return AsyncHttpPipeline(**kwargs)
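
Note on the hunks above: the module-level USER_AGENT constant is replaced by UserAgentSingleton, whose .value is read when the pipeline is constructed rather than at import time. A minimal sketch of the consuming pattern; only the .value property is confirmed by this diff:

    from azure.core.pipeline.policies import UserAgentPolicy
    from azure.ai.evaluation._user_agent import UserAgentSingleton

    # Read the user agent lazily from the singleton when building the policy,
    # instead of baking in a module-level constant at import time.
    policy = UserAgentPolicy(base_user_agent=UserAgentSingleton().value)
    print(UserAgentSingleton().value)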
@@ -319,9 +319,9 @@ class BatchEngine:
  # to maximize the parallelism, we run the synchronous function in a separate thread
  # and await its result
  output = await asyncio.get_event_loop().run_in_executor(
- self._executor,
- partial(self._func, **inputs))
-
+ self._executor, partial(self._func, **inputs)
+ )
+
  # This should in theory never happen but as an extra precaution, let's check if the output
  # is awaitable and await it if it is.
@@ -90,7 +90,9 @@ def _openai_api_list() -> Generator[Tuple[Any, Callable, bool], None, None]:
  except ImportError:
  raise MissingRequiredPackage("Please install the 'openai' package to use the Azure AI Evaluation SDK")
  except AttributeError:
- logging.warning("The module '%s' does not have class '%s' or method '%s'", module_name, class_name, method_name)
+ logging.warning(
+ "The module '%s' does not have class '%s' or method '%s'", module_name, class_name, method_name
+ )


  def inject_openai_api():
@@ -117,6 +119,7 @@ def recover_openai_api():


  class CaptureOpenAITokenUsage:
  """Context manager to capture OpenAI token usage."""
+
  def __init__(self):
  self._tokens = TokenMetrics(0, 0, 0)
@@ -126,4 +129,4 @@ class CaptureOpenAITokenUsage:

  def __exit__(self, exc_type: Optional[Exception], exc_value: Optional[Exception], traceback: Optional[Any]) -> None:
  captured_metrics = _token_metrics.get()
- self._tokens.update(captured_metrics)
+ self._tokens.update(captured_metrics)
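
Note on the hunks above: they touch the OpenAI injection helpers and the CaptureOpenAITokenUsage context manager. A hedged usage sketch; whether __enter__ returns the metrics object is not shown in this diff, so the sketch keeps a reference to the context manager itself, and run_my_evaluation is a placeholder defined inline:

    from azure.ai.evaluation._legacy._batch_engine._openai_injector import (
        CaptureOpenAITokenUsage,
        inject_openai_api,
        recover_openai_api,
    )

    def run_my_evaluation() -> None:
        """Placeholder for code that drives an OpenAI-backed evaluator."""

    capture = CaptureOpenAITokenUsage()
    inject_openai_api()  # patch the OpenAI client methods so token usage is recorded
    try:
        with capture:
            run_my_evaluation()
    finally:
        recover_openai_api()
    # capture now holds the TokenMetrics accumulated while the block was active.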