azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5655
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +64 -58
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +25 -25
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +4 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1622 -765
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +595 -0
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
  114. azure/ai/evaluation/red_team/_utils/constants.py +6 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +33 -6
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +35 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +34 -16
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -23
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +25 -15
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +24 -1
  132. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -123
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.7.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py

@@ -14,9 +14,7 @@ RetrievalGroundTruthDocument = TypedDict(
  "RetrievalGroundTruthDocument", {"document_id": str, "query_relevance_label": int}
  )

- RetrievedDocument = TypedDict(
- "RetrievedDocument", {"document_id": str, "relevance_score": float}
- )
+ RetrievedDocument = TypedDict("RetrievedDocument", {"document_id": str, "relevance_score": float})


  class DocumentRetrievalEvaluator(EvaluatorBase):
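For orientation, the two TypedDicts above describe this evaluator's inputs: labeled ground-truth documents and retrieved documents with relevance scores. A minimal usage sketch under those definitions (the `retrieval_ground_truth` keyword appears in the validation code later in this diff; the `retrieved_documents` keyword name is an assumption):

# Sketch only: input shapes follow the TypedDicts above; "retrieval_ground_truth" is taken
# from this diff, the "retrieved_documents" keyword name is assumed.
from azure.ai.evaluation import DocumentRetrievalEvaluator

ground_truth = [
    {"document_id": "doc1", "query_relevance_label": 3},  # RetrievalGroundTruthDocument
    {"document_id": "doc2", "query_relevance_label": 1},
]
retrieved = [
    {"document_id": "doc1", "relevance_score": 12.5},  # RetrievedDocument
    {"document_id": "doc9", "relevance_score": 7.1},   # unlabeled, so it counts as a "hole"
]

evaluator = DocumentRetrievalEvaluator()
print(evaluator(retrieval_ground_truth=ground_truth, retrieved_documents=retrieved))

As the later hunks show, the call reports ndcg@3, xdcg@3, fidelity, top1_relevance, top3_max_relevance, holes and holes_ratio, each accompanied by `<metric>_result`, `<metric>_threshold` and `<metric>_higher_is_better` bookkeeping keys.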
@@ -33,15 +31,15 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  :caption: Initialize and call a DocumentRetrievalEvaluator

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START document_retrieval_evaluator]
  :end-before: [END document_retrieval_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
  .. admonition:: Example with Threshold:
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
  :start-after: [START threshold_document_retrieval_evaluator]
@@ -62,7 +60,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  top1_relevance_threshold: Optional[float] = 50.0,
  top3_max_relevance_threshold: Optional[float] = 50.0,
  total_retrieved_documents_threshold: Optional[int] = 50,
- total_ground_truth_documents_threshold: Optional[int] = 50
+ total_ground_truth_documents_threshold: Optional[int] = 50,
  ):
  super().__init__()
  self.k = 3
@@ -74,14 +72,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  )

  if not isinstance(ground_truth_label_min, int):
- raise EvaluationException(
- "The ground truth label minimum must be an integer value."
- )
+ raise EvaluationException("The ground truth label minimum must be an integer value.")

  if not isinstance(ground_truth_label_max, int):
- raise EvaluationException(
- "The ground truth label maximum must be an integer value."
- )
+ raise EvaluationException("The ground truth label maximum must be an integer value.")

  self.ground_truth_label_min = ground_truth_label_min
  self.ground_truth_label_max = ground_truth_label_max
@@ -122,7 +116,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  ) -> float:
  """NDCG (Normalized Discounted Cumulative Gain) calculated for the top K documents retrieved from a search query.
  NDCG measures how well a document ranking compares to an ideal document ranking given a list of ground-truth documents.
-
+
  :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
  :type result_docs_groundtruth_labels: List[int]
  :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
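The docstring above describes NDCG@K conceptually; the hunk itself only trims trailing whitespace. As a reference point, a textbook NDCG@K over graded relevance labels looks like the sketch below (the package's private `_compute_ndcg` may choose a different gain or discount, so treat this as an illustration rather than the shipped formula):

import math

def ndcg_at_k(result_labels, ideal_labels, k=3):
    # Standard exponential-gain NDCG: DCG of the retrieved ranking divided by DCG of the ideal ranking.
    def dcg(labels):
        return sum((2 ** label - 1) / math.log2(rank + 1) for rank, label in enumerate(labels[:k], start=1))

    ideal = dcg(sorted(ideal_labels, reverse=True))
    return dcg(result_labels) / ideal if ideal else math.nan

print(ndcg_at_k([3, 0, 2], [3, 2, 1]))  # ~0.90: how the retrieved top-3 compares to the ideal ordering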
@@ -145,7 +139,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  def _compute_xdcg(self, result_docs_groundtruth_labels: List[int]) -> float:
  """XDCG calculated for the top K documents retrieved from a search query.
  XDCG measures how objectively good are the top K documents, discounted by their position in the list.
-
+
  :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
  :type result_docs_groundtruth_labels: List[int]
  :return: The XDCG@K calculation result.
@@ -159,11 +153,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  return math.pow(self.xdcg_discount_factor, rank - 1)

  ranks = list(range(1, self.k + 1))
- xdcg_n = sum(
- starmap(
- calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)
- )
- )
+ xdcg_n = sum(starmap(calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)))
  xdcg_d = sum(map(calculate_xdcg_denominator, ranks))

  return xdcg_n / float(xdcg_d)
@@ -175,7 +165,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  ) -> float:
  """Fidelity calculated over all documents retrieved from a search query.
  Fidelity measures how objectively good are all of the documents retrieved compared with all known good documents in the underlying data store.
-
+
  :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
  :type result_docs_groundtruth_labels: List[int]
  :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
@@ -196,25 +186,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  if label >= s:
  label_counts[str(label)] += 1

- sorted_label_counts = [
- x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])
- ]
+ sorted_label_counts = [x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])]

  # calculate weights
- weights = [
- (math.pow(2, i + 1) - 1)
- for i in range(s, self.ground_truth_label_max + 1)
- ]
+ weights = [(math.pow(2, i + 1) - 1) for i in range(s, self.ground_truth_label_max + 1)]

  # return weighted sum
  return sum(starmap(operator.mul, zip(sorted_label_counts, weights)))

- weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(
- result_docs_groundtruth_labels
- )
- weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(
- ideal_docs_groundtruth_labels
- )
+ weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(result_docs_groundtruth_labels)
+ weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(ideal_docs_groundtruth_labels)

  if weighted_sum_by_rating_index == 0:
  return math.nan
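The `calculate_weighted_sum_by_rating` helper visible above counts ground-truth labels at or above a minimum useful rating `s` and weights each label as 2**(label + 1) - 1, and fidelity is the retrieved set's weighted sum divided by the ideal set's. A compact, equivalent sketch (`min_useful_label` stands in for the evaluator's internal `s`, whose default is not shown in this diff):

import math

def fidelity(result_labels, ideal_labels, min_useful_label=1):
    # Weighted sum of graded labels, mirroring the w(label) = 2**(label + 1) - 1 weights above.
    def weighted_sum(labels):
        return sum(2 ** (label + 1) - 1 for label in labels if label >= min_useful_label)

    denominator = weighted_sum(ideal_labels)
    return math.nan if denominator == 0 else weighted_sum(result_labels) / denominator

print(fidelity([3, 0, 2], [3, 2, 2, 1]))  # weighted gain of what was retrieved vs. everything known to be relevant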
@@ -226,12 +207,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):

  for metric_name, metric_value in metrics.items():
  if metric_name in self._threshold_metrics.keys():
- result[f"{metric_name}_result"] = "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+ result[f"{metric_name}_result"] = (
+ "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+ )
  result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
  result[f"{metric_name}_higher_is_better"] = True

  elif metric_name in self._threshold_holes.keys():
- result[f"{metric_name}_result"] = "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+ result[f"{metric_name}_result"] = (
+ "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+ )
  result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
  result[f"{metric_name}_higher_is_better"] = False

@@ -256,8 +241,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  # if the qrels are empty, no meaningful evaluation is possible
  if not retrieval_ground_truth:
  raise EvaluationException(
- ("'retrieval_ground_truth' parameter must contain at least one item. "
- "Check your data input to be sure that each input record has ground truth defined.")
+ (
+ "'retrieval_ground_truth' parameter must contain at least one item. "
+ "Check your data input to be sure that each input record has ground truth defined."
+ )
  )

  qrels = []
@@ -277,9 +264,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  )

  if not isinstance(query_relevance_label, int):
- raise EvaluationException(
- "Query relevance labels must be integer values."
- )
+ raise EvaluationException("Query relevance labels must be integer values.")

  if query_relevance_label < self.ground_truth_label_min:
  raise EvaluationException(
@@ -318,12 +303,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  )
  )

- if not isinstance(relevance_score, float) and not isinstance(
- relevance_score, int
- ):
- raise EvaluationException(
- "Retrieved document relevance score must be a numerical value."
- )
+ if not isinstance(relevance_score, float) and not isinstance(relevance_score, int):
+ raise EvaluationException("Retrieved document relevance score must be a numerical value.")

  results.append(result)

@@ -368,24 +349,17 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  results_lookup = {x["document_id"]: x["relevance_score"] for x in results}

  # sort each input set by label to get the ranking
- qrels_sorted_by_rank = sorted(
- qrels_lookup.items(), key=lambda x: x[1], reverse=True
- )
- results_sorted_by_rank = sorted(
- results_lookup.items(), key=lambda x: x[1], reverse=True
- )
+ qrels_sorted_by_rank = sorted(qrels_lookup.items(), key=lambda x: x[1], reverse=True)
+ results_sorted_by_rank = sorted(results_lookup.items(), key=lambda x: x[1], reverse=True)

  # find ground truth labels for the results set and ideal set
  result_docs_groundtruth_labels = [
- qrels_lookup[doc_id] if doc_id in qrels_lookup else 0
- for (doc_id, _) in results_sorted_by_rank
+ qrels_lookup[doc_id] if doc_id in qrels_lookup else 0 for (doc_id, _) in results_sorted_by_rank
  ]
  ideal_docs_groundtruth_labels = [label for (_, label) in qrels_sorted_by_rank]

  # calculate the proportion of result docs with no ground truth label (holes)
- holes = self._compute_holes(
- [x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank]
- )
+ holes = self._compute_holes([x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank])
  holes_ratio = holes / float(len(results))

  # if none of the retrieved docs are labeled, report holes only
@@ -412,12 +386,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
  result_docs_groundtruth_labels[: self.k],
  ideal_docs_groundtruth_labels[: self.k],
  ),
- f"xdcg@{self.k}": self._compute_xdcg(
- result_docs_groundtruth_labels[: self.k]
- ),
- "fidelity": self._compute_fidelity(
- result_docs_groundtruth_labels, ideal_docs_groundtruth_labels
- ),
+ f"xdcg@{self.k}": self._compute_xdcg(result_docs_groundtruth_labels[: self.k]),
+ "fidelity": self._compute_fidelity(result_docs_groundtruth_labels, ideal_docs_groundtruth_labels),
  "top1_relevance": result_docs_groundtruth_labels[0],
  "top3_max_relevance": max(result_docs_groundtruth_labels[: self.k]),
  "holes": holes,
azure/ai/evaluation/_evaluators/_eci/_eci.py

@@ -22,9 +22,9 @@ class ECIEvaluator(RaiServiceEvaluatorBase):

  :param credential: The credential for connecting to Azure AI project. Required
  :type credential: ~azure.core.credentials.TokenCredential
- :param azure_ai_project: The scope of the Azure AI project.
- It contains subscription id, resource group, and project name.
- :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+ :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
  :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
  :rtype: Dict[str, str]

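This docstring change reflects a pattern repeated across the RAI-service evaluators in this release: azure_ai_project may now be either the project endpoint URL or the classic scope dictionary. A usage sketch of both forms (resource names are placeholders; ViolenceEvaluator is used as a stand-in since it takes the same two constructor arguments):

# Two ways to supply azure_ai_project after this change (sketch; names are placeholders).
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ViolenceEvaluator

credential = DefaultAzureCredential()

# 1) Project endpoint string, in the format shown throughout this diff
endpoint = "https://my-resource.services.ai.azure.com/api/projects/my-project"
evaluator = ViolenceEvaluator(credential=credential, azure_ai_project=endpoint)

# 2) Classic AzureAIProject dictionary
project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}
evaluator = ViolenceEvaluator(credential=credential, azure_ai_project=project)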
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -39,15 +39,15 @@ class F1ScoreEvaluator(EvaluatorBase):
  :caption: Initialize and call an F1ScoreEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START f1_score_evaluator]
  :end-before: [END f1_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+ :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
  .. admonition:: Example with Threshold:

  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -147,7 +147,7 @@ class F1ScoreEvaluator(EvaluatorBase):
  if f1_result <= self._threshold:
  binary_result = True
  return {
- "f1_score": f1_result,
+ "f1_score": f1_result,
  "f1_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
  "f1_threshold": self._threshold,
  }
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -45,7 +45,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize with threshold and call a FluencyEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START fluency_evaluator]
  :end-before: [END fluency_evaluator]
@@ -78,7 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
- _higher_is_better=self._higher_is_better
+ _higher_is_better=self._higher_is_better,
  )

  @overload
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -34,7 +34,7 @@ class GleuScoreEvaluator(EvaluatorBase):
  :language: python
  :dedent: 8
  :caption: Initialize and call a GleuScoreEvaluator.
-
+
  .. admonition:: Example with Threshold:

  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -45,13 +45,13 @@ class GleuScoreEvaluator(EvaluatorBase):
  :caption: Initialize with threshold and call a GleuScoreEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START gleu_score_evaluator]
  :end-before: [END gleu_score_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
  """

azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -12,9 +12,13 @@ from azure.ai.evaluation._model_configurations import Conversation
  from ..._common.utils import construct_prompty_model_config, validate_model_config

  try:
- from ..._user_agent import USER_AGENT
+ from ..._user_agent import UserAgentSingleton
  except ImportError:
- USER_AGENT = "None"
+
+ class UserAgentSingleton:
+ @property
+ def value(self) -> str:
+ return "None"


  class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
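The module-level USER_AGENT constant is replaced by a UserAgentSingleton whose .value is read at call time (see the construct_prompty_model_config hunk below), and the ImportError fallback defines a stand-in exposing the same property. One common reason for this kind of change is to let user-agent tokens be adjusted at runtime; a generic illustration of that pattern, not the library's actual API:

# Illustration only: this is not azure.ai.evaluation._user_agent, just the general pattern.
class UserAgentSketch:
    _base = "azure-ai-evaluation/<version>"  # placeholder base string
    _extras = []

    @property
    def value(self) -> str:
        # Recomputed on every read, so tokens added after import are included.
        return " ".join([self._base, *self._extras])

    @classmethod
    def append(cls, token: str) -> None:
        cls._extras.append(token)

UserAgentSketch.append("my-app/0.1")
print(UserAgentSketch().value)  # a constant copied at import time would miss "my-app/0.1"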
@@ -35,7 +39,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  ~azure.ai.evaluation.OpenAIModelConfiguration]
  :param threshold: The threshold for the groundedness evaluator. Default is 3.
  :type threshold: int
-
+
  .. admonition:: Example:

  .. literalinclude:: ../samples/evaluation_samples_evaluate.py
@@ -54,13 +58,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize with threshold and call a GroundednessEvaluator.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START groundedness_evaluator]
  :end-before: [END groundedness_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  .. note::
@@ -89,7 +93,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_file=prompty_path,
  result_key=self._RESULT_KEY,
  threshold=threshold,
- _higher_is_better=self._higher_is_better
+ _higher_is_better=self._higher_is_better,
  )
  self._model_config = model_config
  self.threshold = threshold
@@ -165,7 +169,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  prompty_model_config = construct_prompty_model_config(
  validate_model_config(self._model_config),
  self._DEFAULT_OPEN_API_VERSION,
- USER_AGENT,
+ UserAgentSingleton().value,
  )
  self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)

azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

@@ -3,6 +3,7 @@
  # ---------------------------------------------------------
  import os
  import math
+ import logging
  from typing import Dict, Union, List, Optional

  from typing_extensions import overload, override
@@ -10,9 +11,12 @@ from typing_extensions import overload, override
  from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation, Message
- from ..._common.utils import check_score_is_valid
+ from ..._common.utils import check_score_is_valid, reformat_conversation_history, reformat_agent_response
  from azure.ai.evaluation._common._experimental import experimental

+ logger = logging.getLogger(__name__)
+
+
  @experimental
  class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """
@@ -34,13 +38,13 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :caption: Initialize and call an IntentResolutionEvaluator with a query and response.

  .. admonition:: Example using Azure AI Project URL:
-
+
  .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
  :start-after: [START intent_resolution_evaluator]
  :end-before: [END intent_resolution_evaluator]
  :language: python
  :dedent: 8
- :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
+ :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
  https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

  """
@@ -57,23 +61,19 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *,
- threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD,
- **kwargs):
+ def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
  current_dir = os.path.dirname(__file__)
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
  self.threshold = threshold
- super().__init__(model_config=model_config, prompty_file=prompty_path,
- result_key=self._RESULT_KEY,
- **kwargs)
+ super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)

  @overload
  def __call__(
  self,
  *,
- query : Union[str, List[dict]],
- response : Union[str, List[dict]],
- tool_definitions : Optional[Union[dict, List[dict]]] = None,
+ query: Union[str, List[dict]],
+ response: Union[str, List[dict]],
+ tool_definitions: Optional[Union[dict, List[dict]]] = None,
  ) -> Dict[str, Union[str, float]]:
  """Evaluate intent resolution for a given query, response and optional tool definitions.
  The query and response can be either a string or a list of messages.
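A minimal usage sketch for this overload, assuming an Azure OpenAI model configuration (endpoint, deployment, and key values are placeholders; the explicit threshold mirrors the default shown above):

from azure.ai.evaluation import IntentResolutionEvaluator

model_config = {
    "azure_endpoint": "https://<aoai-resource>.openai.azure.com",
    "azure_deployment": "<chat-deployment>",
    "api_key": "<api-key>",
}

intent_resolution = IntentResolutionEvaluator(model_config=model_config, threshold=3)
result = intent_resolution(
    query="What are your opening hours on weekends?",
    response="We are open 10 AM to 6 PM on Saturdays and Sundays.",
)
print(result)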
@@ -135,11 +135,19 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  category=ErrorCategory.MISSING_FIELD,
  target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
  )
+ # reformat query and response to the format expected by the prompty flow
+ eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+ eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+
  llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
  # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
  if isinstance(llm_output, dict):
- score = llm_output.get("resolution_score", math.nan)
- if not check_score_is_valid(score, IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE, IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE):
+ score = llm_output.get("score", math.nan)
+ if not check_score_is_valid(
+ score,
+ IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE,
+ IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE,
+ ):
  raise EvaluationException(
  message=f"Invalid score value: {score}. Expected a number in range [{IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE}, {IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE}].",
  internal_message="Invalid score value.",
@@ -148,19 +156,16 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  )
  reason = llm_output.get("explanation", "")
  score = float(score)
- score_result = 'pass' if score >= self.threshold else 'fail'
-
- #remove fields 'explanation' and 'resolution_score' from llm_output as they are already included in the response_dict
- if 'explanation' in llm_output: llm_output.pop("explanation")
- if 'resolution_score' in llm_output: llm_output.pop("resolution_score")
+ score_result = "pass" if score >= self.threshold else "fail"

  response_dict = {
- f"{self._result_key}" : score,
- f"{self._result_key}_result" : score_result,
- f"{self._result_key}_threshold" : self.threshold,
- f"{self._result_key}_reason" : reason,
- f"additional_details" : llm_output
- }
+ f"{self._result_key}": score,
+ f"{self._result_key}_result": score_result,
+ f"{self._result_key}_threshold": self.threshold,
+ f"{self._result_key}_reason": reason,
+ }
  return response_dict
  # If llm_output is not a dictionary, return NaN for the score. This should never happen
+ if logger:
+ logger.warning("LLM output is not a dictionary, returning NaN for the score.")
  return {self._result_key: math.nan}
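With additional_details dropped, the evaluator now returns exactly four keys derived from its result key. Assuming the result key is "intent_resolution" (not shown in this diff), a returned payload can be read as in the sketch below (values are illustrative):

# Illustrative result shape; the "intent_resolution" key prefix is assumed.
result = {
    "intent_resolution": 4.0,
    "intent_resolution_result": "pass",
    "intent_resolution_threshold": 3,
    "intent_resolution_reason": "The response directly answers the user's question about opening hours.",
}
if result["intent_resolution_result"] == "pass":
    print(f"score {result['intent_resolution']} met threshold {result['intent_resolution_threshold']}")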