azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (144)
  1. azure/ai/evaluation/__init__.py +10 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +7 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +165 -34
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  59. azure/ai/evaluation/_converters/_models.py +76 -6
  60. azure/ai/evaluation/_eval_mapping.py +73 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  63. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  64. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  65. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  66. azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
  67. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
  68. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  69. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  70. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  71. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  72. azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  76. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  77. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  78. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  79. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  80. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  81. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
  82. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  83. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
  84. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  85. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
  86. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
  87. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  88. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  89. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  90. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
  91. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
  92. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  93. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  94. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  95. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  96. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
  97. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
  98. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
  99. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  100. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  101. azure/ai/evaluation/_exceptions.py +2 -0
  102. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  103. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  104. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  106. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  107. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  109. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  110. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  111. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  112. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  113. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  114. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  115. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  116. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  117. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  118. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  119. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
  120. azure/ai/evaluation/_version.py +1 -1
  121. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  122. azure/ai/evaluation/red_team/_red_team.py +976 -546
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  125. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  126. azure/ai/evaluation/simulator/_constants.py +1 -0
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  128. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  129. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
  140. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
  141. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  142. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  143. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  144. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,469 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import math
+import operator
+from itertools import starmap
+from typing import Any, Dict, List, TypedDict, Tuple, Optional, Union
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._exceptions import EvaluationException
+from typing_extensions import override, overload
+
+
+RetrievalGroundTruthDocument = TypedDict(
+    "RetrievalGroundTruthDocument", {"document_id": str, "query_relevance_label": int}
+)
+
+RetrievedDocument = TypedDict(
+    "RetrievedDocument", {"document_id": str, "relevance_score": float}
+)
+
+
+class DocumentRetrievalEvaluator(EvaluatorBase):
+    """
+    Calculate document retrieval metrics, such as NDCG, XDCG, Fidelity, Top K Relevance and Holes.
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START document_retrieval_evaluator]
+            :end-before: [END document_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a DocumentRetrievalEvaluator
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START document_retrieval_evaluator]
+            :end-before: [END document_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. admonition:: Example with Threshold:
+        .. literalinclude:: ../samples/evaluation_samples_threshold.py
+            :start-after: [START threshold_document_retrieval_evaluator]
+            :end-before: [END threshold_document_retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize with threshold and call a DocumentRetrievalEvaluator.
+    """
+
+    def __init__(
+        self,
+        *,
+        ground_truth_label_min: int = 0,
+        ground_truth_label_max: int = 4,
+        ndcg_threshold: Optional[float] = 0.5,
+        xdcg_threshold: Optional[float] = 50.0,
+        fidelity_threshold: Optional[float] = 0.5,
+        top1_relevance_threshold: Optional[float] = 50.0,
+        top3_max_relevance_threshold: Optional[float] = 50.0,
+        total_retrieved_documents_threshold: Optional[int] = 50,
+        total_ground_truth_documents_threshold: Optional[int] = 50
+    ):
+        super().__init__()
+        self.k = 3
+        self.xdcg_discount_factor = 0.6
+
+        if ground_truth_label_min >= ground_truth_label_max:
+            raise EvaluationException(
+                "The ground truth label maximum must be strictly greater than the ground truth label minimum."
+            )
+
+        if not isinstance(ground_truth_label_min, int):
+            raise EvaluationException(
+                "The ground truth label minimum must be an integer value."
+            )
+
+        if not isinstance(ground_truth_label_max, int):
+            raise EvaluationException(
+                "The ground truth label maximum must be an integer value."
+            )
+
+        self.ground_truth_label_min = ground_truth_label_min
+        self.ground_truth_label_max = ground_truth_label_max
+
+        # The default threshold for metrics where higher numbers are better.
+        self._threshold_metrics: Dict[str, Any] = {
+            "ndcg@3": ndcg_threshold,
+            "xdcg@3": xdcg_threshold,
+            "fidelity": fidelity_threshold,
+            "top1_relevance": top1_relevance_threshold,
+            "top3_max_relevance": top3_max_relevance_threshold,
+            "total_retrieved_documents": total_retrieved_documents_threshold,
+            "total_ground_truth_documents": total_ground_truth_documents_threshold,
+        }
+
+        # Ideally, the number of holes should be zero.
+        self._threshold_holes = {"holes": 0, "holes_ratio": 0}
+
+    def _compute_holes(self, actual_docs: List[str], labeled_docs: List[str]) -> int:
+        """
+        The number of documents retrieved from a search query which have no provided ground-truth label.
+        This metric is helpful for determining the accuracy of other metrics that are highly sensitive to missing ground-truth knowledge,
+        such as NDCG, XDCG, and Fidelity.
+
+        :param actual_docs: A list of retrieved documents' IDs.
+        :type actual_docs: List[str]
+        :param labeled_docs: A list of ideal documents' IDs.
+        :type labeled: List[str]
+        :return: The holes calculation result.
+        :rtype: int
+        """
+        return len(set(actual_docs).difference(set(labeled_docs)))
+
+    def _compute_ndcg(
+        self,
+        result_docs_groundtruth_labels: List[int],
+        ideal_docs_groundtruth_labels: List[int],
+    ) -> float:
+        """NDCG (Normalized Discounted Cumulative Gain) calculated for the top K documents retrieved from a search query.
+        NDCG measures how well a document ranking compares to an ideal document ranking given a list of ground-truth documents.
+
+        :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
+        :type result_docs_groundtruth_labels: List[int]
+        :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
+        :type ideal_docs_groundtruth_labels: List[int]
+        :return: The NDCG@K calculation result.
+        :rtype: float
+        """
+
+        # Set the scoring function
+        def calculate_dcg(relevance: float, rank: int):
+            return (math.pow(2, relevance) - 1) / (math.log2(rank + 1))
+
+        ranks = list(range(1, self.k + 1))
+        dcg = sum(starmap(calculate_dcg, zip(result_docs_groundtruth_labels, ranks)))
+        idcg = sum(starmap(calculate_dcg, zip(ideal_docs_groundtruth_labels, ranks)))
+        ndcg = dcg / float(idcg)
+
+        return ndcg
+
+    def _compute_xdcg(self, result_docs_groundtruth_labels: List[int]) -> float:
+        """XDCG calculated for the top K documents retrieved from a search query.
+        XDCG measures how objectively good are the top K documents, discounted by their position in the list.
+
+        :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
+        :type result_docs_groundtruth_labels: List[int]
+        :return: The XDCG@K calculation result.
+        :rtype: float
+        """
+
+        def calculate_xdcg_numerator(relevance, rank):
+            return 25 * relevance * math.pow(self.xdcg_discount_factor, rank - 1)
+
+        def calculate_xdcg_denominator(rank):
+            return math.pow(self.xdcg_discount_factor, rank - 1)
+
+        ranks = list(range(1, self.k + 1))
+        xdcg_n = sum(
+            starmap(
+                calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)
+            )
+        )
+        xdcg_d = sum(map(calculate_xdcg_denominator, ranks))
+
+        return xdcg_n / float(xdcg_d)
+
+    def _compute_fidelity(
+        self,
+        result_docs_groundtruth_labels: List[int],
+        ideal_docs_groundtruth_labels: List[int],
+    ) -> float:
+        """Fidelity calculated over all documents retrieved from a search query.
+        Fidelity measures how objectively good are all of the documents retrieved compared with all known good documents in the underlying data store.
+
+        :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
+        :type result_docs_groundtruth_labels: List[int]
+        :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
+        :type ideal_docs_groundtruth_labels: List[int]
+        :return: The fidelity calculation result.
+        :rtype: float
+        """
+
+        def calculate_weighted_sum_by_rating(labels: List[int]) -> float:
+            # here we assume that the configured groundtruth label minimum translates to "irrelevant",
+            # so we exclude documents with that label from the calculation.
+            s = self.ground_truth_label_min + 1
+
+            # get a count of each label
+            label_counts = {str(i): 0 for i in range(s, self.ground_truth_label_max + 1)}
+
+            for label in labels:
+                if label >= s:
+                    label_counts[str(label)] += 1
+
+            sorted_label_counts = [
+                x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])
+            ]
+
+            # calculate weights
+            weights = [
+                (math.pow(2, i + 1) - 1)
+                for i in range(s, self.ground_truth_label_max + 1)
+            ]
+
+            # return weighted sum
+            return sum(starmap(operator.mul, zip(sorted_label_counts, weights)))
+
+        weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(
+            result_docs_groundtruth_labels
+        )
+        weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(
+            ideal_docs_groundtruth_labels
+        )
+
+        if weighted_sum_by_rating_index == 0:
+            return math.nan
+
+        return weighted_sum_by_rating_results / float(weighted_sum_by_rating_index)
+
+    def _get_binary_result(self, **metrics) -> Dict[str, float]:
+        result: Dict[str, Any] = {}
+
+        for metric_name, metric_value in metrics.items():
+            if metric_name in self._threshold_metrics.keys():
+                result[f"{metric_name}_result"] = "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+                result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
+                result[f"{metric_name}_higher_is_better"] = True
+
+            elif metric_name in self._threshold_holes.keys():
+                result[f"{metric_name}_result"] = "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+                result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
+                result[f"{metric_name}_higher_is_better"] = False
+
+            else:
+                raise ValueError(f"No threshold set for metric '{metric_name}'")
+
+        return result
+
+    def _validate_eval_input(
+        self, eval_input: Dict
+    ) -> Tuple[List[RetrievalGroundTruthDocument], List[RetrievedDocument]]:
+        """Validate document retrieval evaluator inputs.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Tuple[List[azure.ai.evaluation.RetrievalGroundTruthDocument], List[azure.ai.evaluation.RetrievedDocument]]
+        """
+        retrieval_ground_truth = eval_input.get("retrieval_ground_truth")
+        retrieved_documents = eval_input.get("retrieved_documents")
+
+        # if the qrels are empty, no meaningful evaluation is possible
+        if not retrieval_ground_truth:
+            raise EvaluationException(
+                ("'retrieval_ground_truth' parameter must contain at least one item. "
+                 "Check your data input to be sure that each input record has ground truth defined.")
+            )
+
+        qrels = []
+
+        # validate the qrels to be sure they are the correct type and are bounded by the given configuration
+        for qrel in retrieval_ground_truth:
+            document_id = qrel.get("document_id")
+            query_relevance_label = qrel.get("query_relevance_label")
+
+            if document_id is None or query_relevance_label is None:
+                raise EvaluationException(
+                    (
+                        "Invalid input data was found in the retrieval ground truth. "
+                        "Ensure that all items in the 'retrieval_ground_truth' array contain "
+                        "'document_id' and 'query_relevance_label' properties."
+                    )
+                )
+
+            if not isinstance(query_relevance_label, int):
+                raise EvaluationException(
+                    "Query relevance labels must be integer values."
+                )
+
+            if query_relevance_label < self.ground_truth_label_min:
+                raise EvaluationException(
+                    (
+                        "A query relevance label less than the configured minimum value was detected in the evaluation input data. "
+                        "Check the range of ground truth label values in the input data and set the value of ground_truth_label_min to "
+                        "the appropriate value for your data."
+                    )
+                )
+
+            if query_relevance_label > self.ground_truth_label_max:
+                raise EvaluationException(
+                    (
+                        "A query relevance label greater than the configured maximum value was detected in the evaluation input data. "
+                        "Check the range of ground truth label values in the input data and set the value of ground_truth_label_max to "
+                        "the appropriate value for your data."
+                    )
+                )
+
+            qrels.append(qrel)
+
+        # validate retrieved documents to be sure they are the correct type
+        results = []
+
+        if isinstance(retrieved_documents, list):
+            for result in retrieved_documents:
+                document_id = result.get("document_id")
+                relevance_score = result.get("relevance_score")
+
+                if document_id is None or relevance_score is None:
+                    raise EvaluationException(
+                        (
+                            "Invalid input data was found in the retrieved documents. "
+                            "Ensure that all items in the 'retrieved_documents' array contain "
+                            "'document_id' and 'relevance_score' properties."
+                        )
+                    )
+
+                if not isinstance(relevance_score, float) and not isinstance(
+                    relevance_score, int
+                ):
+                    raise EvaluationException(
+                        "Retrieved document relevance score must be a numerical value."
+                    )
+
+                results.append(result)
+
+        if len(qrels) > 10000 or len(results) > 10000:
+            raise EvaluationException(
+                "'retrieval_ground_truth' and 'retrieved_documents' inputs should contain no more than 10000 items."
+            )
+
+        return qrels, results
+
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce a document retrieval evaluation result.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict[str, float]
+        """
+        qrels, results = self._validate_eval_input(eval_input)
+
+        # if the results set is empty, results are all zero
+        if len(results) == 0:
+            metrics = {
+                f"ndcg@{self.k}": 0.0,
+                f"xdcg@{self.k}": 0.0,
+                "fidelity": 0.0,
+                "top1_relevance": 0.0,
+                "top3_max_relevance": 0.0,
+                "holes": 0,
+                "holes_ratio": 0,
+                "total_retrieved_documents": len(results),
+                "total_ground_truth_documents": len(qrels),
+            }
+            binary_result = self._get_binary_result(**metrics)
+            for k, v in binary_result.items():
+                metrics[k] = v
+
+            return metrics
+
+        # flatten qrels and results to normal dictionaries
+        qrels_lookup = {x["document_id"]: x["query_relevance_label"] for x in qrels}
+        results_lookup = {x["document_id"]: x["relevance_score"] for x in results}
+
+        # sort each input set by label to get the ranking
+        qrels_sorted_by_rank = sorted(
+            qrels_lookup.items(), key=lambda x: x[1], reverse=True
+        )
+        results_sorted_by_rank = sorted(
+            results_lookup.items(), key=lambda x: x[1], reverse=True
+        )
+
+        # find ground truth labels for the results set and ideal set
+        result_docs_groundtruth_labels = [
+            qrels_lookup[doc_id] if doc_id in qrels_lookup else 0
+            for (doc_id, _) in results_sorted_by_rank
+        ]
+        ideal_docs_groundtruth_labels = [label for (_, label) in qrels_sorted_by_rank]
+
+        # calculate the proportion of result docs with no ground truth label (holes)
+        holes = self._compute_holes(
+            [x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank]
+        )
+        holes_ratio = holes / float(len(results))
+
+        # if none of the retrieved docs are labeled, report holes only
+        if not any(result_docs_groundtruth_labels):
+            metrics = {
+                f"ndcg@{self.k}": 0,
+                f"xdcg@{self.k}": 0,
+                "fidelity": 0,
+                "top1_relevance": 0,
+                "top3_max_relevance": 0,
+                "holes": holes,
+                "holes_ratio": holes_ratio,
+                "total_retrieved_documents": len(results),
+                "total_ground_truth_documents": len(qrels),
+            }
+            binary_result = self._get_binary_result(**metrics)
+            for k, v in binary_result.items():
+                metrics[k] = v
+
+            return metrics
+
+        metrics = {
+            f"ndcg@{self.k}": self._compute_ndcg(
+                result_docs_groundtruth_labels[: self.k],
+                ideal_docs_groundtruth_labels[: self.k],
+            ),
+            f"xdcg@{self.k}": self._compute_xdcg(
+                result_docs_groundtruth_labels[: self.k]
+            ),
+            "fidelity": self._compute_fidelity(
+                result_docs_groundtruth_labels, ideal_docs_groundtruth_labels
+            ),
+            "top1_relevance": result_docs_groundtruth_labels[0],
+            "top3_max_relevance": max(result_docs_groundtruth_labels[: self.k]),
+            "holes": holes,
+            "holes_ratio": holes_ratio,
+            "total_retrieved_documents": len(results),
+            "total_ground_truth_documents": len(qrels),
+        }
+
+        binary_result = self._get_binary_result(**metrics)
+        for k, v in binary_result.items():
+            metrics[k] = v
+
+        return metrics
+
+    @overload
+    def __call__(  # type: ignore
+        self,
+        *,
+        retrieval_ground_truth: List[RetrievalGroundTruthDocument],
+        retrieved_documents: List[RetrievedDocument],
+    ) -> Dict[str, float]:
+        """
+        Compute document retrieval metrics for documents retrieved from a search algorithm against a known set of ground truth documents.
+
+        Evaluation metrics calculated include NDCG@3, XDCG@3, Fidelity, Top K Relevance and Holes.
+
+        :keyword retrieval_ground_truth: a list of ground-truth document judgements for a query, where each item in the list contains a unique document identifier and a query relevance label.
+        :paramtype retrieval_ground_truth: List[azure.ai.evaluation.RetrievalGroundTruthDocument]
+        :keyword retrieved_documents: a list of documents scored by a search algorithm for a query, where each item in the list contains a unique document identifier and a relevance score.
+        :paramtype retrieved_documents: List[azure.ai.evaluation.RetrievedDocument]
+        :return: The document retrieval metrics.
+        :rtype: Dict[str, float]
+        """
+
+    @override
+    def __call__(self, *args, **kwargs):
+        """
+        Compute document retrieval metrics for documents retrieved from a search algorithm against a known set of ground truth documents.
+
+        Evaluation metrics calculated include NDCG@3, XDCG@3, Fidelity, Top K Relevance and Holes.
+
+        :keyword retrieval_ground_truth: a list of ground-truth document judgements for a query, where each item in the list contains a unique document identifier and a query relevance label.
+        :paramtype retrieval_ground_truth: List[azure.ai.evaluation.RetrievalGroundTruthDocument]
+        :keyword retrieved_documents: a list of documents scored by a search algorithm for a query, where each item in the list contains a unique document identifier and a relevance score.
+        :paramtype retrieved_documents: List[azure.ai.evaluation.RetrievedDocument]
+        :return: The document retrieval metrics.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
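
The new DocumentRetrievalEvaluator above is called with two keyword arguments, retrieval_ground_truth and retrieved_documents, matching the TypedDict shapes declared at the top of the file. A minimal usage sketch with invented document IDs and labels; it assumes the class is re-exported from the azure.ai.evaluation package root, as the :paramtype references suggest:

    # Illustrative sketch only: document IDs, labels, and scores are invented.
    from azure.ai.evaluation import DocumentRetrievalEvaluator  # assumed top-level re-export

    retrieval_ground_truth = [
        {"document_id": "doc-1", "query_relevance_label": 4},
        {"document_id": "doc-2", "query_relevance_label": 2},
        {"document_id": "doc-3", "query_relevance_label": 0},
    ]
    retrieved_documents = [
        {"document_id": "doc-2", "relevance_score": 45.1},
        {"document_id": "doc-1", "relevance_score": 35.8},
        {"document_id": "doc-9", "relevance_score": 29.2},  # unlabeled, so it counts toward "holes"
    ]

    evaluator = DocumentRetrievalEvaluator(ground_truth_label_min=0, ground_truth_label_max=4)
    metrics = evaluator(
        retrieval_ground_truth=retrieval_ground_truth,
        retrieved_documents=retrieved_documents,
    )
    # Per _do_eval above, the result includes ndcg@3, xdcg@3, fidelity, top1_relevance,
    # top3_max_relevance, holes, holes_ratio, plus the *_result/*_threshold fields.
    print(metrics["ndcg@3"], metrics["holes"], metrics["ndcg@3_result"])

With these inputs the retrieved ranking is doc-2, doc-1, doc-9 against an ideal ordering of doc-1, doc-2, doc-3, so ndcg@3 comes out below 1 and holes is 1 (doc-9 has no ground-truth label).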
@@ -38,6 +38,16 @@ class F1ScoreEvaluator(EvaluatorBase):
             :dedent: 8
             :caption: Initialize and call an F1ScoreEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -23,7 +23,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :param threshold: The threshold for the fluency evaluator. Default is 5.
+    :param threshold: The threshold for the fluency evaluator. Default is 3.
     :type threshold: int
 
     .. admonition:: Example:
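
The hunk above corrects FluencyEvaluator's documented default threshold from 5 to 3; only the docstring changes in this diff. A construction sketch under that documented signature; the model_config field names below are the usual AzureOpenAIModelConfiguration keys and are an assumption, since they are not shown in this diff:

    # Sketch only: endpoint/deployment/key values are placeholders, and the
    # model_config field names are assumed (not shown in this diff).
    from azure.ai.evaluation import FluencyEvaluator

    model_config = {
        "azure_endpoint": "https://<your-aoai-resource>.openai.azure.com",
        "azure_deployment": "<your-deployment>",
        "api_key": "<your-api-key>",
    }

    fluency = FluencyEvaluator(model_config=model_config, threshold=3)
    # Input field names follow the linked fluency_evaluator sample, not this hunk.
    print(fluency(response="Tokyo is the capital of Japan."))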
@@ -44,6 +44,16 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize with threshold and call a FluencyEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START fluency_evaluator]
+            :end-before: [END fluency_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call FluencyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -43,6 +43,16 @@ class GleuScoreEvaluator(EvaluatorBase):
             :language: python
             :dedent: 8
             :caption: Initialize with threshold and call a GleuScoreEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START gleu_score_evaluator]
+            :end-before: [END gleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
 
     id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
@@ -33,7 +33,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :param threshold: The threshold for the groundedness evaluator. Default is 5.
+    :param threshold: The threshold for the groundedness evaluator. Default is 3.
     :type threshold: int
 
     .. admonition:: Example:
@@ -53,6 +53,16 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize with threshold and call a GroundednessEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START groundedness_evaluator]
+            :end-before: [END groundedness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -33,6 +33,16 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call an IntentResolutionEvaluator with a query and response.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START intent_resolution_evaluator]
+            :end-before: [END intent_resolution_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     _PROMPTY_FILE = "intent_resolution.prompty"
@@ -47,11 +57,15 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD):
+    def __init__(self, model_config, *,
+                 threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD,
+                 **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+        super().__init__(model_config=model_config, prompty_file=prompty_path,
+                         result_key=self._RESULT_KEY,
+                         **kwargs)
 
     @overload
     def __call__(
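
The constructor change above lets IntentResolutionEvaluator accept arbitrary keyword arguments and forward them to the prompty-based base class instead of rejecting them. A brief sketch of constructing and calling it with the query/response pair its docstring describes; the model_config keys are the same assumed shape as in the FluencyEvaluator sketch earlier:

    # Sketch only: model_config keys are assumed, as in the earlier sketch.
    from azure.ai.evaluation import IntentResolutionEvaluator

    model_config = {
        "azure_endpoint": "https://<your-aoai-resource>.openai.azure.com",
        "azure_deployment": "<your-deployment>",
        "api_key": "<your-api-key>",
    }

    intent_resolution = IntentResolutionEvaluator(model_config=model_config, threshold=3)
    result = intent_resolution(
        query="What are the opening hours of the Eiffel Tower?",
        response="The Eiffel Tower is open from 9:00 AM to 11:45 PM.",
    )
    print(result)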
@@ -45,6 +45,16 @@ class MeteorScoreEvaluator(EvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START meteor_score_evaluator]
+            :end-before: [END meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -37,6 +37,17 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call a ProtectedMaterialEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START protected_material_evaluator]
+            :end-before: [END protected_material_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     id = "azureml://registries/azureml/models/Protected-Material-Evaluator/versions/3"
@@ -48,6 +48,16 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a QAEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START qa_evaluator]
+            :end-before: [END qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -27,7 +27,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
-    :param threshold: The threshold for the relevance evaluator. Default is 5.
+    :param threshold: The threshold for the relevance evaluator. Default is 3.
     :type threshold: int
 
     .. admonition:: Example:
@@ -39,6 +39,16 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START relevance_evaluator]
+            :end-before: [END relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
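
Most of the docstring hunks in this diff add the same "Example using Azure AI Project URL" admonition, pointing at samples that pass a project URL of the form https://{resource_name}.services.ai.azure.com/api/projects/{project_name}. A hedged sketch of that pattern with one of the service-backed evaluators; the azure_ai_project and credential keyword names, and the ability to pass the project URL as a plain string, are assumptions based on the captions above rather than on code shown in this diff:

    # Sketch only: keyword names and URL-string support are assumptions.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ProtectedMaterialEvaluator

    azure_ai_project = "https://<resource_name>.services.ai.azure.com/api/projects/<project_name>"

    protected_material = ProtectedMaterialEvaluator(
        azure_ai_project=azure_ai_project,
        credential=DefaultAzureCredential(),
    )
    result = protected_material(
        query="Write out the full lyrics of a current chart-topping song.",
        response="I can't reproduce copyrighted lyrics, but I can summarize the song's themes.",
    )
    print(result)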