azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (123)
  1. azure/ai/evaluation/__init__.py +9 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +5 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +158 -28
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_eval_mapping.py +71 -0
  59. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  60. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  62. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  63. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  64. azure/ai/evaluation/_evaluate/_evaluate.py +325 -74
  65. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
  66. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  67. azure/ai/evaluation/_evaluators/_common/_base_eval.py +8 -3
  68. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  69. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  70. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  71. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
  72. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
  73. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +1 -1
  74. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
  76. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +7 -2
  77. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  78. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
  79. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
  80. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
  81. azure/ai/evaluation/_exceptions.py +2 -0
  82. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  83. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  84. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  85. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  86. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  87. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  88. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  90. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  91. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  92. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  93. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  94. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  95. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  96. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  97. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  98. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  99. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
  100. azure/ai/evaluation/_version.py +1 -1
  101. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  102. azure/ai/evaluation/red_team/_red_team.py +825 -450
  103. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  104. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  105. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  106. azure/ai/evaluation/simulator/_constants.py +1 -0
  107. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  108. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  109. azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
  110. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  111. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
  112. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  113. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
  114. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  115. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  116. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
  117. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  118. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +25 -2
  119. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +123 -65
  120. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  121. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
  122. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
  123. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,467 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import math
+ import operator
+ from itertools import starmap
+ from typing import Dict, List, TypedDict, Tuple, Optional
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._exceptions import EvaluationException
+ from typing_extensions import override, overload
+
+
+ RetrievalGroundTruthDocument = TypedDict(
+     "RetrievalGroundTruthDocument", {"document_id": str, "query_relevance_label": int}
+ )
+
+ RetrievedDocument = TypedDict(
+     "RetrievedDocument", {"document_id": str, "relevance_score": float}
+ )
+
+
+ class DocumentRetrievalEvaluator(EvaluatorBase):
+     """
+     Calculate document retrieval metrics, such as NDCG, XDCG, Fidelity, Top K Relevance and Holes.
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START document_retrieval_evaluator]
+             :end-before: [END document_retrieval_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a Document RetrievalEvaluator
+
+     .. admonition:: Example with Threshold:
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_document_retrieval_evaluator]
+             :end-before: [END threshold_document_retrieval_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a DocumentRetrievalEvaluator.
+     """
+
+     def __init__(
+         self,
+         *,
+         ground_truth_label_min: int = 0,
+         ground_truth_label_max: int = 4,
+         threshold: Optional[dict] = None,
+     ):
+         super().__init__()
+         self.k = 3
+         self.xdcg_discount_factor = 0.6
+
+         if ground_truth_label_min >= ground_truth_label_max:
+             raise EvaluationException(
+                 "The ground truth label maximum must be strictly greater than the ground truth label minimum."
+             )
+
+         if not isinstance(ground_truth_label_min, int):
+             raise EvaluationException(
+                 "The ground truth label minimum must be an integer value."
+             )
+
+         if not isinstance(ground_truth_label_max, int):
+             raise EvaluationException(
+                 "The ground truth label maximum must be an integer value."
+             )
+
+         self.ground_truth_label_min = ground_truth_label_min
+         self.ground_truth_label_max = ground_truth_label_max
+
+         # The default threshold for metrics where higher numbers are better.
+         self._threshold_metrics = {
+             "ndcg@3": 0.5,
+             "xdcg@3": 0.5,
+             "fidelity": 0.5,
+             "top1_relevance": 50,
+             "top3_max_relevance": 50,
+             "total_retrieved_documents": 50,
+             "total_ground_truth_documents": 50,
+         }
+
+         # Ideally, the number of holes should be zero.
+         self._threshold_holes = {"holes": 0, "holes_ratio": 0}
+
+         if threshold and not isinstance(threshold, dict):
+             raise EvaluationException(
+                 f"Threshold must be a dictionary, got {type(threshold)}"
+             )
+
+         elif isinstance(threshold, dict):
+             self._threshold_metrics.update(threshold)
+
+     def _compute_holes(self, actual_docs: List[str], labeled_docs: List[str]) -> int:
+         """
+         The number of documents retrieved from a search query which have no provided ground-truth label.
+         This metric is helpful for determining the accuracy of other metrics that are highly sensitive to missing ground-truth knowledge,
+         such as NDCG, XDCG, and Fidelity.
+
+         :param actual_docs: A list of retrieved documents' IDs.
+         :type actual_docs: List[str]
+         :param labeled_docs: A list of ideal documents' IDs.
+         :type labeled: List[str]
+         :return: The holes calculation result.
+         :rtype: int
+         """
+         return len(set(actual_docs).difference(set(labeled_docs)))
+
+     def _compute_ndcg(
+         self,
+         result_docs_groundtruth_labels: List[int],
+         ideal_docs_groundtruth_labels: List[int],
+     ) -> float:
+         """NDCG (Normalized Discounted Cumulative Gain) calculated for the top K documents retrieved from a search query.
+         NDCG measures how well a document ranking compares to an ideal document ranking given a list of ground-truth documents.
+
+         :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
+         :type result_docs_groundtruth_labels: List[int]
+         :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
+         :type ideal_docs_groundtruth_labels: List[int]
+         :return: The NDCG@K calculation result.
+         :rtype: float
+         """
+
+         # Set the scoring function
+         def calculate_dcg(relevance: float, rank: int):
+             return (math.pow(2, relevance) - 1) / (math.log2(rank + 1))
+
+         ranks = list(range(1, self.k + 1))
+         dcg = sum(starmap(calculate_dcg, zip(result_docs_groundtruth_labels, ranks)))
+         idcg = sum(starmap(calculate_dcg, zip(ideal_docs_groundtruth_labels, ranks)))
+         ndcg = dcg / float(idcg)
+
+         return ndcg
+
+     def _compute_xdcg(self, result_docs_groundtruth_labels: List[int]) -> float:
+         """XDCG calculated for the top K documents retrieved from a search query.
+         XDCG measures how objectively good are the top K documents, discounted by their position in the list.
+
+         :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
+         :type result_docs_groundtruth_labels: List[int]
+         :return: The XDCG@K calculation result.
+         :rtype: float
+         """
+
+         def calculate_xdcg_numerator(relevance, rank):
+             return 25 * relevance * math.pow(self.xdcg_discount_factor, rank - 1)
+
+         def calculate_xdcg_denominator(rank):
+             return math.pow(self.xdcg_discount_factor, rank - 1)
+
+         ranks = list(range(1, self.k + 1))
+         xdcg_n = sum(
+             starmap(
+                 calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)
+             )
+         )
+         xdcg_d = sum(map(calculate_xdcg_denominator, ranks))
+
+         return xdcg_n / float(xdcg_d)
+
+     def _compute_fidelity(
+         self,
+         result_docs_groundtruth_labels: List[int],
+         ideal_docs_groundtruth_labels: List[int],
+     ) -> float:
+         """Fidelity calculated over all documents retrieved from a search query.
+         Fidelity measures how objectively good are all of the documents retrieved compared with all known good documents in the underlying data store.
+
+         :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
+         :type result_docs_groundtruth_labels: List[int]
+         :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
+         :type ideal_docs_groundtruth_labels: List[int]
+         :return: The fidelity calculation result.
+         :rtype: float
+         """
+
+         def calculate_weighted_sum_by_rating(labels: List[int]) -> float:
+             # here we assume that the configured groundtruth label minimum translates to "irrelevant",
+             # so we exclude documents with that label from the calculation.
+             s = self.ground_truth_label_min + 1
+
+             # get a count of each label
+             label_counts = {str(i): 0 for i in range(s, self.ground_truth_label_max + 1)}
+
+             for label in labels:
+                 if label >= s:
+                     label_counts[str(label)] += 1
+
+             sorted_label_counts = [
+                 x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])
+             ]
+
+             # calculate weights
+             weights = [
+                 (math.pow(2, i + 1) - 1)
+                 for i in range(s, self.ground_truth_label_max + 1)
+             ]
+
+             # return weighted sum
+             return sum(starmap(operator.mul, zip(sorted_label_counts, weights)))
+
+         weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(
+             result_docs_groundtruth_labels
+         )
+         weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(
+             ideal_docs_groundtruth_labels
+         )
+
+         if weighted_sum_by_rating_index == 0:
+             return math.nan
+
+         return weighted_sum_by_rating_results / float(weighted_sum_by_rating_index)
+
+     def _get_binary_result(self, **metrics) -> Dict[str, float]:
+         result = {}
+
+         for metric_name, metric_value in metrics.items():
+             if metric_name in self._threshold_metrics.keys():
+                 result[f"{metric_name}_result"] = (
+                     metric_value >= self._threshold_metrics[metric_name]
+                 )
+                 result[f"{metric_name}_threshold"] = self._threshold_metrics[
+                     metric_name
+                 ]
+                 result[f"{metric_name}_higher_is_better"] = True
+
+             elif metric_name in self._threshold_holes.keys():
+                 result[f"{metric_name}_result"] = (
+                     metric_value <= self._threshold_holes[metric_name]
+                 )
+                 result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
+                 result[f"{metric_name}_higher_is_better"] = False
+
+             else:
+                 raise ValueError(f"No threshold set for metric '{metric_name}'")
+
+         return result
+
+     def _validate_eval_input(
+         self, eval_input: Dict
+     ) -> Tuple[List[RetrievalGroundTruthDocument], List[RetrievedDocument]]:
+         """Validate document retrieval evaluator inputs.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Tuple[List[azure.ai.evaluation.RetrievalGroundTruthDocument], List[azure.ai.evaluation.RetrievedDocument]]
+         """
+         retrieval_ground_truth = eval_input.get("retrieval_ground_truth")
+         retrieved_documents = eval_input.get("retrieved_documents")
+
+         # if the qrels are empty, no meaningful evaluation is possible
+         if not retrieval_ground_truth:
+             raise EvaluationException(
+                 ("'retrieval_ground_truth' parameter must contain at least one item. "
+                  "Check your data input to be sure that each input record has ground truth defined.")
+             )
+
+         qrels = []
+
+         # validate the qrels to be sure they are the correct type and are bounded by the given configuration
+         for qrel in retrieval_ground_truth:
+             document_id = qrel.get("document_id")
+             query_relevance_label = qrel.get("query_relevance_label")
+
+             if document_id is None or query_relevance_label is None:
+                 raise EvaluationException(
+                     (
+                         "Invalid input data was found in the retrieval ground truth. "
+                         "Ensure that all items in the 'retrieval_ground_truth' array contain "
+                         "'document_id' and 'query_relevance_label' properties."
+                     )
+                 )
+
+             if not isinstance(query_relevance_label, int):
+                 raise EvaluationException(
+                     "Query relevance labels must be integer values."
+                 )
+
+             if query_relevance_label < self.ground_truth_label_min:
+                 raise EvaluationException(
+                     (
+                         "A query relevance label less than the configured minimum value was detected in the evaluation input data. "
+                         "Check the range of ground truth label values in the input data and set the value of ground_truth_label_min to "
+                         "the appropriate value for your data."
+                     )
+                 )
+
+             if query_relevance_label > self.ground_truth_label_max:
+                 raise EvaluationException(
+                     (
+                         "A query relevance label greater than the configured maximum value was detected in the evaluation input data. "
+                         "Check the range of ground truth label values in the input data and set the value of ground_truth_label_max to "
+                         "the appropriate value for your data."
+                     )
+                 )
+
+             qrels.append(qrel)
+
+         # validate retrieved documents to be sure they are the correct type
+         results = []
+
+         if isinstance(retrieved_documents, list):
+             for result in retrieved_documents:
+                 document_id = result.get("document_id")
+                 relevance_score = result.get("relevance_score")
+
+                 if document_id is None or relevance_score is None:
+                     raise EvaluationException(
+                         (
+                             "Invalid input data was found in the retrieved documents. "
+                             "Ensure that all items in the 'retrieved_documents' array contain "
+                             "'document_id' and 'relevance_score' properties."
+                         )
+                     )
+
+                 if not isinstance(relevance_score, float) and not isinstance(
+                     relevance_score, int
+                 ):
+                     raise EvaluationException(
+                         "Retrieved document relevance score must be a numerical value."
+                     )
+
+                 results.append(result)
+
+         if len(qrels) > 10000 or len(results) > 10000:
+             raise EvaluationException(
+                 "'retrieval_ground_truth' and 'retrieved_documents' inputs should contain no more than 10000 items."
+             )
+
+         return qrels, results
+
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+         """Produce a document retrieval evaluation result.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict[str, float]
+         """
+         qrels, results = self._validate_eval_input(eval_input)
+
+         # if the results set is empty, results are all zero
+         if len(results) == 0:
+             metrics = {
+                 f"ndcg@{self.k}": 0.0,
+                 f"xdcg@{self.k}": 0.0,
+                 "fidelity": 0.0,
+                 "top1_relevance": 0.0,
+                 "top3_max_relevance": 0.0,
+                 "holes": 0,
+                 "holes_ratio": 0,
+                 "total_retrieved_documents": len(results),
+                 "total_ground_truth_documents": len(qrels),
+             }
+             binary_result = self._get_binary_result(**metrics)
+             for k, v in binary_result.items():
+                 metrics[k] = v
+
+             return metrics
+
+         # flatten qrels and results to normal dictionaries
+         qrels_lookup = {x["document_id"]: x["query_relevance_label"] for x in qrels}
+         results_lookup = {x["document_id"]: x["relevance_score"] for x in results}
+
+         # sort each input set by label to get the ranking
+         qrels_sorted_by_rank = sorted(
+             qrels_lookup.items(), key=lambda x: x[1], reverse=True
+         )
+         results_sorted_by_rank = sorted(
+             results_lookup.items(), key=lambda x: x[1], reverse=True
+         )
+
+         # find ground truth labels for the results set and ideal set
+         result_docs_groundtruth_labels = [
+             qrels_lookup[doc_id] if doc_id in qrels_lookup else 0
+             for (doc_id, _) in results_sorted_by_rank
+         ]
+         ideal_docs_groundtruth_labels = [label for (_, label) in qrels_sorted_by_rank]
+
+         # calculate the proportion of result docs with no ground truth label (holes)
+         holes = self._compute_holes(
+             [x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank]
+         )
+         holes_ratio = holes / float(len(results))
+
+         # if none of the retrieved docs are labeled, report holes only
+         if not any(result_docs_groundtruth_labels):
+             metrics = {
+                 f"ndcg@{self.k}": 0,
+                 f"xdcg@{self.k}": 0,
+                 "fidelity": 0,
+                 "top1_relevance": 0,
+                 "top3_max_relevance": 0,
+                 "holes": holes,
+                 "holes_ratio": holes_ratio,
+                 "total_retrieved_documents": len(results),
+                 "total_ground_truth_documents": len(qrels),
+             }
+             binary_result = self._get_binary_result(**metrics)
+             for k, v in binary_result.items():
+                 metrics[k] = v
+
+             return metrics
+
+         metrics = {
+             f"ndcg@{self.k}": self._compute_ndcg(
+                 result_docs_groundtruth_labels[: self.k],
+                 ideal_docs_groundtruth_labels[: self.k],
+             ),
+             f"xdcg@{self.k}": self._compute_xdcg(
+                 result_docs_groundtruth_labels[: self.k]
+             ),
+             "fidelity": self._compute_fidelity(
+                 result_docs_groundtruth_labels, ideal_docs_groundtruth_labels
+             ),
+             "top1_relevance": result_docs_groundtruth_labels[0],
+             "top3_max_relevance": max(result_docs_groundtruth_labels[: self.k]),
+             "holes": holes,
+             "holes_ratio": holes_ratio,
+             "total_retrieved_documents": len(results),
+             "total_ground_truth_documents": len(qrels),
+         }
+
+         binary_result = self._get_binary_result(**metrics)
+         for k, v in binary_result.items():
+             metrics[k] = v
+
+         return metrics
+
+     @overload
+     def __call__(  # type: ignore
+         self,
+         *,
+         retrieval_ground_truth: List[RetrievalGroundTruthDocument],
+         retrieved_documents: List[RetrievedDocument],
+     ) -> Dict[str, float]:
+         """
+         Compute document retrieval metrics for documents retrieved from a search algorithm against a known set of ground truth documents.
+
+         Evaluation metrics calculated include NDCG@3, XDCG@3, Fidelity, Top K Relevance and Holes.
+
+         :keyword retrieval_ground_truth: a list of ground-truth document judgements for a query, where each item in the list contains a unique document identifier and a query relevance label.
+         :paramtype retrieval_ground_truth: List[azure.ai.evaluation.RetrievalGroundTruthDocument]
+         :keyword retrieved_documents: a list of documents scored by a search algorithm for a query, where each item in the list contains a unique document identifier and a relevance score.
+         :paramtype retrieved_documents: List[azure.ai.evaluation.RetrievedDocument]
+         :return: The document retrieval metrics.
+         :rtype: Dict[str, float]
+         """
+
+     @override
+     def __call__(self, *args, **kwargs):
+         """
+         Compute document retrieval metrics for documents retrieved from a search algorithm against a known set of ground truth documents.
+
+         Evaluation metrics calculated include NDCG@3, XDCG@3, Fidelity, Top K Relevance and Holes.
+
+         :keyword retrieval_ground_truth: a list of ground-truth document judgements for a query, where each item in the list contains a unique document identifier and a query relevance label.
+         :paramtype retrieval_ground_truth: List[azure.ai.evaluation.RetrievalGroundTruthDocument]
+         :keyword retrieved_documents: a list of documents scored by a search algorithm for a query, where each item in the list contains a unique document identifier and a relevance score.
+         :paramtype retrieved_documents: List[azure.ai.evaluation.RetrievedDocument]
+         :return: The document retrieval metrics.
+         :rtype: Dict[str, float]
+         """
+         return super().__call__(*args, **kwargs)
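
For orientation, here is a minimal usage sketch of the new DocumentRetrievalEvaluator, inferred from the signatures in the diff above. The document IDs, labels, and scores are illustrative, and the top-level import path is an assumption (azure/ai/evaluation/__init__.py gains new exports in this release); the constructor keywords, call keywords, and returned metric names come directly from the code shown.

# Hypothetical usage sketch; import path and sample data are assumptions.
from azure.ai.evaluation import DocumentRetrievalEvaluator

# Ground-truth judgements for one query: document_id + query_relevance_label.
retrieval_ground_truth = [
    {"document_id": "doc-1", "query_relevance_label": 4},
    {"document_id": "doc-2", "query_relevance_label": 2},
    {"document_id": "doc-3", "query_relevance_label": 0},
]

# Documents returned by the search system: document_id + relevance_score.
retrieved_documents = [
    {"document_id": "doc-1", "relevance_score": 0.92},
    {"document_id": "doc-4", "relevance_score": 0.75},  # unlabeled, so it counts as a "hole"
    {"document_id": "doc-2", "relevance_score": 0.31},
]

evaluator = DocumentRetrievalEvaluator(
    ground_truth_label_min=0,
    ground_truth_label_max=4,
    threshold={"ndcg@3": 0.6},  # optional per-metric threshold overrides
)
metrics = evaluator(
    retrieval_ground_truth=retrieval_ground_truth,
    retrieved_documents=retrieved_documents,
)
print(metrics["ndcg@3"], metrics["fidelity"], metrics["holes"])
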
@@ -23,7 +23,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :param model_config: Configuration for the Azure OpenAI model.
  :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
      ~azure.ai.evaluation.OpenAIModelConfiguration]
- :param threshold: The threshold for the fluency evaluator. Default is 5.
+ :param threshold: The threshold for the fluency evaluator. Default is 3.
  :type threshold: int

  .. admonition:: Example:
@@ -33,7 +33,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  :param model_config: Configuration for the Azure OpenAI model.
  :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
      ~azure.ai.evaluation.OpenAIModelConfiguration]
- :param threshold: The threshold for the groundedness evaluator. Default is 5.
+ :param threshold: The threshold for the groundedness evaluator. Default is 3.
  :type threshold: int

  .. admonition:: Example:
@@ -47,11 +47,15 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD):
+ def __init__(self, model_config, *,
+              threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD,
+              **kwargs):
      current_dir = os.path.dirname(__file__)
      prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
      self.threshold = threshold
-     super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+     super().__init__(model_config=model_config, prompty_file=prompty_path,
+                      result_key=self._RESULT_KEY,
+                      **kwargs)

  @overload
  def __call__(
@@ -27,7 +27,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
  :param model_config: Configuration for the Azure OpenAI model.
  :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
      ~azure.ai.evaluation.OpenAIModelConfiguration]
- :param threshold: The threshold for the relevance evaluator. Default is 5.
+ :param threshold: The threshold for the relevance evaluator. Default is 3.
  :type threshold: int

  .. admonition:: Example:
@@ -60,11 +60,16 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

  @override
- def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD):
+ def __init__(self, model_config, *,
+              threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD,
+              **kwargs):
      current_dir = os.path.dirname(__file__)
      prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
      self.threshold = threshold
-     super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+     super().__init__(model_config=model_config,
+                      prompty_file=prompty_path,
+                      result_key=self._RESULT_KEY,
+                      **kwargs)

  @overload
  def __call__(
@@ -22,65 +22,51 @@ inputs:
  ---
  system:
  # Instruction
- ## Context
- ### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided.
- - **Definition**: You are given a definition of the response quality that is being evaluated to help guide your Score.
- - **Data**: Your input data include a response and its ground truth.
- - **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways.
-
+ ## Goal
+ ### You are an expert in evaluating the quality of a Response from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
+ - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
+ - **Data**: Your input data include Response and Ground Truth.
+ - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.

+ user:
  # Definition
+ **Completeness** refers to how accurately and thoroughly a response represents the information provided in the ground truth. It considers both the inclusion of all relevant statements and the correctness of those statements. Each statement in the ground truth should be evaluated individually to determine if it is accurately reflected in the response without missing any key information. The scale ranges from 1 to 5, with higher numbers indicating greater completeness.

- **Level 1: Fully incomplete**
-
- **Definition:**
- A response is considered fully incomplete if it does not contain any the necessary and relevant information with respect to the ground truth. In other words, it completely misses all the information - especially claims and statements - established in the ground truth.
+ # Ratings
+ ## [Completeness: 1] (Fully Incomplete)
+ **Definition:** A response that does not contain any of the necessary and relevant information with respect to the ground truth. It completely misses all the information, especially claims and statements, established in the ground truth.

  **Examples:**
- 1. **Response:** "Flu shot cannot cure cancer. Stay healthy requires sleeping exactly 8 hours a day. A few hours of exercise per week will have little benefits for physical and mental health. Physical and mental health benefits are separate topics. Scientists have not studied any of them."
-    **Ground Truth:** "Flu shot can prevent flu-related illnesses. Staying healthy requires proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
-
-
- **Level 2: Barely complete**
+ **Response:** "Flu shot cannot cure cancer. Stay healthy requires sleeping exactly 8 hours a day. A few hours of exercise per week will have little benefits for physical and mental health. Physical and mental health benefits are separate topics. Scientists have not studied any of them."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Staying healthy requires proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."

- **Definition:**
- A response is considered barely complete if it only contains a small percentage of all the necessary and relevant information with respect to the ground truth. In other words, it misses almost all the information - especially claims and statements - established in the ground truth.
+ ## [Completeness: 2] (Barely Complete)
+ **Definition:** A response that contains only a small percentage of all the necessary and relevant information with respect to the ground truth. It misses almost all the information, especially claims and statements, established in the ground truth.

  **Examples:**
- 1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires 2 meals a day. Exercise per week makes not difference to physical and mental health. This is because physical and mental health benefits have low correlation through scientific studies. Scientists are making this observation in studies."
-    **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires 2 meals a day. Exercise per week makes no difference to physical and mental health. This is because physical and mental health benefits have low correlation through scientific studies. Scientists are making this observation in studies."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."

-
- **Level 3: Moderately complete**
-
- **Definition:**
- A response is considered moderately complete if it contains half of the necessary and relevant information with respect to the ground truth. In other words, it miss half of the information - especially claims and statements - established in the ground truth.
+ ## [Completeness: 3] (Moderately Complete)
+ **Definition:** A response that contains half of the necessary and relevant information with respect to the ground truth. It misses half of the information, especially claims and statements, established in the ground truth.

  **Examples:**
- 1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires a few dollar of investments a day. Even a few dollars of investments per week will not make an impact on physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Fiction writers are starting to discover them through their works."
-    **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
-
-
- **Level 4: Mostly complete**
-
- **Definition:**
- A response is considered mostly complete if it contains most of the necessary and relevant information with respect to the ground truth. In other words, it misses some minor information - especially claims and statements - established in the ground truth.
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires a few dollars of investments a day. Even a few dollars of investments per week will not make an impact on physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Fiction writers are starting to discover them through their works."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."

+ ## [Completeness: 4] (Mostly Complete)
+ **Definition:** A response that contains most of the necessary and relevant information with respect to the ground truth. It misses some minor information, especially claims and statements, established in the ground truth.

  **Examples:**
- 1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires keto diet and rigorous athletic training. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
-    **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
-
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires keto diet and rigorous athletic training. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."

- **Level 5: Fully complete**
-
- **Definition:**
- A response is considered complete if it perfectly contains all the necessary and relevant information with respect to the ground truth. In other words, it does not miss any information from statements and claims in the ground truth.
+ ## [Completeness: 5] (Fully Complete)
+ **Definition:** A response that perfectly contains all the necessary and relevant information with respect to the ground truth. It does not miss any information from statements and claims in the ground truth.

  **Examples:**
- 1. **Response:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
-    **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
-
+ **Response:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."


  # Data
@@ -89,11 +75,10 @@ Ground Truth: {{ground_truth}}


  # Tasks
- ## Please provide your assessment Score for the previous answer. Your output should include the following information:
- - **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:".
+ ## Please provide your assessment Score for the previous RESPONSE in relation to the GROUND TRUTH based on the Definitions above. Your output should include the following information:
+ - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
  - **Explanation**: a very short explanation of why you think the input data should get that Score.
- - **Score**: based on your previous analysis, provide your Score. The answer you give MUST be a integer score ("1", "2", ...) based on the categories of the definitions.
-
+ - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be an integer score (i.e., "1", "2"...) based on the levels of the definitions.

  ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your score</S2>.
  # Output
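
The reworked prompty above scores a Response against a Ground Truth on the completeness rubric. A minimal sketch of how the updated evaluator might be invoked follows; the response/ground_truth call keywords are assumed from the prompty inputs, the top-level import is an assumption, and model_config stands in for an AzureOpenAIModelConfiguration or OpenAIModelConfiguration defined elsewhere.

# Hypothetical sketch; call keywords mirror the prompty inputs above.
from azure.ai.evaluation import ResponseCompletenessEvaluator

completeness = ResponseCompletenessEvaluator(
    model_config=model_config,  # AzureOpenAIModelConfiguration/OpenAIModelConfiguration, defined elsewhere
    threshold=3,                # optional; extra keyword arguments are now forwarded to the base class
)
result = completeness(
    response="Flu shot can prevent flu-related illnesses.",
    ground_truth=(
        "Flu shot can prevent flu-related illnesses. "
        "Stay healthy by proper hydration and moderate exercise."
    ),
)
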
@@ -28,7 +28,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
  :param model_config: Configuration for the Azure OpenAI model.
  :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
      ~azure.ai.evaluation.OpenAIModelConfiguration]
- :param threshold: The threshold for the similarity evaluator. Default is 5.
+ :param threshold: The threshold for the similarity evaluator. Default is 3.
  :type threshold: int

  .. admonition:: Example:
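
The remaining hunks only adjust documentation: FluencyEvaluator, GroundednessEvaluator, RelevanceEvaluator, and SimilarityEvaluator now document the default threshold as 3 rather than 5. A caller who wants a different cut-off can still pass the keyword explicitly, as in this hedged sketch (the keyword name is taken from the docstrings above; model_config and the top-level import are assumptions).

# Hedged sketch: overriding the documented default threshold of 3.
from azure.ai.evaluation import FluencyEvaluator

fluency = FluencyEvaluator(model_config=model_config, threshold=4)  # stricter than the 1.6.0 default of 3
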