azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +13 -2
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +86 -50
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +124 -3
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +60 -54
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +24 -15
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +21 -21
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1286 -739
- azure/ai/evaluation/red_team/_red_team_result.py +43 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +32 -32
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +2 -12
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +26 -15
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +10 -8
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +9 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +15 -1
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -131
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py

```diff
@@ -14,9 +14,7 @@ RetrievalGroundTruthDocument = TypedDict(
     "RetrievalGroundTruthDocument", {"document_id": str, "query_relevance_label": int}
 )
 
-RetrievedDocument = TypedDict(
-    "RetrievedDocument", {"document_id": str, "relevance_score": float}
-)
+RetrievedDocument = TypedDict("RetrievedDocument", {"document_id": str, "relevance_score": float})
 
 
 class DocumentRetrievalEvaluator(EvaluatorBase):
```
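
The two TypedDicts above define the evaluator's input records. A minimal sketch of how such inputs could be constructed and passed in; `retrieval_ground_truth` is the keyword named in this file's validation message further down, while `retrieved_documents` and the printed metric names are assumptions drawn from this diff rather than confirmed API facts:

```python
# Illustrative sketch only; `retrieved_documents` is an assumed keyword name.
from azure.ai.evaluation import DocumentRetrievalEvaluator

retrieval_ground_truth = [
    {"document_id": "doc_1", "query_relevance_label": 3},
    {"document_id": "doc_2", "query_relevance_label": 1},
    {"document_id": "doc_3", "query_relevance_label": 0},
]
retrieved_documents = [
    {"document_id": "doc_2", "relevance_score": 45.1},
    {"document_id": "doc_1", "relevance_score": 35.8},
    {"document_id": "doc_9", "relevance_score": 12.0},  # unlabeled document, counted as a "hole"
]

# ground_truth_label_min/max appear as validated constructor inputs later in this file.
evaluator = DocumentRetrievalEvaluator(ground_truth_label_min=0, ground_truth_label_max=3)
result = evaluator(
    retrieval_ground_truth=retrieval_ground_truth,
    retrieved_documents=retrieved_documents,  # assumed keyword
)
print(result)  # keys per the metric hunks below: ndcg@3, xdcg@3, fidelity, holes, holes_ratio, ...
```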
```diff
@@ -33,15 +31,15 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
             :caption: Initialize and call a DocumentRetrievalEvaluator
 
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START document_retrieval_evaluator]
             :end-before: [END document_retrieval_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
+            :caption: Initialize and call DocumentRetrievalEvaluator using Azure AI Project URL in following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
             :start-after: [START threshold_document_retrieval_evaluator]
@@ -62,7 +60,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         top1_relevance_threshold: Optional[float] = 50.0,
         top3_max_relevance_threshold: Optional[float] = 50.0,
         total_retrieved_documents_threshold: Optional[int] = 50,
-        total_ground_truth_documents_threshold: Optional[int] = 50
+        total_ground_truth_documents_threshold: Optional[int] = 50,
     ):
         super().__init__()
         self.k = 3
@@ -74,14 +72,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         )
 
         if not isinstance(ground_truth_label_min, int):
-            raise EvaluationException(
-                "The ground truth label minimum must be an integer value."
-            )
+            raise EvaluationException("The ground truth label minimum must be an integer value.")
 
         if not isinstance(ground_truth_label_max, int):
-            raise EvaluationException(
-                "The ground truth label maximum must be an integer value."
-            )
+            raise EvaluationException("The ground truth label maximum must be an integer value.")
 
         self.ground_truth_label_min = ground_truth_label_min
         self.ground_truth_label_max = ground_truth_label_max
```
```diff
@@ -122,7 +116,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
     ) -> float:
         """NDCG (Normalized Discounted Cumulative Gain) calculated for the top K documents retrieved from a search query.
         NDCG measures how well a document ranking compares to an ideal document ranking given a list of ground-truth documents.
-
+
         :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
         :type result_docs_groundtruth_labels: List[int]
         :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
```
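
The docstring above describes NDCG in prose. For reference, this is the conventional NDCG@K computation (exponential gain, log2 position discount); it is a textbook formulation, not a claim about the exact constants the evaluator uses internally:

```python
import math
from typing import List


def ndcg_at_k(result_labels: List[int], ideal_labels: List[int], k: int = 3) -> float:
    """Textbook NDCG@K: DCG of the retrieved ranking divided by DCG of the ideal ranking."""

    def dcg(labels: List[int]) -> float:
        return sum((2**label - 1) / math.log2(rank + 1) for rank, label in enumerate(labels[:k], start=1))

    ideal = dcg(sorted(ideal_labels, reverse=True))
    return dcg(result_labels) / ideal if ideal > 0 else math.nan


print(ndcg_at_k([3, 1, 0], [3, 2, 1]))  # < 1.0 because the retrieved ranking is not ideal
```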
```diff
@@ -145,7 +139,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
     def _compute_xdcg(self, result_docs_groundtruth_labels: List[int]) -> float:
         """XDCG calculated for the top K documents retrieved from a search query.
         XDCG measures how objectively good are the top K documents, discounted by their position in the list.
-
+
         :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
         :type result_docs_groundtruth_labels: List[int]
         :return: The XDCG@K calculation result.
@@ -159,11 +153,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
             return math.pow(self.xdcg_discount_factor, rank - 1)
 
         ranks = list(range(1, self.k + 1))
-        xdcg_n = sum(
-            starmap(
-                calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)
-            )
-        )
+        xdcg_n = sum(starmap(calculate_xdcg_numerator, zip(result_docs_groundtruth_labels, ranks)))
         xdcg_d = sum(map(calculate_xdcg_denominator, ranks))
 
         return xdcg_n / float(xdcg_d)
```
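
The hunk above shows XDCG's aggregation (`xdcg_n / xdcg_d`) and its positional discount `xdcg_discount_factor ** (rank - 1)`, but the numerator helper and the default discount factor sit outside the changed lines. A sketch under the assumption that the per-document gain is the label normalized by the best possible label and then discounted by position; treat both the numerator formula and the constants as placeholders:

```python
from itertools import starmap
from typing import List


def xdcg_at_k(result_labels: List[int], k: int = 3, discount_factor: float = 0.5, max_label: int = 4) -> float:
    def numerator(label: int, rank: int) -> float:
        # Assumed gain: label normalized by the maximum label, discounted by position.
        return (label / max_label) * discount_factor ** (rank - 1)

    def denominator(rank: int) -> float:
        # Matches the discount visible in the hunk: discount_factor ** (rank - 1).
        return discount_factor ** (rank - 1)

    ranks = list(range(1, k + 1))
    return sum(starmap(numerator, zip(result_labels[:k], ranks))) / float(sum(map(denominator, ranks)))


print(xdcg_at_k([4, 2, 0]))  # reaches 1.0 only when every top-K position holds a maximally relevant document
```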
```diff
@@ -175,7 +165,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
     ) -> float:
         """Fidelity calculated over all documents retrieved from a search query.
         Fidelity measures how objectively good are all of the documents retrieved compared with all known good documents in the underlying data store.
-
+
         :param result_docs_groundtruth_labels: A list of retrieved documents' ground truth labels.
         :type result_docs_groundtruth_labels: List[int]
         :param ideal_docs_groundtruth_labels: A list of ideal documents' ground truth labels.
@@ -196,25 +186,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
                 if label >= s:
                     label_counts[str(label)] += 1
 
-            sorted_label_counts = [
-                x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])
-            ]
+            sorted_label_counts = [x[1] for x in sorted(label_counts.items(), key=lambda x: x[0])]
 
             # calculate weights
-            weights = [
-                (math.pow(2, i + 1) - 1)
-                for i in range(s, self.ground_truth_label_max + 1)
-            ]
+            weights = [(math.pow(2, i + 1) - 1) for i in range(s, self.ground_truth_label_max + 1)]
 
             # return weighted sum
             return sum(starmap(operator.mul, zip(sorted_label_counts, weights)))
 
-        weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(
-            result_docs_groundtruth_labels
-        )
-        weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(
-            ideal_docs_groundtruth_labels
-        )
+        weighted_sum_by_rating_results = calculate_weighted_sum_by_rating(result_docs_groundtruth_labels)
+        weighted_sum_by_rating_index = calculate_weighted_sum_by_rating(ideal_docs_groundtruth_labels)
 
         if weighted_sum_by_rating_index == 0:
             return math.nan
```
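
To make the fidelity hunk easier to follow, here is the same weighted-sum ratio written as a standalone function. In the real method, `s` (the lowest label that counts as relevant) and the maximum label come from the evaluator's configuration; both are plain parameters here:

```python
import math
import operator
from itertools import starmap
from typing import List


def fidelity(result_labels: List[int], ideal_labels: List[int], s: int = 1, max_label: int = 4) -> float:
    def weighted_sum_by_rating(labels: List[int]) -> float:
        # Count documents per label value, for labels at or above the relevance cutoff `s`.
        counts = [sum(1 for label in labels if label == value) for value in range(s, max_label + 1)]
        # Exponential weights, as in the hunk: 2**(i + 1) - 1 for each label value i in [s, max_label].
        weights = [math.pow(2, i + 1) - 1 for i in range(s, max_label + 1)]
        return sum(starmap(operator.mul, zip(counts, weights)))

    ideal = weighted_sum_by_rating(ideal_labels)
    return math.nan if ideal == 0 else weighted_sum_by_rating(result_labels) / ideal


print(fidelity(result_labels=[3, 1, 0, 2], ideal_labels=[3, 3, 2, 1]))  # 25 / 40 = 0.625 with the defaults
```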
```diff
@@ -226,12 +207,16 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
 
         for metric_name, metric_value in metrics.items():
             if metric_name in self._threshold_metrics.keys():
-                result[f"{metric_name}_result"] = "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+                result[f"{metric_name}_result"] = (
+                    "pass" if metric_value >= self._threshold_metrics[metric_name] else "fail"
+                )
                 result[f"{metric_name}_threshold"] = self._threshold_metrics[metric_name]
                 result[f"{metric_name}_higher_is_better"] = True
 
             elif metric_name in self._threshold_holes.keys():
-                result[f"{metric_name}_result"] = "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+                result[f"{metric_name}_result"] = (
+                    "pass" if metric_value <= self._threshold_holes[metric_name] else "fail"
+                )
                 result[f"{metric_name}_threshold"] = self._threshold_holes[metric_name]
                 result[f"{metric_name}_higher_is_better"] = False
 
```
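
The net effect of the block above is that every reported metric gains three companion keys. An illustrative fragment of the output shape (metric names follow the hunks above; the numeric values and thresholds are made-up examples):

```python
example_result_fragment = {
    "ndcg@3": 0.81,
    "ndcg@3_result": "pass",         # ">=" comparison for regular metrics
    "ndcg@3_threshold": 0.5,
    "ndcg@3_higher_is_better": True,
    "holes": 1,
    "holes_result": "fail",          # "<=" comparison, since fewer holes is better
    "holes_threshold": 0,
    "holes_higher_is_better": False,
}
```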
```diff
@@ -256,8 +241,10 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         # if the qrels are empty, no meaningful evaluation is possible
         if not retrieval_ground_truth:
             raise EvaluationException(
-                ("'retrieval_ground_truth' parameter must contain at least one item. "
-                 "Check your data input to be sure that each input record has ground truth defined.")
+                (
+                    "'retrieval_ground_truth' parameter must contain at least one item. "
+                    "Check your data input to be sure that each input record has ground truth defined."
+                )
             )
 
         qrels = []
@@ -277,9 +264,7 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
             )
 
             if not isinstance(query_relevance_label, int):
-                raise EvaluationException(
-                    "Query relevance labels must be integer values."
-                )
+                raise EvaluationException("Query relevance labels must be integer values.")
 
             if query_relevance_label < self.ground_truth_label_min:
                 raise EvaluationException(
@@ -318,12 +303,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
                 )
             )
 
-            if not isinstance(relevance_score, float) and not isinstance(
-                relevance_score, int
-            ):
-                raise EvaluationException(
-                    "Retrieved document relevance score must be a numerical value."
-                )
+            if not isinstance(relevance_score, float) and not isinstance(relevance_score, int):
+                raise EvaluationException("Retrieved document relevance score must be a numerical value.")
 
             results.append(result)
 
@@ -368,24 +349,17 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
         results_lookup = {x["document_id"]: x["relevance_score"] for x in results}
 
         # sort each input set by label to get the ranking
-        qrels_sorted_by_rank = sorted(
-            qrels_lookup.items(), key=lambda x: x[1], reverse=True
-        )
-        results_sorted_by_rank = sorted(
-            results_lookup.items(), key=lambda x: x[1], reverse=True
-        )
+        qrels_sorted_by_rank = sorted(qrels_lookup.items(), key=lambda x: x[1], reverse=True)
+        results_sorted_by_rank = sorted(results_lookup.items(), key=lambda x: x[1], reverse=True)
 
         # find ground truth labels for the results set and ideal set
         result_docs_groundtruth_labels = [
-            qrels_lookup[doc_id] if doc_id in qrels_lookup else 0
-            for (doc_id, _) in results_sorted_by_rank
+            qrels_lookup[doc_id] if doc_id in qrels_lookup else 0 for (doc_id, _) in results_sorted_by_rank
         ]
         ideal_docs_groundtruth_labels = [label for (_, label) in qrels_sorted_by_rank]
 
         # calculate the proportion of result docs with no ground truth label (holes)
-        holes = self._compute_holes(
-            [x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank]
-        )
+        holes = self._compute_holes([x[0] for x in results_sorted_by_rank], [x[0] for x in qrels_sorted_by_rank])
         holes_ratio = holes / float(len(results))
 
         # if none of the retrieved docs are labeled, report holes only
@@ -412,12 +386,8 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
                 result_docs_groundtruth_labels[: self.k],
                 ideal_docs_groundtruth_labels[: self.k],
             ),
-            f"xdcg@{self.k}": self._compute_xdcg(
-                result_docs_groundtruth_labels[: self.k]
-            ),
-            "fidelity": self._compute_fidelity(
-                result_docs_groundtruth_labels, ideal_docs_groundtruth_labels
-            ),
+            f"xdcg@{self.k}": self._compute_xdcg(result_docs_groundtruth_labels[: self.k]),
+            "fidelity": self._compute_fidelity(result_docs_groundtruth_labels, ideal_docs_groundtruth_labels),
             "top1_relevance": result_docs_groundtruth_labels[0],
             "top3_max_relevance": max(result_docs_groundtruth_labels[: self.k]),
             "holes": holes,
```
azure/ai/evaluation/_evaluators/_eci/_eci.py

```diff
@@ -22,9 +22,9 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
-    :param azure_ai_project: The scope of the Azure AI project.
-        It contains subscription id, resource group, and project name.
-    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type azure_ai_project: Union[str, ~azure.ai.evaluation.AzureAIProject]
     :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
     :rtype: Dict[str, str]
 
```
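
The docstring change above documents that `azure_ai_project` accepts the project endpoint URL as a plain string in addition to an `AzureAIProject` dictionary. A sketch of both forms; note that `ECIEvaluator` lives in a private, experimental module (see the file list), so the import path and call shape below are illustrative rather than a stable public contract:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator  # private/experimental module

eci = ECIEvaluator(
    credential=DefaultAzureCredential(),
    # Per the updated docstring, either form is accepted:
    azure_ai_project="https://<resource_name>.services.ai.azure.com/api/projects/<project_name>",
    # azure_ai_project={"subscription_id": "...", "resource_group_name": "...", "project_name": "..."},
)
result = eci(query="...", response="...")  # assumed call shape for a RAI-service evaluator
```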
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

```diff
@@ -39,15 +39,15 @@ class F1ScoreEvaluator(EvaluatorBase):
             :caption: Initialize and call an F1ScoreEvaluator.
 
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START f1_score_evaluator]
             :end-before: [END f1_score_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
+            :caption: Initialize and call F1ScoreEvaluator using Azure AI Project URL in following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -147,7 +147,7 @@ class F1ScoreEvaluator(EvaluatorBase):
         if f1_result <= self._threshold:
             binary_result = True
         return {
-            "f1_score": f1_result,
+            "f1_score": f1_result,
             "f1_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
             "f1_threshold": self._threshold,
         }
```
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

```diff
@@ -45,7 +45,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a FluencyEvaluator.
 
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START fluency_evaluator]
             :end-before: [END fluency_evaluator]
@@ -78,7 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
-            _higher_is_better=self._higher_is_better
+            _higher_is_better=self._higher_is_better,
         )
 
     @overload
```
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

```diff
@@ -34,7 +34,7 @@ class GleuScoreEvaluator(EvaluatorBase):
             :language: python
             :dedent: 8
             :caption: Initialize and call a GleuScoreEvaluator.
-
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -45,13 +45,13 @@ class GleuScoreEvaluator(EvaluatorBase):
             :caption: Initialize with threshold and call a GleuScoreEvaluator.
 
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START gleu_score_evaluator]
             :end-before: [END gleu_score_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
     """
 
```
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

```diff
@@ -12,9 +12,13 @@ from azure.ai.evaluation._model_configurations import Conversation
 from ..._common.utils import construct_prompty_model_config, validate_model_config
 
 try:
-    from ..._user_agent import USER_AGENT
+    from ..._user_agent import UserAgentSingleton
 except ImportError:
-    USER_AGENT = "None"
+
+    class UserAgentSingleton:
+        @property
+        def value(self) -> str:
+            return "None"
 
 
 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
```
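
The try/except above swaps a module-level constant for a small stub class, so downstream code can always call `UserAgentSingleton().value` (as the last hunk in this file does) regardless of whether the relative import resolves. The same import-or-stub pattern in isolation, with a hypothetical module name:

```python
# Generic illustration of the import-or-stub pattern used above; "my_package" is hypothetical.
try:
    from my_package._user_agent import UserAgentSingleton  # real implementation, when importable
except ImportError:

    class UserAgentSingleton:
        """Minimal stand-in exposing the same read-only interface."""

        @property
        def value(self) -> str:
            return "None"


# Callers never need to care which branch won:
print({"User-Agent": UserAgentSingleton().value})
```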
```diff
@@ -35,7 +39,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]
     :param threshold: The threshold for the groundedness evaluator. Default is 3.
     :type threshold: int
-
+
     .. admonition:: Example:
 
         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
@@ -54,13 +58,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :caption: Initialize with threshold and call a GroundednessEvaluator.
 
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START groundedness_evaluator]
             :end-before: [END groundedness_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     .. note::
@@ -89,7 +93,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
-            _higher_is_better=self._higher_is_better
+            _higher_is_better=self._higher_is_better,
         )
         self._model_config = model_config
         self.threshold = threshold
@@ -165,7 +169,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(self._model_config),
             self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
+            UserAgentSingleton().value,
         )
         self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
 
```
azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

```diff
@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 import os
 import math
+import logging
 from typing import Dict, Union, List, Optional
 
 from typing_extensions import overload, override
@@ -10,9 +11,12 @@ from typing_extensions import overload, override
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation, Message
-from ..._common.utils import check_score_is_valid
+from ..._common.utils import check_score_is_valid, reformat_conversation_history, reformat_agent_response
 from azure.ai.evaluation._common._experimental import experimental
 
+logger = logging.getLogger(__name__)
+
+
 @experimental
 class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
@@ -34,13 +38,13 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :caption: Initialize and call an IntentResolutionEvaluator with a query and response.
 
     .. admonition:: Example using Azure AI Project URL:
-
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
             :start-after: [START intent_resolution_evaluator]
             :end-before: [END intent_resolution_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
+            :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     """
@@ -57,23 +61,19 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *,
-                 threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD,
-                 **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path,
-                         result_key=self._RESULT_KEY,
-                         **kwargs)
+        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
 
     @overload
     def __call__(
         self,
         *,
-        query,
-        response,
-        tool_definitions,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Optional[Union[dict, List[dict]]] = None,
     ) -> Dict[str, Union[str, float]]:
         """Evaluate intent resolution for a given query, response and optional tool definitions.
         The query and response can be either a string or a list of messages.
@@ -135,11 +135,19 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
             )
+        # reformat query and response to the format expected by the prompty flow
+        eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
         if isinstance(llm_output, dict):
-            score = llm_output.get('score', math.nan)
-            if not check_score_is_valid(score, IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE, IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE):
+            score = llm_output.get("score", math.nan)
+            if not check_score_is_valid(
+                score,
+                IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE,
+                IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE,
+            ):
                 raise EvaluationException(
                     message=f"Invalid score value: {score}. Expected a number in range [{IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE}, {IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE}].",
                     internal_message="Invalid score value.",
@@ -148,19 +156,16 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
                 )
             reason = llm_output.get("explanation", "")
             score = float(score)
-            score_result = 'pass' if score >= self.threshold else 'fail'
-
-            #remove fields 'explanation' and 'resolution_score' from llm_output as they are already included in the response_dict
-            if 'explanation' in llm_output: llm_output.pop("explanation")
-            if 'resolution_score' in llm_output: llm_output.pop("resolution_score")
+            score_result = "pass" if score >= self.threshold else "fail"
 
             response_dict = {
-
-
-
-
-
-            }
+                f"{self._result_key}": score,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_reason": reason,
+            }
             return response_dict
         # If llm_output is not a dictionary, return NaN for the score. This should never happen
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
         return {self._result_key: math.nan}
```
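
Taken together, the intent-resolution hunks reformat the inputs before the prompty call and narrow the output to four keys. A hedged usage sketch: the endpoint and deployment are placeholders, and the output key prefix assumes `_RESULT_KEY == "intent_resolution"`, which this diff does not show:

```python
from azure.ai.evaluation import AzureOpenAIModelConfiguration, IntentResolutionEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-endpoint>.openai.azure.com",  # placeholder
    azure_deployment="<deployment-name>",                       # placeholder
)

evaluator = IntentResolutionEvaluator(model_config=model_config, threshold=3)
result = evaluator(
    query="What is the opening hour of the Eiffel Tower?",
    response="The Eiffel Tower opens at 9:30 AM.",
)
# Expected shape per the final hunk (key prefix assumed):
# {
#     "intent_resolution": 4.0,
#     "intent_resolution_result": "pass",
#     "intent_resolution_threshold": 3,
#     "intent_resolution_reason": "...",
# }
```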