azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +4 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +73 -2
- azure/ai/evaluation/_common/rai_service.py +250 -62
- azure/ai/evaluation/_common/utils.py +196 -23
- azure/ai/evaluation/_constants.py +7 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +13 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +55 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +312 -228
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +7 -6
- azure/ai/evaluation/_evaluate/_utils.py +46 -11
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +17 -18
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +67 -31
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +37 -24
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +21 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +52 -16
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +91 -48
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +100 -26
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +94 -26
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +96 -26
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +97 -26
- azure/ai/evaluation/_evaluators/_eci/_eci.py +31 -4
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +67 -36
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +14 -16
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -34
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +20 -27
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +87 -31
- azure/ai/evaluation/_evaluators/_qa/_qa.py +23 -31
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +72 -36
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +83 -125
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +26 -27
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +37 -28
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +94 -33
- azure/ai/evaluation/_exceptions.py +19 -0
- azure/ai/evaluation/_model_configurations.py +83 -15
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +20 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +29 -35
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +17 -9
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +90 -35
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +4 -2
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +165 -105
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.0.1.dist-info/METADATA +600 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.1.dist-info/RECORD +119 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/METADATA +0 -535
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
@@ -6,16 +6,21 @@ import importlib.metadata
 import math
 import re
 import time
+import json
+import html
 from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
+from string import Template
 
 import jwt
 
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
+from azure.core.exceptions import HttpResponseError
 from azure.core.pipeline.policies import AsyncRetryPolicy
 
 from .constants import (
@@ -23,10 +28,9 @@ from .constants import (
     EvaluationMetrics,
     RAIService,
     Tasks,
-    _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type
 
 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -34,6 +38,39 @@ except importlib.metadata.PackageNotFoundError:
     version = "unknown"
 USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+}
+
+
+def get_formatted_template(data: dict, annotation_task: str) -> str:
+    """Given the task and input data, produce a formatted string that will serve as the main
+    payload for the RAI service. Requires specific per-task logic.
+
+    :param data: The data to incorporate into the payload.
+    :type data: dict
+    :param annotation_task: The annotation task to use. This determines the template to use.
+    :type annotation_task: str
+    :return: The formatted string based on the data and task template.
+    :rtype: str
+    """
+    # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+    # JSON format manually.
+    # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+    if annotation_task == Tasks.GROUNDEDNESS:
+        as_dict = {
+            "question": data.get("query", ""),
+            "answer": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
+    as_dict = {
+        "query": html.escape(data.get("query", "")),
+        "response": html.escape(data.get("response", "")),
+    }
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+    return user_text.replace("'", '\\"')
+
 
 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
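
Note: the new get_formatted_template helper centralizes the text formatting that used to be inlined in submit_request. A minimal standalone sketch of its behavior, using only the standard library and a plain-string stand-in for the package's Tasks.GROUNDEDNESS constant:

    import html
    import json
    from string import Template

    GROUNDEDNESS = "groundedness"  # hypothetical stand-in for Tasks.GROUNDEDNESS
    DEFAULT_TEMPLATE = Template("<Human>{$query}</><System>{$response}</>")

    def format_user_text(data: dict, annotation_task: str) -> str:
        # Groundedness is serialized as a JSON object rather than the template above.
        if annotation_task == GROUNDEDNESS:
            return json.dumps(
                {
                    "question": data.get("query", ""),
                    "answer": data.get("response", ""),
                    "context": data.get("context", ""),
                }
            )
        # Every other task HTML-escapes the fields and substitutes into the template.
        user_text = DEFAULT_TEMPLATE.substitute(
            query=html.escape(data.get("query", "")),
            response=html.escape(data.get("response", "")),
        )
        return user_text.replace("'", '\\"')

    print(format_user_text({"query": "hi", "response": "<b>ok</b>"}, "content harm"))
    # <Human>{hi}</><System>{&lt;b&gt;ok&lt;/b&gt;}</>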
@@ -76,30 +113,34 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     async with get_async_http_client() as client:
         response = await client.get(svc_liveness_url, headers=headers)
 
-
-
-
-
-
-
-
-
-
-
-
+        if response.status_code != 200:
+            msg = (
+                f"RAI service is unavailable in this region, or you lack the necessary permissions "
+                f"to access the AI project. Status Code: {response.status_code}"
+            )
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.RAI_CLIENT,
+                category=ErrorCategory.SERVICE_UNAVAILABLE,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+            )
 
-
-
-
-
-
-
-
-
-
+        capabilities = response.json()
+        if capability and capability not in capabilities:
+            msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.RAI_CLIENT,
+                category=ErrorCategory.SERVICE_UNAVAILABLE,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+            )
 
 
-def generate_payload(normalized_user_text: str, metric: str) -> Dict:
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
     """Generate the payload for the annotation request
 
     :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
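
Note: ensure_service_availability now fails fast on both a non-200 liveness response and a missing capability. A minimal sketch of that control flow, with a hypothetical error type standing in for EvaluationException:

    from typing import List, Optional

    class ServiceUnavailable(Exception):  # hypothetical stand-in for EvaluationException
        pass

    def check_service(status_code: int, capabilities: List[str], capability: Optional[str]) -> None:
        if status_code != 200:
            # First guard: region availability / project permissions.
            raise ServiceUnavailable(f"RAI service is unavailable. Status Code: {status_code}")
        if capability and capability not in capabilities:
            # Second guard: the liveness body doubles as the capability list.
            raise ServiceUnavailable(f"The needed capability '{capability}' is not supported.")

    check_service(200, ["content harm"], "content harm")  # passes silently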
@@ -107,19 +148,18 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
         in the payload.
     :type metric: str
+    :param annotation_task: The annotation task to be passed to the service.
+    :type annotation_task: str
     :return: The payload for the annotation request.
     :rtype: Dict
     """
     include_metric = True
-    task = Tasks.CONTENT_HARM
+    task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
-        task = Tasks.PROTECTED_MATERIAL
         include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
-        task = _InternalAnnotationTasks.ECI
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
-        task = Tasks.XPIA
         include_metric = False
     return (
         {
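
Note: the payload body returned by generate_payload is elided between hunks. Based on the keys named in the docstring ("UserTextList", "MetricList") and the task handling above, it plausibly has this shape; the sketch below is illustrative only, not the package's exact code:

    def build_payload(normalized_user_text: str, metric: str, annotation_task: str, include_metric: bool) -> dict:
        payload = {
            "UserTextList": [normalized_user_text],
            "AnnotationTask": annotation_task,
        }
        if include_metric:
            payload["MetricList"] = [metric]
        return payload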
@@ -135,25 +175,24 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )
 
 
-async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
+async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
     :type rai_svc_url: str
     :param token: The Azure authentication token.
     :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
     :return: The operation ID.
     :rtype: str
     """
-
-
-    payload = generate_payload(normalized_user_text, metric)
+    normalized_user_text = get_formatted_template(data, annotation_task)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
 
     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)
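
Note: on a successful (202) submission, the operation ID is recovered from the last path segment of the returned "location" value. A standalone illustration with a made-up URL:

    result = {"location": "https://example.azureml.ms/operations/abc-123"}  # hypothetical response body
    operation_id = result["location"].split("/")[-1]
    assert operation_id == "abc-123"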
@@ -164,7 +203,6 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
     if http_response.status_code != 202:
         print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
         http_response.raise_for_status()
-
     result = http_response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id
@@ -208,7 +246,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str) -> Dict:
 
 
 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
-    batch_response: List[Dict], metric_name: str
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
 ) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
@@ -216,56 +254,87 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
     # non-numeric metrics
-    if metric_name in {
-
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    }:
+        result = {}
+        if not batch_response or len(batch_response[0]) == 0:
+            return {}
+        if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+            pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+            for pm_metric_name in pm_metric_names:
+                response = batch_response[0][pm_metric_name]
+                response = response.replace("false", "False")
+                response = response.replace("true", "True")
+                parsed_response = literal_eval(response)
+                result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+                result[pm_metric_name + "_reason"] = (
+                    parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+                )
+            return result
+        if metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
         response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
-        result = {}
         # Use label instead of score since these are assumed to be boolean results.
         # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
-        result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+        result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
 
         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
             # if present else set them to math.nan
-            result[metric_name + "_manipulated_content"] = (
+            result[metric_display_name + "_manipulated_content"] = (
                 parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
             )
-            result[metric_name + "_intrusion"] = (
+            result[metric_display_name + "_intrusion"] = (
                 parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
             )
-            result[metric_name + "_information_gathering"] = (
+            result[metric_display_name + "_information_gathering"] = (
                 parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
             )
         return result
-    return _parse_content_harm_response(batch_response, metric_name)
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
 
 
-def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
-    key = metric_name
+    key = metric_name if metric_display_name is None else metric_display_name
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS
 
-    result: Dict[str, Union[str, float]] = {
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }
 
     response = batch_response[0]
     if metric_name not in response:
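
Note: the service emits JSON-style lowercase booleans, which ast.literal_eval cannot parse, hence the replace calls before parsing. A standalone illustration with an assumed response string:

    import math
    from ast import literal_eval

    raw = '{"label": false, "reasoning": "No protected material detected."}'  # assumed sample
    fixed = raw.replace("false", "False").replace("true", "True")
    parsed = literal_eval(fixed)
    label = parsed["label"] if "label" in parsed else math.nan
    reason = parsed["reasoning"] if "reasoning" in parsed else ""
    print(label, reason)  # False No protected material detected.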
@@ -283,7 +352,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
 
     # get content harm metric_value
     if "label" in harm_response:
-        metric_value = harm_response["label"]
+        metric_value = float(harm_response["label"])
     elif "valid" in harm_response:
         metric_value = 0 if harm_response["valid"] else math.nan
     else:
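
Note: the new float(...) coercion guards the later math.isnan check, which raises TypeError when the label arrives as a string. A short illustration with assumed label values:

    import math

    for label in ("3", 3):  # the service may return the label as a string or a number
        assert not math.isnan(float(label))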
@@ -314,6 +383,8 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
         reason = ""
 
     harm_score = metric_value
+    if metric_value == "n/a":
+        return result
     if not math.isnan(metric_value):
         # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
@@ -347,13 +418,17 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: str) -> str:
     )
 
     if response.status_code != 200:
-        msg =
+        msg = (
+            f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+            f"and make sure you have the necessary access permissions. "
+            f"Status code: {response.status_code}."
+        )
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.RAI_CLIENT,
-
-
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.PROJECT_ACCESS_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
         )
 
     base_url = urlparse(response.json()["properties"]["discoveryUrl"])
@@ -412,14 +487,17 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
 
 
 async def evaluate_with_rai_service(
-
+    data: dict,
+    metric_name: str,
+    project_scope: AzureAIProject,
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
 ) -> Dict[str, Union[str, float]]:
     """Evaluate the content safety of the response using Responsible AI service
 
-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
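
Note: a sketch of the new call shape for evaluate_with_rai_service, with placeholder project values. The metric literal and the project-scope keys are assumptions based on the package's AzureAIProject configuration; the import targets the private module this diff covers:

    import asyncio
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service

    result = asyncio.run(
        evaluate_with_rai_service(
            data={"query": "What is my deductible?", "response": "It depends on your plan."},
            metric_name="violence",  # assumed metric name literal
            project_scope={
                "subscription_id": "<sub-id>",
                "resource_group_name": "<rg>",
                "project_name": "<project>",
            },
            credential=DefaultAzureCredential(),
        )
    )
    print(result)  # e.g. {"violence": ..., "violence_score": ..., "violence_reason": ...}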
@@ -427,6 +505,10 @@ async def evaluate_with_rai_service(
     :param credential: The Azure authentication credential.
     :type credential:
         ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of the metric to use.
+    :type metric_display_name: str
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
@@ -434,11 +516,117 @@ async def evaluate_with_rai_service(
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)
 
     # Submit annotation request and fetch result
-    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
     annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name)
+    result = parse_response(annotation_response, metric_name, metric_display_name)
+
+    return result
+
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## Handle JSON payloads and the inference SDK's strongly typed messages.
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## Call the RAI service for annotation.
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    result = response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
 
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """Evaluate the content safety of the response using Responsible AI service
+
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    # Submit annotation request and fetch result
+    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name)
     return result