azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +23 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
- azure/ai/evaluation/_common/constants.py +9 -2
- azure/ai/evaluation/_common/math.py +29 -0
- azure/ai/evaluation/_common/rai_service.py +222 -93
- azure/ai/evaluation/_common/utils.py +328 -19
- azure/ai/evaluation/_constants.py +16 -8
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
- azure/ai/evaluation/_evaluate/_utils.py +77 -25
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
- azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
- azure/ai/evaluation/_exceptions.py +26 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +23 -6
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +222 -169
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
- azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
@@ -3,30 +3,32 @@
 # ---------------------------------------------------------
 import asyncio
 import importlib.metadata
+import math
 import re
 import time
 from ast import literal_eval
-from typing import Dict, List
+from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
+from string import Template

 import jwt
-import numpy as np

+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._http_utils import get_async_http_client
+from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
-from azure.
+from azure.core.exceptions import HttpResponseError
+from azure.core.pipeline.policies import AsyncRetryPolicy

 from .constants import (
     CommonConstants,
     EvaluationMetrics,
     RAIService,
     Tasks,
-    _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type

 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -34,6 +36,11 @@ except importlib.metadata.PackageNotFoundError:
     version = "unknown"
 USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)

+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+    Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+}
+

 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
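For context on the hunk above: the new USER_TEXT_TEMPLATE_DICT entries are standard-library `string.Template` objects, where `$query`, `$response`, and `$context` are the placeholders and any surrounding braces are literal text. A minimal illustrative check of how `substitute` renders them (the sample values are made up):

    from string import Template

    default = Template("<Human>{$query}</><System>{$response}</>")
    grounded = Template('{"question": "$query", "answer": "$response", "context": "$context"}')

    # $query / $response are replaced; the braces in the default template are literal.
    print(default.substitute(query="What is 2+2?", response="4"))
    # -> <Human>{What is 2+2?}</><System>{4}</>

    print(grounded.substitute(query="What is 2+2?", response="4", context="basic arithmetic"))
    # -> {"question": "What is 2+2?", "answer": "4", "context": "basic arithmetic"}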
@@ -53,7 +60,13 @@ def get_common_headers(token: str) -> Dict:
     }


-
+def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
+    return get_async_http_client().with_policies(
+        retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
+    )
+
+
+async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
     """Check if the Responsible AI service is available in the region and has the required capability, if relevant.

     :param rai_svc_url: The Responsible AI service URL.
@@ -68,9 +81,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     svc_liveness_url = rai_svc_url + "/checkannotation"

     async with get_async_http_client() as client:
-        response = await client.get(
-            svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+        response = await client.get(svc_liveness_url, headers=headers)

     if response.status_code != 200:
         msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
@@ -95,7 +106,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     )


-def generate_payload(normalized_user_text: str, metric: str) -> Dict:
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
     """Generate the payload for the annotation request

     :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
@@ -103,19 +114,18 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
         in the payload.
     :type metric: str
+    :param annotation_task: The annotation task to be passed to service
+    :type annotation_task: str
     :return: The payload for the annotation request.
     :rtype: Dict
     """
     include_metric = True
-    task =
+    task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
-        task = Tasks.PROTECTED_MATERIAL
         include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
-        task = _InternalAnnotationTasks.ECI
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
-        task = Tasks.XPIA
         include_metric = False
     return (
         {
@@ -131,39 +141,36 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )


-async def submit_request(
+async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID

-    :param
-    :type
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
     :type rai_svc_url: str
     :param token: The Azure authentication token.
     :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
     :return: The operation ID.
     :rtype: str
     """
-    user_text =
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data)
     normalized_user_text = user_text.replace("'", '\\"')
-    payload = generate_payload(normalized_user_text, metric)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)

     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)

-    async with
-
-            url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
-
-        if response.status_code != 202:
-            print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text))
-            response.raise_for_status()
+    async with get_async_http_client_with_timeout() as client:
+        http_response = await client.post(url, json=payload, headers=headers)

-
+    if http_response.status_code != 202:
+        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
+        http_response.raise_for_status()
+    result = http_response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id

@@ -190,10 +197,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
         token = await fetch_or_reuse_token(credential, token)
         headers = get_common_headers(token)

-        async with
-            response = await client.get(
-                url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-            )
+        async with get_async_http_client_with_timeout() as client:
+            response = await client.get(url, headers=headers)

         if response.status_code == 200:
             return response.json()
@@ -208,19 +213,28 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre


 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
-    batch_response: List[Dict], metric_name: str
-) -> Dict:
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.

     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
-    :rtype:
+    :rtype: Dict[str, Union[str, float]]
     """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
     # non-numeric metrics
-    if metric_name in {
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    }:
         if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
@@ -229,43 +243,51 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
     parsed_response = literal_eval(response)
     result = {}
     # Use label instead of score since these are assumed to be boolean results.
-    # Use
-    result[
-    result[
+    # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+    result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+    result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

     if metric_name == EvaluationMetrics.XPIA:
         # Add "manipulated_content", "intrusion" and "information_gathering" to the result
-        # if present else set them to
-        result[
-            parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else
+        # if present else set them to math.nan
+        result[metric_display_name + "_manipulated_content"] = (
+            parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
         )
-        result[
-            parsed_response["intrusion"] if "intrusion" in parsed_response else
+        result[metric_display_name + "_intrusion"] = (
+            parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
         )
-        result[
-            parsed_response["information_gathering"] if "information_gathering" in parsed_response else
+        result[metric_display_name + "_information_gathering"] = (
+            parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
         )
         return result
-    return _parse_content_harm_response(batch_response, metric_name)
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)


-def _parse_content_harm_response(
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.

     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
-    :rtype:
+    :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
-    key = metric_name
+    key = metric_name if metric_display_name is None else metric_display_name
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS

-    result
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }

     response = batch_response[0]
     if metric_name not in response:
@@ -283,11 +305,11 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -

     # get content harm metric_value
     if "label" in harm_response:
-        metric_value = harm_response["label"]
+        metric_value = float(harm_response["label"])
     elif "valid" in harm_response:
-        metric_value = 0 if harm_response["valid"] else
+        metric_value = 0 if harm_response["valid"] else math.nan
     else:
-        metric_value =
+        metric_value = math.nan

     # get reason
     if "reasoning" in harm_response:
@@ -301,21 +323,23 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         if metric_value_match:
             metric_value = int(metric_value_match[0])
         else:
-            metric_value =
+            metric_value = math.nan
         reason = harm_response
     elif harm_response != "" and isinstance(harm_response, (int, float)):
         if 0 < harm_response <= 7:
             metric_value = harm_response
         else:
-            metric_value =
+            metric_value = math.nan
         reason = ""
     else:
-        metric_value =
+        metric_value = math.nan
         reason = ""

     harm_score = metric_value
-    if
-
+    if metric_value == "n/a":
+        return result
+    if not math.isnan(metric_value):
+        # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
         harm_score = int(metric_value)
     result[key] = get_harm_severity_level(harm_score)
@@ -337,14 +361,13 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     """
     headers = get_common_headers(token)

-    async with
-        response = await client.get(
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(
             f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
             f"resourceGroups/{azure_ai_project['resource_group_name']}/"
             f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
             f"api-version=2023-08-01-preview",
             headers=headers,
-            timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT,
         )

     if response.status_code != 200:
@@ -361,7 +384,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     return f"{base_url.scheme}://{base_url.netloc}"


-async def get_rai_svc_url(project_scope:
+async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
     """Get the Responsible AI service URL

     :param project_scope: The Azure AI project scope details.
@@ -385,7 +408,7 @@ async def get_rai_svc_url(project_scope: dict, token: str) -> str:
     return rai_url


-async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str:
+async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
     """Get token. Fetch a new token if the current token is near expiry

     :param credential: The Azure authentication credential.
@@ -395,35 +418,147 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -
     :type token: str
     :return: The Azure authentication token.
     """
-
-
-
-    # Decode the token to get its expiration time
+    if token:
+        # Decode the token to get its expiration time
+        try:
             decoded_token = jwt.decode(token, options={"verify_signature": False})
+        except jwt.PyJWTError:
+            pass
+        else:
             exp_time = decoded_token["exp"]
             current_time = time.time()

-            #
+            # Return current token if not near expiry
             if (exp_time - current_time) >= 300:
-
-    except Exception:  # pylint: disable=broad-exception-caught
-        pass
+                return token

-
-    token = credential.get_token("https://management.azure.com/.default").token
-
-    return token
+    return credential.get_token("https://management.azure.com/.default").token


 async def evaluate_with_rai_service(
-
-
+    data: dict,
+    metric_name: str,
+    project_scope: AzureAIProject,
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
+) -> Dict[str, Union[str, float]]:
     """ "Evaluate the content safety of the response using Responsible AI service

-    :param
-    :type
-    :param
-    :type
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)
+
+    # Submit annotation request and fetch result
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name, metric_display_name)
+
+    return result
+
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## handle json payload and payload from inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## calling rai service for annotation
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    result = response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """ "Evaluate the content safety of the response using Responsible AI service
+    :param messages: The normalized list of messages.
+    :type messages: str
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -434,19 +569,13 @@ async def evaluate_with_rai_service(
     :return: The parsed annotation result.
     :rtype: List[List[Dict]]
     """
-    # Use DefaultAzureCredential if no credential is provided
-    # This is for the for batch run scenario as the credential cannot be serialized by promoptflow
-    if credential is None or credential == {}:
-        credential = DefaultAzureCredential()

     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
     await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
-
     # Submit annotation request and fetch result
-    operation_id = await
-    annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
+    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
     result = parse_response(annotation_response, metric_name)
-
     return result