azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
@@ -3,34 +3,30 @@
 # ---------------------------------------------------------
 import asyncio
 import importlib.metadata
-import math
 import re
 import time
-import json
-import html
 from ast import literal_eval
-from typing import Dict, List
+from typing import Dict, List
 from urllib.parse import urlparse
-from string import Template
 
 import jwt
+import numpy as np
+from azure.core.credentials import TokenCredential
+from azure.identity import DefaultAzureCredential
 
-from
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget
-from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
+from azure.ai.evaluation._http_utils import get_async_http_client
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._model_configurations import AzureAIProject
-from azure.core.credentials import TokenCredential
-from azure.core.exceptions import HttpResponseError
-from azure.core.pipeline.policies import AsyncRetryPolicy
 
 from .constants import (
     CommonConstants,
     EvaluationMetrics,
     RAIService,
     Tasks,
+    _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level
 
 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -38,39 +34,6 @@ except importlib.metadata.PackageNotFoundError:
     version = "unknown"
 USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
-USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
-    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
-}
-
-
-def get_formatted_template(data: dict, annotation_task: str) -> str:
-    """Given the task and input data, produce a formatted string that will serve as the main
-    payload for the RAI service. Requires specific per-task logic.
-
-    :param data: The data to incorporate into the payload.
-    :type data: dict
-    :param annotation_task: The annotation task to use. This determines the template to use.
-    :type annotation_task: str
-    :return: The formatted based on the data and task template.
-    :rtype: str
-    """
-    # Template class doesn't play nice with json dumping/loading, just handle groundedness'
-    # JSON format manually.
-    # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
-    if annotation_task == Tasks.GROUNDEDNESS:
-        as_dict = {
-            "question": data.get("query", ""),
-            "answer": data.get("response", ""),
-            "context": data.get("context", ""),
-        }
-        return json.dumps(as_dict)
-    as_dict = {
-        "query": html.escape(data.get("query", "")),
-        "response": html.escape(data.get("response", "")),
-    }
-    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
-    return user_text.replace("'", '\\"')
-
 
 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
@@ -90,13 +53,7 @@ def get_common_headers(token: str) -> Dict:
     }
 
 
-def
-    return get_async_http_client().with_policies(
-        retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
-    )
-
-
-async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
+async def ensure_service_availability(rai_svc_url: str, token: str, capability: str = None) -> None:
     """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
 
     :param rai_svc_url: The Responsible AI service URL.
@@ -111,36 +68,34 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     svc_liveness_url = rai_svc_url + "/checkannotation"
 
     async with get_async_http_client() as client:
-        response = await client.get(
+        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
+        )
 
-
-
-
-
-
-
-
-
-
-            category=ErrorCategory.SERVICE_UNAVAILABLE,
-            blame=ErrorBlame.USER_ERROR,
-            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
-        )
+        if response.status_code != 200:
+            msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.UNKNOWN,
+                category=ErrorCategory.SERVICE_UNAVAILABLE,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
-
-    if capability and capability not in capabilities:
-        msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            target=ErrorTarget.RAI_CLIENT,
-            category=ErrorCategory.SERVICE_UNAVAILABLE,
-            blame=ErrorBlame.USER_ERROR,
-            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
-        )
+        capabilities = response.json()
 
+    if capability and capability not in capabilities:
+        msg = f"Capability '{capability}' is not available in this region"
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+        )
 
-
+
+def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     """Generate the payload for the annotation request
 
     :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
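
For orientation, the 1.0.0b1 side of this hunk treats the `/checkannotation` response as a collection of capability names and rejects tasks that are not listed. A minimal sketch of that gate, with made-up capability values and a plain exception standing in for `EvaluationException`:

```python
# Sketch of the capability check shown above; the capability names are
# placeholders, and RuntimeError stands in for EvaluationException.
capabilities = ["content harm", "protected material"]  # hypothetical /checkannotation body
capability = "groundedness"

if capability and capability not in capabilities:
    raise RuntimeError(f"Capability '{capability}' is not available in this region")
```
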
@@ -148,18 +103,19 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
     :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
         in the payload.
     :type metric: str
-    :param annotation_task: The annotation task to be passed to service
-    :type annotation_task: str
     :return: The payload for the annotation request.
     :rtype: Dict
     """
     include_metric = True
-    task =
+    task = Tasks.CONTENT_HARM
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
         include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
+        task = _InternalAnnotationTasks.ECI
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
+        task = Tasks.XPIA
         include_metric = False
     return (
         {
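
The body of the returned dict falls outside this hunk. Based on the docstring above and the `payload["UserTextList"]` and `"MetricList"` references elsewhere in this file, the payload presumably looks roughly like the following sketch (an assumption, not quoted from the package):

```python
from azure.ai.evaluation._common.constants import Tasks

# Hypothetical reconstruction of what generate_payload returns for a
# content-harm metric; only the key names are attested in this diff.
payload = {
    "UserTextList": ["<Human>...</><System>...</>"],
    "AnnotationTask": Tasks.CONTENT_HARM,
    "MetricList": ["violence"],
}
```
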
@@ -175,35 +131,39 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
     )
 
 
-async def submit_request(
+async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
-    :param
-    :type
+    :param query: The query to evaluate.
+    :type query: str
+    :param response: The response to evaluate.
+    :type response: str
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
     :type rai_svc_url: str
     :param token: The Azure authentication token.
     :type token: str
-    :param annotation_task: The annotation task to use.
-    :type annotation_task: str
     :return: The operation ID.
     :rtype: str
     """
-
-
+    user_text = f"<Human>{query}</><System>{response}</>"
+    normalized_user_text = user_text.replace("'", '\\"')
+    payload = generate_payload(normalized_user_text, metric)
 
     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)
 
-    async with
-
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
+        )
+
+        if response.status_code != 202:
+            print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text))
+            response.raise_for_status()
 
-
-        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
-        http_response.raise_for_status()
-    result = http_response.json()
+    result = response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id
 
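
As a quick illustration of the 1.0.0b1 text handling shown above (the query and response values here are made up):

```python
# Mirrors the "+" lines in submit_request: wrap the pair in the <Human>/<System>
# template, then replace single quotes with an escaped double quote before the
# payload is built.
query, response = "What's the weather?", "It's sunny."
user_text = f"<Human>{query}</><System>{response}</>"
normalized_user_text = user_text.replace("'", '\\"')
print(normalized_user_text)
# <Human>What\"s the weather?</><System>It\"s sunny.</>
```
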
@@ -230,8 +190,10 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
     token = await fetch_or_reuse_token(credential, token)
     headers = get_common_headers(token)
 
-    async with
-        response = await client.get(
+    async with get_async_http_client() as client:
+        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
+        )
 
         if response.status_code == 200:
             return response.json()
@@ -246,95 +208,64 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
 
 
 def parse_response( # pylint: disable=too-many-branches,too-many-statements
-    batch_response: List[Dict], metric_name: str
-) -> Dict
+    batch_response: List[Dict], metric_name: str
+) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
-    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
-    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
-    :rtype:
+    :rtype: List[List[Dict]]
     """
-    if metric_display_name is None:
-        metric_display_name = metric_name
-
     # non-numeric metrics
-    if metric_name in {
-
-        _InternalEvaluationMetrics.ECI,
-        EvaluationMetrics.XPIA,
-    }:
-        result = {}
-        if not batch_response or len(batch_response[0]) == 0:
-            return {}
-        if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
-            pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
-            for pm_metric_name in pm_metric_names:
-                response = batch_response[0][pm_metric_name]
-                response = response.replace("false", "False")
-                response = response.replace("true", "True")
-                parsed_response = literal_eval(response)
-                result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
-                result[pm_metric_name + "_reason"] = (
-                    parsed_response["reasoning"] if "reasoning" in parsed_response else ""
-                )
-            return result
-        if metric_name not in batch_response[0]:
+    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
+        if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
         response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
+        result = {}
         # Use label instead of score since these are assumed to be boolean results.
-        # Use
-        result[
-        result[
+        # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
 
         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
-            # if present else set them to
-            result[
-                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else
+            # if present else set them to np.nan
+            result[metric_name + "_manipulated_content"] = (
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
             )
-            result[
-                parsed_response["intrusion"] if "intrusion" in parsed_response else
+            result[metric_name + "_intrusion"] = (
+                parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
             )
-            result[
-                parsed_response["information_gathering"] if "information_gathering" in parsed_response else
+            result[metric_name + "_information_gathering"] = (
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
             )
         return result
-    return _parse_content_harm_response(batch_response, metric_name
+    return _parse_content_harm_response(batch_response, metric_name)
 
 
-def _parse_content_harm_response(
-    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
-) -> Dict[str, Union[str, float]]:
+def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
-    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
-    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
-    :rtype:
+    :rtype: List[List[Dict]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
-    key = metric_name
+    key = metric_name
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS
 
-    result:
-        (key.value if hasattr(key, "value") else key): math.nan,
-        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
-        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
-    }
+    result = {key: np.nan, key + "_score": np.nan, key + "_reason": ""}
 
     response = batch_response[0]
     if metric_name not in response:
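
A self-contained sketch of the non-numeric branch above: the service returns each metric as a stringified dict with lowercase booleans, which the parser rewrites before `literal_eval`. The raw string and the `protected_material` metric name below are assumptions for illustration:

```python
from ast import literal_eval

import numpy as np

# Hypothetical annotation string for a protected-material style metric.
raw = '{"label": false, "reasoning": "No protected material detected."}'
parsed = literal_eval(raw.replace("false", "False").replace("true", "True"))

metric_name = "protected_material"  # assumed string value of the metric
result = {
    metric_name + "_label": parsed["label"] if "label" in parsed else np.nan,
    metric_name + "_reason": parsed["reasoning"] if "reasoning" in parsed else "",
}
print(result)
# {'protected_material_label': False, 'protected_material_reason': 'No protected material detected.'}
```
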
@@ -352,11 +283,11 @@ def _parse_content_harm_response(
 
     # get content harm metric_value
     if "label" in harm_response:
-        metric_value =
+        metric_value = harm_response["label"]
     elif "valid" in harm_response:
-        metric_value = 0 if harm_response["valid"] else
+        metric_value = 0 if harm_response["valid"] else np.nan
     else:
-        metric_value =
+        metric_value = np.nan
 
     # get reason
     if "reasoning" in harm_response:
@@ -370,23 +301,21 @@ def _parse_content_harm_response(
         if metric_value_match:
             metric_value = int(metric_value_match[0])
         else:
-            metric_value =
+            metric_value = np.nan
         reason = harm_response
     elif harm_response != "" and isinstance(harm_response, (int, float)):
         if 0 < harm_response <= 7:
             metric_value = harm_response
         else:
-            metric_value =
+            metric_value = np.nan
         reason = ""
     else:
-        metric_value =
+        metric_value = np.nan
         reason = ""
 
     harm_score = metric_value
-    if metric_value
-
-    if not math.isnan(metric_value):
-        # int(math.nan) causes a value error, and math.nan is already handled
+    if not np.isnan(metric_value):
+        # int(np.nan) causes a value error, and np.nan is already handled
         # by get_harm_severity_level
         harm_score = int(metric_value)
     result[key] = get_harm_severity_level(harm_score)
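
The comment about `np.nan` being "ignored by aggregations rather than treated as 0" is why both versions keep a NaN sentinel for missing scores (math.nan in 1.0.0, np.nan in 1.0.0b1). A small standalone illustration with pandas:

```python
import numpy as np
import pandas as pd

# A missing score stored as NaN is skipped by the mean, while 0 would drag it down.
print(pd.Series([4, np.nan, 6]).mean())  # 5.0  (NaN ignored)
print(pd.Series([4, 0, 6]).mean())       # 3.33... (0 counted)
```
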
@@ -408,34 +337,31 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     """
     headers = get_common_headers(token)
 
-    async with
-        response = await client.get(
+    async with get_async_http_client() as client:
+        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
             f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
             f"resourceGroups/{azure_ai_project['resource_group_name']}/"
             f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
             f"api-version=2023-08-01-preview",
             headers=headers,
+            timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT,
         )
 
     if response.status_code != 200:
-        msg =
-            f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
-            f"and make sure you have the necessary access permissions. "
-            f"Status code: {response.status_code}."
-        )
+        msg = f"Failed to retrieve the discovery service URL."
         raise EvaluationException(
             message=msg,
+            internal_message=msg,
             target=ErrorTarget.RAI_CLIENT,
-
-
-            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.UNKNOWN,
         )
 
     base_url = urlparse(response.json()["properties"]["discoveryUrl"])
     return f"{base_url.scheme}://{base_url.netloc}"
 
 
-async def get_rai_svc_url(project_scope:
+async def get_rai_svc_url(project_scope: dict, token: str) -> str:
     """Get the Responsible AI service URL
 
     :param project_scope: The Azure AI project scope details.
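
The discovery handling shown above keeps only the scheme and host of the workspace's `properties.discoveryUrl`. A standalone sketch with a placeholder URL:

```python
from urllib.parse import urlparse

discovery_url = "https://eastus.api.azureml.ms/discovery"  # placeholder value
base_url = urlparse(discovery_url)
print(f"{base_url.scheme}://{base_url.netloc}")  # https://eastus.api.azureml.ms
```
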
@@ -459,7 +385,7 @@ async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
     return rai_url
 
 
-async def fetch_or_reuse_token(credential: TokenCredential, token:
+async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str:
     """Get token. Fetch a new token if the current token is near expiry
 
     :param credential: The Azure authentication credential.
@@ -469,147 +395,35 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str]
     :type token: str
     :return: The Azure authentication token.
     """
-
-
-
+    acquire_new_token = True
+    try:
+        if token:
+            # Decode the token to get its expiration time
             decoded_token = jwt.decode(token, options={"verify_signature": False})
-    except jwt.PyJWTError:
-        pass
-    else:
             exp_time = decoded_token["exp"]
             current_time = time.time()
 
-        #
+            # Check if the token is near expiry
             if (exp_time - current_time) >= 300:
-
-
-
-
-
-async def evaluate_with_rai_service(
-    data: dict,
-    metric_name: str,
-    project_scope: AzureAIProject,
-    credential: TokenCredential,
-    annotation_task: str = Tasks.CONTENT_HARM,
-    metric_display_name=None,
-) -> Dict[str, Union[str, float]]:
-    """ "Evaluate the content safety of the response using Responsible AI service
-
-    :param data: The data to evaluate.
-    :type data: dict
-    :param metric_name: The evaluation metric to use.
-    :type metric_name: str
-    :param project_scope: The Azure AI project scope details.
-    :type project_scope: Dict
-    :param credential: The Azure authentication credential.
-    :type credential:
-        ~azure.core.credentials.TokenCredential
-    :param annotation_task: The annotation task to use.
-    :type annotation_task: str
-    :param metric_display_name: The display name of metric to use.
-    :type metric_display_name: str
-    :return: The parsed annotation result.
-    :rtype: Dict[str, Union[str, float]]
-    """
-
-    # Get RAI service URL from discovery service and check service availability
-    token = await fetch_or_reuse_token(credential)
-    rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, annotation_task)
-
-    # Submit annotation request and fetch result
-    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
-    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name, metric_display_name)
-
-    return result
-
-
-def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
-    """Generate the payload for the annotation request
-    :param content_type: The type of the content representing multimodal or images.
-    :type content_type: str
-    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
-    :type messages: str
-    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
-        in the payload.
-    :type metric: str
-    :return: The payload for the annotation request.
-    :rtype: Dict
-    """
-    include_metric = True
-    task = Tasks.CONTENT_HARM
-    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
-        task = Tasks.PROTECTED_MATERIAL
-        include_metric = False
-
-    if include_metric:
-        return {
-            "ContentType": content_type,
-            "Contents": [{"messages": messages}],
-            "AnnotationTask": task,
-            "MetricList": [metric],
-        }
-    return {
-        "ContentType": content_type,
-        "Contents": [{"messages": messages}],
-        "AnnotationTask": task,
-    }
+                acquire_new_token = False
+    except Exception: # pylint: disable=broad-exception-caught
+        pass
 
+    if acquire_new_token:
+        token = credential.get_token("https://management.azure.com/.default").token
 
-
-    """Submit request to Responsible AI service for evaluation and return operation ID
-    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
-    :type messages: str
-    :param metric: The evaluation metric to use.
-    :type metric: str
-    :param rai_svc_url: The Responsible AI service URL.
-    :type rai_svc_url: str
-    :param token: The Azure authentication token.
-    :type token: str
-    :return: The operation ID.
-    :rtype: str
-    """
-    ## handle json payload and payload from inference sdk strongly type messages
-    if len(messages) > 0 and not isinstance(messages[0], dict):
-        try:
-            from azure.ai.inference.models import ChatRequestMessage
-        except ImportError as ex:
-            error_message = (
-                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
-            )
-            raise MissingRequiredPackage(message=error_message) from ex
-    if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
-        messages = [message.as_dict() for message in messages]
+    return token
 
-    filtered_messages = [message for message in messages if message["role"] != "system"]
-    assistant_messages = [message for message in messages if message["role"] == "assistant"]
-    content_type = retrieve_content_type(assistant_messages, metric)
-    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
 
-
-
-    headers = get_common_headers(token)
-    async with get_async_http_client() as client:
-        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
-            url, json=payload, headers=headers
-        )
-    if response.status_code != 202:
-        raise HttpResponseError(
-            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
-        )
-    result = response.json()
-    operation_id = result["location"].split("/")[-1]
-    return operation_id
-
-
-async def evaluate_with_rai_service_multimodal(
-    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+async def evaluate_with_rai_service(
+    query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
 ):
     """ "Evaluate the content safety of the response using Responsible AI service
-
-    :
+
+    :param query: The query to evaluate.
+    :type query: str
+    :param response: The response to evaluate.
+    :type response: str
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
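
The 1.0.0b1 token-reuse logic above decodes the bearer token without verifying its signature and only reuses it if the `exp` claim is at least 300 seconds away. A standalone sketch with PyJWT (the token here is self-issued purely for illustration):

```python
import time

import jwt  # PyJWT, the same library this module imports

# Issue a throwaway token that expires in 10 minutes, then apply the reuse check.
token = jwt.encode({"exp": int(time.time()) + 600}, "secret", algorithm="HS256")
decoded = jwt.decode(token, options={"verify_signature": False})
print(decoded["exp"] - time.time() >= 300)  # True -> the token would be reused
```
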
@@ -620,13 +434,19 @@ async def evaluate_with_rai_service_multimodal(
     :return: The parsed annotation result.
     :rtype: List[List[Dict]]
     """
+    # Use DefaultAzureCredential if no credential is provided
+    # This is for the for batch run scenario as the credential cannot be serialized by promoptflow
+    if credential is None or credential == {}:
+        credential = DefaultAzureCredential()
 
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
     await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+
     # Submit annotation request and fetch result
-    operation_id = await
-    annotation_response =
+    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
+    annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
     result = parse_response(annotation_response, metric_name)
+
    return result