azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +82 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/_experimental.py +172 -0
- azure/ai/evaluation/_common/constants.py +72 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/rai_service.py +632 -0
- azure/ai/evaluation/_common/utils.py +445 -0
- azure/ai/evaluation/_constants.py +72 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
- azure/ai/evaluation/_evaluate/_utils.py +298 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
- azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
- azure/ai/evaluation/_exceptions.py +128 -0
- azure/ai/evaluation/_http_utils.py +466 -0
- azure/ai/evaluation/_model_configurations.py +123 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +16 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
- azure/ai/evaluation/simulator/_constants.py +27 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
- azure/ai/evaluation/simulator/_simulator.py +716 -0
- azure/ai/evaluation/simulator/_tracing.py +89 -0
- azure/ai/evaluation/simulator/_utils.py +132 -0
- azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
@@ -0,0 +1,632 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import asyncio
+import importlib.metadata
+import math
+import re
+import time
+import json
+import html
+from ast import literal_eval
+from typing import Dict, List, Optional, Union, cast
+from urllib.parse import urlparse
+from string import Template
+
+import jwt
+
+from promptflow.core._errors import MissingRequiredPackage
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
+from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.core.credentials import TokenCredential
+from azure.core.exceptions import HttpResponseError
+from azure.core.pipeline.policies import AsyncRetryPolicy
+
+from .constants import (
+    CommonConstants,
+    EvaluationMetrics,
+    RAIService,
+    Tasks,
+    _InternalEvaluationMetrics,
+)
+from .utils import get_harm_severity_level, retrieve_content_type
+
+try:
+    version = importlib.metadata.version("azure-ai-evaluation")
+except importlib.metadata.PackageNotFoundError:
+    version = "unknown"
+USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
+
+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+}
+
+
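The `DEFAULT` entry above is a plain `string.Template`: `$query` and `$response` are substituted while the surrounding braces stay literal. A minimal sketch of the substitution (illustrative values, not from the wheel):

```python
from string import Template

# Same shape as the "DEFAULT" template above; the values are invented.
tmpl = Template("<Human>{$query}</><System>{$response}</>")
print(tmpl.substitute(query="hello", response="hi there"))
# -> <Human>{hello}</><System>{hi there}</>
```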
+def get_formatted_template(data: dict, annotation_task: str) -> str:
+    """Given the task and input data, produce a formatted string that will serve as the main
+    payload for the RAI service. Requires specific per-task logic.
+
+    :param data: The data to incorporate into the payload.
+    :type data: dict
+    :param annotation_task: The annotation task to use. This determines the template to use.
+    :type annotation_task: str
+    :return: The formatted string based on the data and task template.
+    :rtype: str
+    """
+    # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+    # JSON format manually.
+    # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+    if annotation_task == Tasks.GROUNDEDNESS:
+        as_dict = {
+            "question": data.get("query", ""),
+            "answer": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
+    as_dict = {
+        "query": html.escape(data.get("query", "")),
+        "response": html.escape(data.get("response", "")),
+    }
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+    return user_text.replace("'", '\\"')
+
+
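A minimal usage sketch of `get_formatted_template` (illustrative data; `Tasks` comes from the sibling `_common/constants.py` module, and the expected outputs are annotated as comments):

```python
from azure.ai.evaluation._common.constants import Tasks
from azure.ai.evaluation._common.rai_service import get_formatted_template

data = {
    "query": "What's the capital of France?",
    "response": "Paris",
    "context": "France is in Europe.",
}

# Groundedness gets a manually built JSON payload:
print(get_formatted_template(data, Tasks.GROUNDEDNESS))
# {"question": "What's the capital of France?", "answer": "Paris", "context": "France is in Europe."}

# Any other task falls back to the HTML-escaped default template:
print(get_formatted_template(data, Tasks.CONTENT_HARM))
# <Human>{What&#x27;s the capital of France?}</><System>{Paris}</>
```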
+def get_common_headers(token: str) -> Dict:
+    """Get common headers for the HTTP request
+
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The common headers.
+    :rtype: Dict
+    """
+    return {
+        "Authorization": f"Bearer {token}",
+        "Content-Type": "application/json",
+        "User-Agent": USER_AGENT,
+        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+        # https://github.com/encode/httpx/discussions/2959
+        "Connection": "close",
+    }
+
+
+def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
+    return get_async_http_client().with_policies(
+        retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
+    )
+
+
+async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
+    """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
+
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :param capability: The capability to check. Default is None.
+    :type capability: str
+    :raises Exception: If the service is not available in the region or the capability is not available.
+    """
+    headers = get_common_headers(token)
+    svc_liveness_url = rai_svc_url + "/checkannotation"
+
+    async with get_async_http_client() as client:
+        response = await client.get(svc_liveness_url, headers=headers)
+
+    if response.status_code != 200:
+        msg = (
+            f"RAI service is unavailable in this region, or you lack the necessary permissions "
+            f"to access the AI project. Status Code: {response.status_code}"
+        )
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
+
+    capabilities = response.json()
+    if capability and capability not in capabilities:
+        msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
+
+
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
+    """Generate the payload for the annotation request
+
+    :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
+    :type normalized_user_text: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :param annotation_task: The annotation task to be passed to service
+    :type annotation_task: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = annotation_task
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        include_metric = False
+    elif metric == _InternalEvaluationMetrics.ECI:
+        include_metric = False
+    elif metric == EvaluationMetrics.XPIA:
+        include_metric = False
+    return (
+        {
+            "UserTextList": [normalized_user_text],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+        if include_metric
+        else {
+            "UserTextList": [normalized_user_text],
+            "AnnotationTask": task,
+        }
+    )
+
+
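For reference, a sketch of the two payload shapes `generate_payload` can return; the task and metric strings below are assumed example values, the real ones come from the `Tasks` and `EvaluationMetrics` constants:

```python
# Most metrics keep the "MetricList" key (include_metric=True):
payload_with_metric = {
    "UserTextList": ["<Human>{...}</><System>{...}</>"],
    "AnnotationTask": "content harm",   # assumed example value
    "MetricList": ["violence"],         # assumed example value
}

# Protected material, ECI, and XPIA drop it (include_metric=False):
payload_without_metric = {
    "UserTextList": ["<Human>{...}</><System>{...}</>"],
    "AnnotationTask": "protected material",  # assumed example value
}
```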
+async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    normalized_user_text = get_formatted_template(data, annotation_task)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
+
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+
+    async with get_async_http_client_with_timeout() as client:
+        http_response = await client.post(url, json=payload, headers=headers)
+
+    if http_response.status_code != 202:
+        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
+        http_response.raise_for_status()
+    result = http_response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
+async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str) -> Dict:
+    """Fetch the annotation result from Responsible AI service
+
+    :param operation_id: The operation ID.
+    :type operation_id: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The annotation result.
+    :rtype: Dict
+    """
+    start = time.time()
+    request_count = 0
+
+    url = rai_svc_url + "/operations/" + operation_id
+    while True:
+        token = await fetch_or_reuse_token(credential, token)
+        headers = get_common_headers(token)
+
+        async with get_async_http_client_with_timeout() as client:
+            response = await client.get(url, headers=headers)
+
+        if response.status_code == 200:
+            return response.json()
+
+        request_count += 1
+        time_elapsed = time.time() - start
+        if time_elapsed > RAIService.TIMEOUT:
+            raise TimeoutError(f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds")
+
+        sleep_time = RAIService.SLEEP_TIME**request_count
+        await asyncio.sleep(sleep_time)
+
+
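The polling loop in `fetch_result` backs off exponentially between requests (`RAIService.SLEEP_TIME ** request_count`) until `RAIService.TIMEOUT` is exceeded. A sketch of the schedule, assuming for illustration that `SLEEP_TIME` were 2; the real constants live in `_common/constants.py`:

```python
SLEEP_TIME = 2  # hypothetical value for illustration only
delays = [SLEEP_TIME ** attempt for attempt in range(1, 6)]
print(delays)  # [2, 4, 8, 16, 32] -- seconds between successive polls
```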
+def parse_response(  # pylint: disable=too-many-branches,too-many-statements
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
+    """Parse the annotation response from Responsible AI service for a content harm evaluation.
+
+    :param batch_response: The annotation response from Responsible AI service.
+    :type batch_response: List[Dict]
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
+    """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
+    # non-numeric metrics
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    }:
+        result = {}
+        if not batch_response or len(batch_response[0]) == 0:
+            return {}
+        if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+            pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+            for pm_metric_name in pm_metric_names:
+                response = batch_response[0][pm_metric_name]
+                response = response.replace("false", "False")
+                response = response.replace("true", "True")
+                parsed_response = literal_eval(response)
+                result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+                result[pm_metric_name + "_reason"] = (
+                    parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+                )
+            return result
+        if metric_name not in batch_response[0]:
+            return {}
+        response = batch_response[0][metric_name]
+        response = response.replace("false", "False")
+        response = response.replace("true", "True")
+        parsed_response = literal_eval(response)
+        # Use label instead of score since these are assumed to be boolean results.
+        # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+        result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+
+        if metric_name == EvaluationMetrics.XPIA:
+            # Add "manipulated_content", "intrusion" and "information_gathering" to the result
+            # if present else set them to math.nan
+            result[metric_display_name + "_manipulated_content"] = (
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
+            )
+            result[metric_display_name + "_intrusion"] = (
+                parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
+            )
+            result[metric_display_name + "_information_gathering"] = (
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
+            )
+        return result
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
+
+
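A sketch of the input shape `parse_response` expects for a non-numeric metric: a one-element batch whose value is a stringified dict. The payload below is invented, and the result keys assume the metric constant is used as a plain string, as it is in this module:

```python
from azure.ai.evaluation._common.constants import EvaluationMetrics

metric_key = EvaluationMetrics.PROTECTED_MATERIAL  # used as a plain string key in this module
mock_batch_response = [
    {metric_key: '{"label": false, "reasoning": "No copyrighted material detected."}'}
]
# parse_response(mock_batch_response, EvaluationMetrics.PROTECTED_MATERIAL) would yield roughly:
# {f"{metric_key}_label": False, f"{metric_key}_reason": "No copyrighted material detected."}
```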
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
+    """Parse the annotation response from Responsible AI service for a content harm evaluation.
+
+    :param batch_response: The annotation response from Responsible AI service.
+    :type batch_response: List[Dict]
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
+    """
+    # Fix the metric name if it's "hate_fairness"
+    # Eventually we will remove this fix once the RAI service is updated
+    key = metric_name if metric_display_name is None else metric_display_name
+    if key == EvaluationMetrics.HATE_FAIRNESS:
+        key = EvaluationMetrics.HATE_UNFAIRNESS
+
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }
+
+    response = batch_response[0]
+    if metric_name not in response:
+        return result
+
+    try:
+        harm_response = literal_eval(response[metric_name])
+    except Exception:  # pylint: disable=broad-exception-caught
+        harm_response = response[metric_name]
+
+    if harm_response != "" and isinstance(harm_response, dict):
+        # check if "output" is one key in harm_response
+        if "output" in harm_response:
+            harm_response = harm_response["output"]
+
+        # get content harm metric_value
+        if "label" in harm_response:
+            metric_value = float(harm_response["label"])
+        elif "valid" in harm_response:
+            metric_value = 0 if harm_response["valid"] else math.nan
+        else:
+            metric_value = math.nan
+
+        # get reason
+        if "reasoning" in harm_response:
+            reason = harm_response["reasoning"]
+        elif "reason" in harm_response:
+            reason = harm_response["reason"]
+        else:
+            reason = ""
+    elif harm_response != "" and isinstance(harm_response, str):
+        metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
+        if metric_value_match:
+            metric_value = int(metric_value_match[0])
+        else:
+            metric_value = math.nan
+        reason = harm_response
+    elif harm_response != "" and isinstance(harm_response, (int, float)):
+        if 0 < harm_response <= 7:
+            metric_value = harm_response
+        else:
+            metric_value = math.nan
+        reason = ""
+    else:
+        metric_value = math.nan
+        reason = ""
+
+    harm_score = metric_value
+    if metric_value == "n/a":
+        return result
+    if not math.isnan(metric_value):
+        # int(math.nan) causes a value error, and math.nan is already handled
+        # by get_harm_severity_level
+        harm_score = int(metric_value)
+    result[key] = get_harm_severity_level(harm_score)
+    result[key + "_score"] = harm_score
+    result[key + "_reason"] = reason
+
+    return result
+
+
+async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: str) -> str:
+    """Get the discovery service URL for the Azure AI project
+
+    :param azure_ai_project: The Azure AI project details.
+    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The discovery service URL.
+    :rtype: str
+    """
+    headers = get_common_headers(token)
+
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(
+            f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
+            f"resourceGroups/{azure_ai_project['resource_group_name']}/"
+            f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
+            f"api-version=2023-08-01-preview",
+            headers=headers,
+        )
+
+    if response.status_code != 200:
+        msg = (
+            f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+            f"and make sure you have the necessary access permissions. "
+            f"Status code: {response.status_code}."
+        )
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.PROJECT_ACCESS_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
+
+    base_url = urlparse(response.json()["properties"]["discoveryUrl"])
+    return f"{base_url.scheme}://{base_url.netloc}"
+
+
+async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
+    """Get the Responsible AI service URL
+
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The Responsible AI service URL.
+    :rtype: str
+    """
+    discovery_url = await _get_service_discovery_url(azure_ai_project=project_scope, token=token)
+    subscription_id = project_scope["subscription_id"]
+    resource_group_name = project_scope["resource_group_name"]
+    project_name = project_scope["project_name"]
+    base_url = discovery_url.rstrip("/")
+    rai_url = (
+        f"{base_url}/raisvc/v1.0"
+        f"/subscriptions/{subscription_id}"
+        f"/resourceGroups/{resource_group_name}"
+        f"/providers/Microsoft.MachineLearningServices/workspaces/{project_name}"
+    )
+    return rai_url
+
+
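Putting `_get_service_discovery_url` and `get_rai_svc_url` together, the service URL ends up in the following shape; the discovery host and scope values below are placeholders:

```python
discovery_host = "https://eastus.api.azureml.ms"  # placeholder; taken from the project's "discoveryUrl"
rai_url = (
    f"{discovery_host}/raisvc/v1.0"
    "/subscriptions/<subscription-id>"
    "/resourceGroups/<resource-group>"
    "/providers/Microsoft.MachineLearningServices/workspaces/<ai-project-name>"
)
```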
+async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
+    """Get token. Fetch a new token if the current token is near expiry
+
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :param token: The Azure authentication token. Defaults to None. If none, a new token will be fetched.
+    :type token: str
+    :return: The Azure authentication token.
+    """
+    if token:
+        # Decode the token to get its expiration time
+        try:
+            decoded_token = jwt.decode(token, options={"verify_signature": False})
+        except jwt.PyJWTError:
+            pass
+        else:
+            exp_time = decoded_token["exp"]
+            current_time = time.time()
+
+            # Return current token if not near expiry
+            if (exp_time - current_time) >= 300:
+                return token
+
+    return credential.get_token("https://management.azure.com/.default").token
+
+
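A standalone sketch of the reuse check above: the token is decoded without signature verification and kept only if it still has the 300-second buffer left, otherwise a fresh management-scope token is requested. The helper name and structure here are illustrative:

```python
import time

import jwt


def _token_still_fresh(token: str, buffer_seconds: int = 300) -> bool:
    """Illustrative helper mirroring the expiry check in fetch_or_reuse_token."""
    try:
        claims = jwt.decode(token, options={"verify_signature": False})
    except jwt.PyJWTError:
        return False
    return (claims["exp"] - time.time()) >= buffer_seconds
```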
+async def evaluate_with_rai_service(
+    data: dict,
+    metric_name: str,
+    project_scope: AzureAIProject,
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
+) -> Dict[str, Union[str, float]]:
+    """Evaluate the content safety of the response using Responsible AI service
+
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)
+
+    # Submit annotation request and fetch result
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name, metric_display_name)
+
+    return result
+
+
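An end-to-end usage sketch of `evaluate_with_rai_service`; the project scope values and the expected output are placeholders, and `DefaultAzureCredential` is just one possible `TokenCredential` (it requires the separate `azure-identity` package):

```python
import asyncio

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service

project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<ai-project-name>",
}

result = asyncio.run(
    evaluate_with_rai_service(
        data={"query": "How do I bake a cake?", "response": "Mix flour, sugar, and eggs..."},
        metric_name=EvaluationMetrics.HATE_UNFAIRNESS,
        project_scope=project_scope,
        credential=DefaultAzureCredential(),
    )
)
print(result)
# Illustrative shape, assuming the constant resolves to "hate_unfairness":
# {"hate_unfairness": "Very low", "hate_unfairness_score": 0, "hate_unfairness_reason": "..."}
```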
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## handle json payload and payload from inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## calling rai service for annotation
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    result = response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
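A small sketch of the message handling in `submit_multimodal_request`: system messages are filtered out of the payload and assistant messages drive the content-type lookup. The message list below is invented:

```python
messages = [
    {"role": "system", "content": "You are a helpful assistant."},  # dropped from the payload
    {"role": "user", "content": [{"type": "text", "text": "Describe the attached image."}]},
    {"role": "assistant", "content": "It shows a sunset over the sea."},
]

filtered_messages = [m for m in messages if m["role"] != "system"]
assistant_messages = [m for m in messages if m["role"] == "assistant"]
# retrieve_content_type(assistant_messages, metric) then decides the "ContentType" field.
```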
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """Evaluate the content safety of the response using Responsible AI service
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    # Submit annotation request and fetch result
+    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name)
+    return result