azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +11 -0
- azure/ai/evaluation/_common/rai_service.py +172 -35
- azure/ai/evaluation/_common/utils.py +162 -23
- azure/ai/evaluation/_constants.py +6 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
- azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
- azure/ai/evaluation/_evaluate/_utils.py +40 -7
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
- azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
- azure/ai/evaluation/_exceptions.py +17 -0
- azure/ai/evaluation/_model_configurations.py +18 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +115 -61
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py
CHANGED

```diff
@@ -12,10 +12,19 @@ from ._evaluators._content_safety import (
     SexualEvaluator,
     ViolenceEvaluator,
 )
+from ._evaluators._multimodal._content_safety_multimodal import (
+    ContentSafetyMultimodalEvaluator,
+    HateUnfairnessMultimodalEvaluator,
+    SelfHarmMultimodalEvaluator,
+    SexualMultimodalEvaluator,
+    ViolenceMultimodalEvaluator,
+)
+from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
 from ._evaluators._f1_score import F1ScoreEvaluator
 from ._evaluators._fluency import FluencyEvaluator
 from ._evaluators._gleu import GleuScoreEvaluator
 from ._evaluators._groundedness import GroundednessEvaluator
+from ._evaluators._service_groundedness import GroundednessProEvaluator
 from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator
@@ -27,7 +36,10 @@ from ._evaluators._xpia import IndirectAttackEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
+    Conversation,
+    EvaluationResult,
     EvaluatorConfig,
+    Message,
     OpenAIModelConfiguration,
 )
 
@@ -37,6 +49,7 @@ __all__ = [
     "F1ScoreEvaluator",
     "FluencyEvaluator",
     "GroundednessEvaluator",
+    "GroundednessProEvaluator",
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",
@@ -57,4 +70,13 @@ __all__ = [
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
    "EvaluatorConfig",
+    "Conversation",
+    "Message",
+    "EvaluationResult",
+    "ContentSafetyMultimodalEvaluator",
+    "HateUnfairnessMultimodalEvaluator",
+    "SelfHarmMultimodalEvaluator",
+    "SexualMultimodalEvaluator",
+    "ViolenceMultimodalEvaluator",
+    "ProtectedMaterialMultimodalEvaluator",
 ]
```
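Note: b5 surfaces the new multimodal evaluators and the service-backed "Groundedness Pro" evaluator at the package root. A minimal usage sketch, assuming an Azure AI project and a signed-in credential (project values are placeholders; check the release docs for exact constructor signatures):

```python
# Illustrative only: project details below are placeholders.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import GroundednessProEvaluator, ViolenceMultimodalEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",       # placeholder
    "resource_group_name": "<resource-group>",    # placeholder
    "project_name": "<project-name>",             # placeholder
}
credential = DefaultAzureCredential()

# Service-backed groundedness over a query/response/context triple.
groundedness_pro = GroundednessProEvaluator(
    azure_ai_project=azure_ai_project, credential=credential
)
print(groundedness_pro(
    query="What shape is the Earth?",
    response="The Earth is roughly spherical.",
    context="The Earth is an oblate spheroid.",
))

# Multimodal content safety over a conversation that mixes image and text content.
violence_mm = ViolenceMultimodalEvaluator(
    credential=credential, azure_ai_project=azure_ai_project
)
conversation = {
    "messages": [
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
            {"type": "text", "text": "Describe this image."},
        ]},
        {"role": "assistant", "content": "A quiet landscape."},
    ]
}
print(violence_mm(conversation=conversation))
```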
azure/ai/evaluation/_common/constants.py
CHANGED

```diff
@@ -6,6 +6,9 @@ from enum import Enum
 from azure.core import CaseInsensitiveEnumMeta
 
 
+PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
+
+
 class CommonConstants:
     """Define common constants."""
 
@@ -35,6 +38,7 @@ class Tasks:
     CONTENT_HARM = "content harm"
     PROTECTED_MATERIAL = "protected material"
     XPIA = "xpia"
+    GROUNDEDNESS = "groundedness"
 
 
 class _InternalAnnotationTasks:
@@ -56,6 +60,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
     XPIA = "xpia"
+    GROUNDEDNESS = "generic_groundedness"
 
 
 class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
```
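Note: judging by the name, `PROMPT_BASED_REASON_EVALUATORS` flags the prompt-based evaluators that emit a reason alongside their score, and the two `GROUNDEDNESS` entries wire up the service-side groundedness task and metric names. A standalone sketch of the enum semantics used here (requires azure-core; the class below is a re-creation, not an import of the real module):

```python
from enum import Enum

from azure.core import CaseInsensitiveEnumMeta

PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]

class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
    XPIA = "xpia"
    GROUNDEDNESS = "generic_groundedness"

assert EvaluationMetrics["xpia"] is EvaluationMetrics.XPIA       # name lookup is case-insensitive
assert EvaluationMetrics.GROUNDEDNESS == "generic_groundedness"  # str subclass compares equal to its value
assert "groundedness" in PROMPT_BASED_REASON_EVALUATORS
```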
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
import math
|
|
6
6
|
from typing import List
|
|
7
7
|
|
|
8
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
9
|
+
|
|
8
10
|
|
|
9
11
|
def list_sum(lst: List[float]) -> float:
|
|
10
12
|
return sum(lst)
|
|
@@ -15,4 +17,13 @@ def list_mean(lst: List[float]) -> float:
|
|
|
15
17
|
|
|
16
18
|
|
|
17
19
|
def list_mean_nan_safe(lst: List[float]) -> float:
|
|
20
|
+
msg = "All score values are NaN. The mean cannot be calculated."
|
|
21
|
+
if all(math.isnan(l) for l in lst):
|
|
22
|
+
raise EvaluationException(
|
|
23
|
+
message=msg,
|
|
24
|
+
internal_message=msg,
|
|
25
|
+
blame=ErrorBlame.USER_ERROR,
|
|
26
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
27
|
+
target=ErrorTarget.CONVERSATION,
|
|
28
|
+
)
|
|
18
29
|
return list_mean([l for l in lst if not math.isnan(l)])
|
|
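Note: `list_mean_nan_safe` now rejects all-NaN input with a user-facing `EvaluationException` instead of failing downstream when the NaN filter leaves nothing to average. A standalone sketch of the new behavior (re-implemented here with a stand-in exception so it runs without the package):

```python
import math
from typing import List

class EvaluationException(Exception):
    """Stand-in for azure.ai.evaluation._exceptions.EvaluationException."""

def list_mean(lst: List[float]) -> float:
    return sum(lst) / len(lst)

def list_mean_nan_safe(lst: List[float]) -> float:
    # All-NaN input is now rejected explicitly rather than failing downstream.
    if all(math.isnan(v) for v in lst):
        raise EvaluationException("All score values are NaN. The mean cannot be calculated.")
    return list_mean([v for v in lst if not math.isnan(v)])

print(list_mean_nan_safe([4.0, math.nan, 2.0]))  # 3.0 -- NaNs are dropped
try:
    list_mean_nan_safe([math.nan, math.nan])
except EvaluationException as exc:
    print(exc)  # All score values are NaN. The mean cannot be calculated.
```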
azure/ai/evaluation/_common/rai_service.py
CHANGED

```diff
@@ -9,13 +9,16 @@ import time
 from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
+from string import Template
 
 import jwt
 
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
+from azure.core.exceptions import HttpResponseError
 from azure.core.pipeline.policies import AsyncRetryPolicy
 
 from .constants import (
@@ -23,10 +26,9 @@ from .constants import (
     EvaluationMetrics,
     RAIService,
     Tasks,
-    _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type
 
 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -34,6 +36,11 @@ except importlib.metadata.PackageNotFoundError:
     version = "unknown"
 USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+    Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+}
+
 
 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
```
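Note: the new `USER_TEXT_TEMPLATE_DICT` keys the user-text payload format by annotation task: the groundedness task sends a JSON-shaped string with question/answer/context, while every other task keeps the legacy `<Human>`/`<System>` wrapper. The substitution is plain `string.Template`, as this standalone sketch shows:

```python
from string import Template

# Same templates as above, keyed by task name ("groundedness" is Tasks.GROUNDEDNESS).
USER_TEXT_TEMPLATE_DICT = {
    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
    "groundedness": Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
}

data = {"query": "What shape is the Earth?", "response": "Round.",
        "context": "The Earth is an oblate spheroid."}

print(USER_TEXT_TEMPLATE_DICT["groundedness"].substitute(**data))
# {"question": "What shape is the Earth?", "answer": "Round.", "context": "The Earth is an oblate spheroid."}

print(USER_TEXT_TEMPLATE_DICT["DEFAULT"].substitute(query="hi", response="hello"))
# <Human>{hi}</><System>{hello}</>
```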
```diff
@@ -99,7 +106,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     )
 
 
-def generate_payload(normalized_user_text: str, metric: str) -> Dict:
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
     """Generate the payload for the annotation request
 
     :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
@@ -107,19 +114,18 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
         in the payload.
     :type metric: str
+    :param annotation_task: The annotation task to be passed to service
+    :type annotation_task: str
     :return: The payload for the annotation request.
     :rtype: Dict
     """
     include_metric = True
-    task = Tasks.CONTENT_HARM
+    task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
-        task = Tasks.PROTECTED_MATERIAL
         include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
-        task = _InternalAnnotationTasks.ECI
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
-        task = Tasks.XPIA
         include_metric = False
     return (
         {
@@ -135,25 +141,25 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )
 
 
-async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
+async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
     :type rai_svc_url: str
     :param token: The Azure authentication token.
     :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
     :return: The operation ID.
     :rtype: str
     """
-    user_text = f"<Human>{query}</><System>{response}</>"
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data)
     normalized_user_text = user_text.replace("'", '\\"')
-    payload = generate_payload(normalized_user_text, metric)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
 
     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)
@@ -164,7 +170,6 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
     if http_response.status_code != 202:
         print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
         http_response.raise_for_status()
-
     result = http_response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id
```
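Note: `submit_request` now templates a caller-supplied `data` dict into user text instead of taking `query`/`response` positionally, then normalizes it by rewriting single quotes as escaped double quotes before building the payload. A quick standalone check of those two steps:

```python
from string import Template

# Same DEFAULT template as in the diff above.
template = Template("<Human>{$query}</><System>{$response}</>")
data = {"query": "What's the capital of France?", "response": "It's Paris."}

user_text = template.substitute(**data)
normalized_user_text = user_text.replace("'", '\\"')
print(normalized_user_text)
# <Human>{What\"s the capital of France?}</><System>{It\"s Paris.}</>
```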
```diff
@@ -208,7 +213,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str) -> Dict:
 
 
 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
-    batch_response: List[Dict], metric_name: str
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
 ) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
@@ -216,11 +221,20 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
     # non-numeric metrics
-    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    }:
         if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
@@ -230,42 +244,50 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     result = {}
     # Use label instead of score since these are assumed to be boolean results.
     # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
-    result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
-    result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+    result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+    result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
 
     if metric_name == EvaluationMetrics.XPIA:
         # Add "manipulated_content", "intrusion" and "information_gathering" to the result
         # if present else set them to math.nan
-        result[metric_name + "_manipulated_content"] = (
+        result[metric_display_name + "_manipulated_content"] = (
             parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
         )
-        result[metric_name + "_intrusion"] = (
+        result[metric_display_name + "_intrusion"] = (
             parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
         )
-        result[metric_name + "_information_gathering"] = (
+        result[metric_display_name + "_information_gathering"] = (
             parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
         )
         return result
-    return _parse_content_harm_response(batch_response, metric_name)
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
 
 
-def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
-    key = metric_name
+    key = metric_name if metric_display_name is None else metric_display_name
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS
 
-    result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": math.nan}
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }
 
     response = batch_response[0]
     if metric_name not in response:
```
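Note: the new `metric_display_name` parameter lets a caller (for example, the service-groundedness evaluator) report results under a friendlier key prefix than the raw service metric name, so a response parsed with `metric_name="generic_groundedness"` can come back keyed as, say, `groundedness_pro`/`_score`/`_reason`. A toy illustration of the key-seeding logic (hypothetical display name, standalone):

```python
import math

def build_default_result(key: str) -> dict:
    # Mirrors the seeded result dict in _parse_content_harm_response:
    # every metric starts as NaN until the service response fills it in.
    return {key: math.nan, f"{key}_score": math.nan, f"{key}_reason": math.nan}

print(build_default_result("groundedness_pro"))
# {'groundedness_pro': nan, 'groundedness_pro_score': nan, 'groundedness_pro_reason': nan}
```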
```diff
@@ -283,7 +305,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
 
     # get content harm metric_value
     if "label" in harm_response:
-        metric_value = harm_response["label"]
+        metric_value = float(harm_response["label"])
     elif "valid" in harm_response:
         metric_value = 0 if harm_response["valid"] else math.nan
     else:
@@ -314,6 +336,8 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
         reason = ""
 
     harm_score = metric_value
+    if metric_value == "n/a":
+        return result
     if not math.isnan(metric_value):
         # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
@@ -412,14 +436,17 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
 
 
 async def evaluate_with_rai_service(
-    query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+    data: dict,
+    metric_name: str,
+    project_scope: AzureAIProject,
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
 ) -> Dict[str, Union[str, float]]:
     """ "Evaluate the content safety of the response using Responsible AI service
 
-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -427,6 +454,10 @@ async def evaluate_with_rai_service(
     :param credential: The Azure authentication credential.
     :type credential:
         ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
@@ -434,11 +465,117 @@ async def evaluate_with_rai_service(
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)
 
     # Submit annotation request and fetch result
-    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
     annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name)
+    result = parse_response(annotation_response, metric_name, metric_display_name)
+
+    return result
+
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## handle json payload and payload from inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## calling rai service for annotation
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    result = response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
 
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """ "Evaluate the content safety of the response using Responsible AI service
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    # Submit annotation request and fetch result
+    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name)
     return result
```