azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. azure/ai/evaluation/__init__.py +82 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/_experimental.py +172 -0
  4. azure/ai/evaluation/_common/constants.py +72 -0
  5. azure/ai/evaluation/_common/math.py +89 -0
  6. azure/ai/evaluation/_common/rai_service.py +632 -0
  7. azure/ai/evaluation/_common/utils.py +445 -0
  8. azure/ai/evaluation/_constants.py +72 -0
  9. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
  12. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
  13. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
  14. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  15. azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
  16. azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
  17. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
  18. azure/ai/evaluation/_evaluate/_utils.py +298 -0
  19. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  20. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  21. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
  22. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  23. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
  24. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  25. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  26. azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
  27. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
  28. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  47. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  48. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  49. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
  50. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  51. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  52. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  53. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  54. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  55. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  56. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  57. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  58. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  59. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
  60. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
  62. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
  64. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
  65. azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
  66. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
  67. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  68. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  69. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  70. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  72. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  73. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
  74. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
  75. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  76. azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
  77. azure/ai/evaluation/_exceptions.py +128 -0
  78. azure/ai/evaluation/_http_utils.py +466 -0
  79. azure/ai/evaluation/_model_configurations.py +123 -0
  80. azure/ai/evaluation/_user_agent.py +6 -0
  81. azure/ai/evaluation/_vendor/__init__.py +3 -0
  82. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  83. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  84. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  85. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  86. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  87. azure/ai/evaluation/_version.py +5 -0
  88. azure/ai/evaluation/py.typed +0 -0
  89. azure/ai/evaluation/simulator/__init__.py +16 -0
  90. azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
  91. azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
  92. azure/ai/evaluation/simulator/_constants.py +27 -0
  93. azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
  94. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  95. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  96. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  97. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  98. azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
  99. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  100. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  101. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
  102. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
  103. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  104. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
  105. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
  106. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
  107. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
  108. azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
  109. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  110. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
  111. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
  112. azure/ai/evaluation/simulator/_simulator.py +716 -0
  113. azure/ai/evaluation/simulator/_tracing.py +89 -0
  114. azure/ai/evaluation/simulator/_utils.py +132 -0
  115. azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
  116. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
  117. azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
  118. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
  119. azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
  120. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  121. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  122. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
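The listing above adds the package's public surface: the built-in evaluators, the evaluate entry point, and the simulators. For orientation only — this is not taken from the diff — a minimal usage sketch, assuming the evaluate function and RelevanceEvaluator exported from azure/ai/evaluation/__init__.py accept a model-configuration dict and a JSONL dataset path; all endpoint, key, and deployment values are placeholders:

```python
# Hypothetical quick-start sketch for azure-ai-evaluation 1.0.0 (illustration, not from this diff).
from azure.ai.evaluation import evaluate, RelevanceEvaluator

# AzureOpenAIModelConfiguration is defined as a TypedDict in _model_configurations.py,
# so a plain dict with the same keys is assumed to be accepted here.
model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "api_key": "<your-api-key>",
    "azure_deployment": "<your-deployment>",
}

relevance = RelevanceEvaluator(model_config)

# Score a single query/response pair directly...
print(relevance(query="What is the capital of France?", response="Paris."))

# ...or batch-evaluate a JSONL dataset through the evaluate() entry point.
result = evaluate(
    data="data.jsonl",  # assumed to contain "query" and "response" columns
    evaluators={"relevance": relevance},
)
```

The section below shows the largest addition in the list, the shared Responsible AI service client.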
azure/ai/evaluation/_common/rai_service.py
@@ -0,0 +1,632 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import asyncio
+import importlib.metadata
+import math
+import re
+import time
+import json
+import html
+from ast import literal_eval
+from typing import Dict, List, Optional, Union, cast
+from urllib.parse import urlparse
+from string import Template
+
+import jwt
+
+from promptflow.core._errors import MissingRequiredPackage
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
+from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.core.credentials import TokenCredential
+from azure.core.exceptions import HttpResponseError
+from azure.core.pipeline.policies import AsyncRetryPolicy
+
+from .constants import (
+    CommonConstants,
+    EvaluationMetrics,
+    RAIService,
+    Tasks,
+    _InternalEvaluationMetrics,
+)
+from .utils import get_harm_severity_level, retrieve_content_type
+
+try:
+    version = importlib.metadata.version("azure-ai-evaluation")
+except importlib.metadata.PackageNotFoundError:
+    version = "unknown"
+USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
+
+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+}
+
+
+def get_formatted_template(data: dict, annotation_task: str) -> str:
+    """Given the task and input data, produce a formatted string that will serve as the main
+    payload for the RAI service. Requires specific per-task logic.
+
+    :param data: The data to incorporate into the payload.
+    :type data: dict
+    :param annotation_task: The annotation task to use. This determines the template to use.
+    :type annotation_task: str
+    :return: The formatted based on the data and task template.
+    :rtype: str
+    """
+    # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+    # JSON format manually.
+    # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+    if annotation_task == Tasks.GROUNDEDNESS:
+        as_dict = {
+            "question": data.get("query", ""),
+            "answer": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
+    as_dict = {
+        "query": html.escape(data.get("query", "")),
+        "response": html.escape(data.get("response", "")),
+    }
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+    return user_text.replace("'", '\\"')
+
+
+def get_common_headers(token: str) -> Dict:
+    """Get common headers for the HTTP request
+
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The common headers.
+    :rtype: Dict
+    """
+    return {
+        "Authorization": f"Bearer {token}",
+        "Content-Type": "application/json",
+        "User-Agent": USER_AGENT,
+        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+        # https://github.com/encode/httpx/discussions/2959
+        "Connection": "close",
+    }
+
+
+def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
+    return get_async_http_client().with_policies(
+        retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
+    )
+
+
+async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
+    """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
+
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :param capability: The capability to check. Default is None.
+    :type capability: str
+    :raises Exception: If the service is not available in the region or the capability is not available.
+    """
+    headers = get_common_headers(token)
+    svc_liveness_url = rai_svc_url + "/checkannotation"
+
+    async with get_async_http_client() as client:
+        response = await client.get(svc_liveness_url, headers=headers)
+
+    if response.status_code != 200:
+        msg = (
+            f"RAI service is unavailable in this region, or you lack the necessary permissions "
+            f"to access the AI project. Status Code: {response.status_code}"
+        )
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
+
+    capabilities = response.json()
+    if capability and capability not in capabilities:
+        msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
+
+
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
+    """Generate the payload for the annotation request
+
+    :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
+    :type normalized_user_text: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :param annotation_task: The annotation task to be passed to service
+    :type annotation_task: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = annotation_task
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        include_metric = False
+    elif metric == _InternalEvaluationMetrics.ECI:
+        include_metric = False
+    elif metric == EvaluationMetrics.XPIA:
+        include_metric = False
+    return (
+        {
+            "UserTextList": [normalized_user_text],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+        if include_metric
+        else {
+            "UserTextList": [normalized_user_text],
+            "AnnotationTask": task,
+        }
+    )
+
+
+async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    normalized_user_text = get_formatted_template(data, annotation_task)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
+
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+
+    async with get_async_http_client_with_timeout() as client:
+        http_response = await client.post(url, json=payload, headers=headers)
+
+    if http_response.status_code != 202:
+        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
+        http_response.raise_for_status()
+    result = http_response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
+async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str) -> Dict:
+    """Fetch the annotation result from Responsible AI service
+
+    :param operation_id: The operation ID.
+    :type operation_id: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The annotation result.
+    :rtype: Dict
+    """
+    start = time.time()
+    request_count = 0
+
+    url = rai_svc_url + "/operations/" + operation_id
+    while True:
+        token = await fetch_or_reuse_token(credential, token)
+        headers = get_common_headers(token)
+
+        async with get_async_http_client_with_timeout() as client:
+            response = await client.get(url, headers=headers)
+
+        if response.status_code == 200:
+            return response.json()
+
+        request_count += 1
+        time_elapsed = time.time() - start
+        if time_elapsed > RAIService.TIMEOUT:
+            raise TimeoutError(f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds")
+
+        sleep_time = RAIService.SLEEP_TIME**request_count
+        await asyncio.sleep(sleep_time)
+
+
+def parse_response(  # pylint: disable=too-many-branches,too-many-statements
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
+    """Parse the annotation response from Responsible AI service for a content harm evaluation.
+
+    :param batch_response: The annotation response from Responsible AI service.
+    :type batch_response: List[Dict]
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
+    """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
+    # non-numeric metrics
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    }:
+        result = {}
+        if not batch_response or len(batch_response[0]) == 0:
+            return {}
+        if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+            pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+            for pm_metric_name in pm_metric_names:
+                response = batch_response[0][pm_metric_name]
+                response = response.replace("false", "False")
+                response = response.replace("true", "True")
+                parsed_response = literal_eval(response)
+                result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+                result[pm_metric_name + "_reason"] = (
+                    parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+                )
+            return result
+        if metric_name not in batch_response[0]:
+            return {}
+        response = batch_response[0][metric_name]
+        response = response.replace("false", "False")
+        response = response.replace("true", "True")
+        parsed_response = literal_eval(response)
+        # Use label instead of score since these are assumed to be boolean results.
+        # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+        result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+
+        if metric_name == EvaluationMetrics.XPIA:
+            # Add "manipulated_content", "intrusion" and "information_gathering" to the result
+            # if present else set them to math.nan
+            result[metric_display_name + "_manipulated_content"] = (
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
+            )
+            result[metric_display_name + "_intrusion"] = (
+                parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
+            )
+            result[metric_display_name + "_information_gathering"] = (
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
+            )
+        return result
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
+
+
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
+    """Parse the annotation response from Responsible AI service for a content harm evaluation.
+
+    :param batch_response: The annotation response from Responsible AI service.
+    :type batch_response: List[Dict]
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
+    """
+    # Fix the metric name if it's "hate_fairness"
+    # Eventually we will remove this fix once the RAI service is updated
+    key = metric_name if metric_display_name is None else metric_display_name
+    if key == EvaluationMetrics.HATE_FAIRNESS:
+        key = EvaluationMetrics.HATE_UNFAIRNESS
+
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }
+
+    response = batch_response[0]
+    if metric_name not in response:
+        return result
+
+    try:
+        harm_response = literal_eval(response[metric_name])
+    except Exception:  # pylint: disable=broad-exception-caught
+        harm_response = response[metric_name]
+
+    if harm_response != "" and isinstance(harm_response, dict):
+        # check if "output" is one key in harm_response
+        if "output" in harm_response:
+            harm_response = harm_response["output"]
+
+        # get content harm metric_value
+        if "label" in harm_response:
+            metric_value = float(harm_response["label"])
+        elif "valid" in harm_response:
+            metric_value = 0 if harm_response["valid"] else math.nan
+        else:
+            metric_value = math.nan
+
+        # get reason
+        if "reasoning" in harm_response:
+            reason = harm_response["reasoning"]
+        elif "reason" in harm_response:
+            reason = harm_response["reason"]
+        else:
+            reason = ""
+    elif harm_response != "" and isinstance(harm_response, str):
+        metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
+        if metric_value_match:
+            metric_value = int(metric_value_match[0])
+        else:
+            metric_value = math.nan
+        reason = harm_response
+    elif harm_response != "" and isinstance(harm_response, (int, float)):
+        if 0 < harm_response <= 7:
+            metric_value = harm_response
+        else:
+            metric_value = math.nan
+        reason = ""
+    else:
+        metric_value = math.nan
+        reason = ""
+
+    harm_score = metric_value
+    if metric_value == "n/a":
+        return result
+    if not math.isnan(metric_value):
+        # int(math.nan) causes a value error, and math.nan is already handled
+        # by get_harm_severity_level
+        harm_score = int(metric_value)
+    result[key] = get_harm_severity_level(harm_score)
+    result[key + "_score"] = harm_score
+    result[key + "_reason"] = reason
+
+    return result
+
+
+async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: str) -> str:
+    """Get the discovery service URL for the Azure AI project
+
+    :param azure_ai_project: The Azure AI project details.
+    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The discovery service URL.
+    :rtype: str
+    """
+    headers = get_common_headers(token)
+
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(
+            f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
+            f"resourceGroups/{azure_ai_project['resource_group_name']}/"
+            f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
+            f"api-version=2023-08-01-preview",
+            headers=headers,
+        )
+
+    if response.status_code != 200:
+        msg = (
+            f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+            f"and make sure you have the necessary access permissions. "
+            f"Status code: {response.status_code}."
+        )
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.PROJECT_ACCESS_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
+
+    base_url = urlparse(response.json()["properties"]["discoveryUrl"])
+    return f"{base_url.scheme}://{base_url.netloc}"
+
+
+async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
+    """Get the Responsible AI service URL
+
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The Responsible AI service URL.
+    :rtype: str
+    """
+    discovery_url = await _get_service_discovery_url(azure_ai_project=project_scope, token=token)
+    subscription_id = project_scope["subscription_id"]
+    resource_group_name = project_scope["resource_group_name"]
+    project_name = project_scope["project_name"]
+    base_url = discovery_url.rstrip("/")
+    rai_url = (
+        f"{base_url}/raisvc/v1.0"
+        f"/subscriptions/{subscription_id}"
+        f"/resourceGroups/{resource_group_name}"
+        f"/providers/Microsoft.MachineLearningServices/workspaces/{project_name}"
+    )
+    return rai_url
+
+
+async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
+    """Get token. Fetch a new token if the current token is near expiry
+
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :param token: The Azure authentication token. Defaults to None. If none, a new token will be fetched.
+    :type token: str
+    :return: The Azure authentication token.
+    """
+    if token:
+        # Decode the token to get its expiration time
+        try:
+            decoded_token = jwt.decode(token, options={"verify_signature": False})
+        except jwt.PyJWTError:
+            pass
+        else:
+            exp_time = decoded_token["exp"]
+            current_time = time.time()
+
+            # Return current token if not near expiry
+            if (exp_time - current_time) >= 300:
+                return token
+
+    return credential.get_token("https://management.azure.com/.default").token
+
+
+async def evaluate_with_rai_service(
+    data: dict,
+    metric_name: str,
+    project_scope: AzureAIProject,
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
+) -> Dict[str, Union[str, float]]:
+    """ "Evaluate the content safety of the response using Responsible AI service
+
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)
+
+    # Submit annotation request and fetch result
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name, metric_display_name)
+
+    return result
+
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## handle json payload and payload from inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## calling rai service for annotation
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    result = response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """ "Evaluate the content safety of the response using Responsible AI service
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    # Submit annotation request and fetch result
+    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name)
+    return result
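
The module above implements a single flow: fetch or reuse an AAD token, resolve the project's discovery URL, build the RAI service URL, check /checkannotation, POST to /submitannotation, poll /operations/{id}, and parse the annotation into a flat result dict. A minimal sketch of driving that flow through the top-level coroutine defined in this file; the project identifiers and credential are placeholders, and EvaluationMetrics.VIOLENCE is assumed to be one of the metric constants defined in the sibling constants module:

```python
# Hypothetical driver for evaluate_with_rai_service (illustration only, not part of the diff).
import asyncio

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service


async def main():
    result = await evaluate_with_rai_service(
        data={"query": "How do I bake a cake?", "response": "Mix flour, sugar, and eggs, then bake."},
        metric_name=EvaluationMetrics.VIOLENCE,  # assumed constant; CONTENT_HARM is the default task
        project_scope={
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<ai-project-name>",
        },
        credential=DefaultAzureCredential(),
    )
    # Per parse_response, a content-harm result has the shape:
    # {"violence": <severity level>, "violence_score": <0-7>, "violence_reason": "..."}
    print(result)


if __name__ == "__main__":
    asyncio.run(main())
```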