azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. azure/ai/evaluation/__init__.py +22 -0
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +4 -0
  3. azure/ai/evaluation/_common/constants.py +5 -0
  4. azure/ai/evaluation/_common/math.py +73 -2
  5. azure/ai/evaluation/_common/rai_service.py +250 -62
  6. azure/ai/evaluation/_common/utils.py +196 -23
  7. azure/ai/evaluation/_constants.py +7 -6
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +13 -4
  10. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +19 -6
  11. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +55 -14
  13. azure/ai/evaluation/_evaluate/_evaluate.py +312 -228
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +7 -6
  15. azure/ai/evaluation/_evaluate/_utils.py +46 -11
  16. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +17 -18
  17. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +67 -31
  18. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +37 -24
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +21 -9
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +52 -16
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +91 -48
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +100 -26
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +94 -26
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +96 -26
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +97 -26
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +31 -4
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +67 -36
  30. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  31. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +14 -16
  32. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -34
  33. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  34. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  35. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +20 -27
  36. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  37. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  38. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  44. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +87 -31
  45. azure/ai/evaluation/_evaluators/_qa/_qa.py +23 -31
  46. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +72 -36
  47. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  48. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +83 -125
  49. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  50. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +26 -27
  51. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  52. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  53. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +37 -28
  54. azure/ai/evaluation/_evaluators/_xpia/xpia.py +94 -33
  55. azure/ai/evaluation/_exceptions.py +19 -0
  56. azure/ai/evaluation/_model_configurations.py +83 -15
  57. azure/ai/evaluation/_version.py +1 -1
  58. azure/ai/evaluation/simulator/__init__.py +2 -1
  59. azure/ai/evaluation/simulator/_adversarial_scenario.py +20 -1
  60. azure/ai/evaluation/simulator/_adversarial_simulator.py +29 -35
  61. azure/ai/evaluation/simulator/_constants.py +11 -1
  62. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  63. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  64. azure/ai/evaluation/simulator/_direct_attack_simulator.py +17 -9
  65. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  66. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  67. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +90 -35
  68. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +4 -2
  69. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
  70. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  71. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  72. azure/ai/evaluation/simulator/_simulator.py +165 -105
  73. azure/ai/evaluation/simulator/_utils.py +31 -13
  74. azure_ai_evaluation-1.0.1.dist-info/METADATA +600 -0
  75. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +20 -0
  76. azure_ai_evaluation-1.0.1.dist-info/RECORD +119 -0
  77. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +1 -1
  78. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  79. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  80. azure_ai_evaluation-1.0.0b4.dist-info/METADATA +0 -535
  81. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
  82. /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
  83. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/rai_service.py

@@ -6,16 +6,21 @@ import importlib.metadata
 import math
 import re
 import time
+import json
+import html
 from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
+from string import Template

 import jwt

+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
+from azure.core.exceptions import HttpResponseError
 from azure.core.pipeline.policies import AsyncRetryPolicy

 from .constants import (
@@ -23,10 +28,9 @@ from .constants import (
     EvaluationMetrics,
     RAIService,
     Tasks,
-    _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type

 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -34,6 +38,39 @@ except importlib.metadata.PackageNotFoundError:
     version = "unknown"
 USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)

+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+}
+
+
+def get_formatted_template(data: dict, annotation_task: str) -> str:
+    """Given the task and input data, produce a formatted string that will serve as the main
+    payload for the RAI service. Requires specific per-task logic.
+
+    :param data: The data to incorporate into the payload.
+    :type data: dict
+    :param annotation_task: The annotation task to use. This determines the template to use.
+    :type annotation_task: str
+    :return: The formatted based on the data and task template.
+    :rtype: str
+    """
+    # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+    # JSON format manually.
+    # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+    if annotation_task == Tasks.GROUNDEDNESS:
+        as_dict = {
+            "question": data.get("query", ""),
+            "answer": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
+    as_dict = {
+        "query": html.escape(data.get("query", "")),
+        "response": html.escape(data.get("response", "")),
+    }
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+    return user_text.replace("'", '\\"')
+

 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
@@ -76,30 +113,34 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     async with get_async_http_client() as client:
         response = await client.get(svc_liveness_url, headers=headers)

-        if response.status_code != 200:
-            msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.UNKNOWN,
-                category=ErrorCategory.SERVICE_UNAVAILABLE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-
-        capabilities = response.json()
+    if response.status_code != 200:
+        msg = (
+            f"RAI service is unavailable in this region, or you lack the necessary permissions "
+            f"to access the AI project. Status Code: {response.status_code}"
+        )
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )

-        if capability and capability not in capabilities:
-            msg = f"Capability '{capability}' is not available in this region"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.RAI_CLIENT,
-                category=ErrorCategory.SERVICE_UNAVAILABLE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+    capabilities = response.json()
+    if capability and capability not in capabilities:
+        msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )


-def generate_payload(normalized_user_text: str, metric: str) -> Dict:
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
     """Generate the payload for the annotation request

     :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
@@ -107,19 +148,18 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
         in the payload.
     :type metric: str
+    :param annotation_task: The annotation task to be passed to service
+    :type annotation_task: str
     :return: The payload for the annotation request.
     :rtype: Dict
     """
     include_metric = True
-    task = Tasks.CONTENT_HARM
+    task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
-        task = Tasks.PROTECTED_MATERIAL
         include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
-        task = _InternalAnnotationTasks.ECI
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
-        task = Tasks.XPIA
         include_metric = False
     return (
         {
@@ -135,25 +175,24 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )


-async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
+async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID

-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
     :type rai_svc_url: str
     :param token: The Azure authentication token.
     :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
     :return: The operation ID.
     :rtype: str
     """
-    user_text = f"<Human>{query}</><System>{response}</>"
-    normalized_user_text = user_text.replace("'", '\\"')
-    payload = generate_payload(normalized_user_text, metric)
+    normalized_user_text = get_formatted_template(data, annotation_task)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)

     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)
@@ -164,7 +203,6 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
     if http_response.status_code != 202:
         print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
         http_response.raise_for_status()
-
     result = http_response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id
@@ -208,7 +246,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential


 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
-    batch_response: List[Dict], metric_name: str
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
 ) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.

@@ -216,56 +254,87 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
     # non-numeric metrics
-    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
-        if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    }:
+        result = {}
+        if not batch_response or len(batch_response[0]) == 0:
+            return {}
+        if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+            pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+            for pm_metric_name in pm_metric_names:
+                response = batch_response[0][pm_metric_name]
+                response = response.replace("false", "False")
+                response = response.replace("true", "True")
+                parsed_response = literal_eval(response)
+                result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+                result[pm_metric_name + "_reason"] = (
+                    parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+                )
+            return result
+        if metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
         response = response.replace("false", "False")
         response = response.replace("true", "True")
         parsed_response = literal_eval(response)
-        result = {}
         # Use label instead of score since these are assumed to be boolean results.
         # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
-        result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+        result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
             # if present else set them to math.nan
-            result[metric_name + "_manipulated_content"] = (
+            result[metric_display_name + "_manipulated_content"] = (
                 parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
             )
-            result[metric_name + "_intrusion"] = (
+            result[metric_display_name + "_intrusion"] = (
                 parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
             )
-            result[metric_name + "_information_gathering"] = (
+            result[metric_display_name + "_information_gathering"] = (
                 parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
             )
         return result
-    return _parse_content_harm_response(batch_response, metric_name)
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)


-def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.

     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
-    key = metric_name
+    key = metric_name if metric_display_name is None else metric_display_name
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS

-    result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": ""}
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }

     response = batch_response[0]
     if metric_name not in response:
@@ -283,7 +352,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:

         # get content harm metric_value
         if "label" in harm_response:
-            metric_value = harm_response["label"]
+            metric_value = float(harm_response["label"])
         elif "valid" in harm_response:
             metric_value = 0 if harm_response["valid"] else math.nan
         else:
@@ -314,6 +383,8 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
         reason = ""

     harm_score = metric_value
+    if metric_value == "n/a":
+        return result
     if not math.isnan(metric_value):
         # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
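
The reworked parse_response above now expands protected-material annotations into per-category keys. A rough harness for that branch is sketched below, assuming azure-ai-evaluation 1.0.1 is installed and that the private import paths match this diff; the mocked service response is invented for illustration and is not real service output.

# Hedged sketch: drive the protected-material branch of parse_response with a mocked response.
# Internal imports below are taken from this diff; they are not part of the public API surface.
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._common.rai_service import parse_response

# Invented example of the per-category JSON strings the parser expects for protected material.
mock_batch_response = [
    {
        "artwork": '{"label": false, "reasoning": "No known artwork detected."}',
        "fictional_characters": '{"label": false, "reasoning": "No fictional characters detected."}',
        "logos_and_brands": '{"label": true, "reasoning": "A brand logo is referenced."}',
    }
]

result = parse_response(mock_batch_response, EvaluationMetrics.PROTECTED_MATERIAL)
print(result)
# Expected shape, per the branch above:
# {"artwork_label": False, "artwork_reason": "...", "fictional_characters_label": False, ...}
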
@@ -347,13 +418,17 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: str
     )

     if response.status_code != 200:
-        msg = "Failed to retrieve the discovery service URL."
+        msg = (
+            f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+            f"and make sure you have the necessary access permissions. "
+            f"Status code: {response.status_code}."
+        )
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.RAI_CLIENT,
-            category=ErrorCategory.SERVICE_UNAVAILABLE,
-            blame=ErrorBlame.UNKNOWN,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.PROJECT_ACCESS_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
         )

     base_url = urlparse(response.json()["properties"]["discoveryUrl"])
@@ -412,14 +487,17 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str]


 async def evaluate_with_rai_service(
-    query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+    data: dict,
+    metric_name: str,
+    project_scope: AzureAIProject,
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
 ) -> Dict[str, Union[str, float]]:
     """ "Evaluate the content safety of the response using Responsible AI service

-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -427,6 +505,10 @@ async def evaluate_with_rai_service(
     :param credential: The Azure authentication credential.
     :type credential:
         ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
@@ -434,11 +516,117 @@ async def evaluate_with_rai_service(
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)

     # Submit annotation request and fetch result
-    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
     annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name)
+    result = parse_response(annotation_response, metric_name, metric_display_name)
+
+    return result
+
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## handle json payload and payload from inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## calling rai service for annotation
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    result = response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+

+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """ "Evaluate the content safety of the response using Responsible AI service
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    # Submit annotation request and fetch result
+    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name)
     return result
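
Finally, a hedged usage sketch of the reworked single-turn path: it assumes azure-ai-evaluation 1.0.1 and azure-identity are installed, assumes the AzureAIProject scope keys are subscription_id, resource_group_name, and project_name, and calls into the private rai_service module shown in this diff, so treat it as orientation rather than a supported API.

# Hedged sketch of calling the reworked evaluate_with_rai_service (private module; may change).
# Project scope values are placeholders; the scope keys are an assumption about AzureAIProject.
import asyncio

from azure.identity import DefaultAzureCredential

from azure.ai.evaluation._common.constants import EvaluationMetrics, Tasks
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service


async def main() -> None:
    project_scope = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<ai-project-name>",
    }
    result = await evaluate_with_rai_service(
        data={"query": "What is the capital of France?", "response": "Paris."},
        metric_name=EvaluationMetrics.HATE_FAIRNESS,  # results come back keyed as hate_unfairness_* per the fix above
        project_scope=project_scope,
        credential=DefaultAzureCredential(),
        annotation_task=Tasks.CONTENT_HARM,  # the default, shown here for clarity
    )
    print(result)
    # Expected keys, per _parse_content_harm_response above:
    # hate_unfairness, hate_unfairness_score, hate_unfairness_reason


if __name__ == "__main__":
    asyncio.run(main())
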