azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (79)
  1. azure/ai/evaluation/__init__.py +22 -0
  2. azure/ai/evaluation/_common/constants.py +5 -0
  3. azure/ai/evaluation/_common/math.py +11 -0
  4. azure/ai/evaluation/_common/rai_service.py +172 -35
  5. azure/ai/evaluation/_common/utils.py +162 -23
  6. azure/ai/evaluation/_constants.py +6 -6
  7. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  8. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
  9. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
  10. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  11. azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
  12. azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
  13. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
  14. azure/ai/evaluation/_evaluate/_utils.py +40 -7
  15. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
  17. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  18. azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
  19. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
  20. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
  29. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  33. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  34. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  35. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  36. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  37. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  38. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  43. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
  44. azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
  45. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
  46. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
  48. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  49. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  50. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  51. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  52. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
  53. azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
  54. azure/ai/evaluation/_exceptions.py +17 -0
  55. azure/ai/evaluation/_model_configurations.py +18 -1
  56. azure/ai/evaluation/_version.py +1 -1
  57. azure/ai/evaluation/simulator/__init__.py +2 -1
  58. azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  59. azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
  60. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  61. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  62. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  63. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  64. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  65. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
  66. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  69. azure/ai/evaluation/simulator/_simulator.py +115 -61
  70. azure/ai/evaluation/simulator/_utils.py +6 -6
  71. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
  72. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
  73. azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
  74. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  76. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
  77. /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
  78. /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
  79. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/__init__.py

@@ -12,10 +12,19 @@ from ._evaluators._content_safety import (
     SexualEvaluator,
     ViolenceEvaluator,
 )
+from ._evaluators._multimodal._content_safety_multimodal import (
+    ContentSafetyMultimodalEvaluator,
+    HateUnfairnessMultimodalEvaluator,
+    SelfHarmMultimodalEvaluator,
+    SexualMultimodalEvaluator,
+    ViolenceMultimodalEvaluator,
+)
+from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
 from ._evaluators._f1_score import F1ScoreEvaluator
 from ._evaluators._fluency import FluencyEvaluator
 from ._evaluators._gleu import GleuScoreEvaluator
 from ._evaluators._groundedness import GroundednessEvaluator
+from ._evaluators._service_groundedness import GroundednessProEvaluator
 from ._evaluators._meteor import MeteorScoreEvaluator
 from ._evaluators._protected_material import ProtectedMaterialEvaluator
 from ._evaluators._qa import QAEvaluator
@@ -27,7 +36,10 @@ from ._evaluators._xpia import IndirectAttackEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
+    Conversation,
+    EvaluationResult,
     EvaluatorConfig,
+    Message,
     OpenAIModelConfiguration,
 )
 
@@ -37,6 +49,7 @@ __all__ = [
     "F1ScoreEvaluator",
     "FluencyEvaluator",
     "GroundednessEvaluator",
+    "GroundednessProEvaluator",
     "RelevanceEvaluator",
     "SimilarityEvaluator",
     "QAEvaluator",
@@ -57,4 +70,13 @@ __all__ = [
     "AzureOpenAIModelConfiguration",
     "OpenAIModelConfiguration",
     "EvaluatorConfig",
+    "Conversation",
+    "Message",
+    "EvaluationResult",
+    "ContentSafetyMultimodalEvaluator",
+    "HateUnfairnessMultimodalEvaluator",
+    "SelfHarmMultimodalEvaluator",
+    "SexualMultimodalEvaluator",
+    "ViolenceMultimodalEvaluator",
+    "ProtectedMaterialMultimodalEvaluator",
 ]
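The public surface in 1.0.0b5 therefore gains the multimodal content-safety evaluators, GroundednessProEvaluator, and the Conversation/Message/EvaluationResult types. A minimal sketch of importing the new exports follows; the constructor arguments shown are an assumption (the multimodal evaluators are expected to follow the credential-plus-azure_ai_project pattern of the existing content-safety evaluators, which this hunk does not show):

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import (
    ContentSafetyMultimodalEvaluator,  # new in 1.0.0b5
    GroundednessProEvaluator,          # new in 1.0.0b5
    Conversation,                      # new types also exported in 1.0.0b5
    Message,
)

# Placeholder project details; the key names follow the AzureAIProject TypedDict.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

# Assumed constructor shape, mirroring the existing content-safety evaluators.
safety_eval = ContentSafetyMultimodalEvaluator(
    credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project
)
```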
azure/ai/evaluation/_common/constants.py

@@ -6,6 +6,9 @@ from enum import Enum
 from azure.core import CaseInsensitiveEnumMeta
 
 
+PROMPT_BASED_REASON_EVALUATORS = ["coherence", "relevance", "retrieval", "groundedness", "fluency"]
+
+
 class CommonConstants:
     """Define common constants."""
 
@@ -35,6 +38,7 @@ class Tasks:
     CONTENT_HARM = "content harm"
     PROTECTED_MATERIAL = "protected material"
     XPIA = "xpia"
+    GROUNDEDNESS = "groundedness"
 
 
 class _InternalAnnotationTasks:
@@ -56,6 +60,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     SEXUAL = "sexual"
     PROTECTED_MATERIAL = "protected_material"
     XPIA = "xpia"
+    GROUNDEDNESS = "generic_groundedness"
 
 
 class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
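Note that the new annotation task and the new metric use different strings: Tasks.GROUNDEDNESS is "groundedness" while EvaluationMetrics.GROUNDEDNESS is "generic_groundedness". A small illustrative check against the internal module listed above:

```python
# Illustrative only; these are internal constants of azure-ai-evaluation.
from azure.ai.evaluation._common.constants import EvaluationMetrics, Tasks

assert Tasks.GROUNDEDNESS == "groundedness"                      # annotation task sent to the service
assert EvaluationMetrics.GROUNDEDNESS == "generic_groundedness"  # metric name used in payload parsing
```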
azure/ai/evaluation/_common/math.py

@@ -5,6 +5,8 @@
 import math
 from typing import List
 
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+
 
 def list_sum(lst: List[float]) -> float:
     return sum(lst)
@@ -15,4 +17,13 @@ def list_mean(lst: List[float]) -> float:
 
 
 def list_mean_nan_safe(lst: List[float]) -> float:
+    msg = "All score values are NaN. The mean cannot be calculated."
+    if all(math.isnan(l) for l in lst):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.CONVERSATION,
+        )
     return list_mean([l for l in lst if not math.isnan(l)])
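The change to list_mean_nan_safe is behavioral: NaN entries are still dropped before averaging, but an all-NaN list now raises EvaluationException as a user error instead of propagating NaN. A quick standalone sketch of the intended behavior:

```python
import math

def nan_safe_mean(scores):
    # Same idea as list_mean_nan_safe: drop NaNs, but refuse an all-NaN input.
    valid = [s for s in scores if not math.isnan(s)]
    if not valid:
        raise ValueError("All score values are NaN. The mean cannot be calculated.")
    return sum(valid) / len(valid)

print(nan_safe_mean([1.0, float("nan"), 3.0]))  # 2.0
# nan_safe_mean([float("nan")]) now fails loudly, as the library does with EvaluationException.
```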
azure/ai/evaluation/_common/rai_service.py

@@ -9,13 +9,16 @@ import time
 from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
+from string import Template
 
 import jwt
 
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
+from azure.core.exceptions import HttpResponseError
 from azure.core.pipeline.policies import AsyncRetryPolicy
 
 from .constants import (
@@ -23,10 +26,9 @@ from .constants import (
     EvaluationMetrics,
     RAIService,
     Tasks,
-    _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type
 
 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -34,6 +36,11 @@ except importlib.metadata.PackageNotFoundError:
     version = "unknown"
 USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+    Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+}
+
 
 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
@@ -99,7 +106,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     )
 
 
-def generate_payload(normalized_user_text: str, metric: str) -> Dict:
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
     """Generate the payload for the annotation request
 
     :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
@@ -107,19 +114,18 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
         in the payload.
     :type metric: str
+    :param annotation_task: The annotation task to be passed to service
+    :type annotation_task: str
     :return: The payload for the annotation request.
     :rtype: Dict
     """
     include_metric = True
-    task = Tasks.CONTENT_HARM
+    task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
-        task = Tasks.PROTECTED_MATERIAL
         include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
-        task = _InternalAnnotationTasks.ECI
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
-        task = Tasks.XPIA
         include_metric = False
     return (
         {
@@ -135,25 +141,25 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )
 
 
-async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
+async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
     :type rai_svc_url: str
     :param token: The Azure authentication token.
     :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
     :return: The operation ID.
     :rtype: str
     """
-    user_text = f"<Human>{query}</><System>{response}</>"
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data)
     normalized_user_text = user_text.replace("'", '\\"')
-    payload = generate_payload(normalized_user_text, metric)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
 
     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)
@@ -164,7 +170,6 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: st
     if http_response.status_code != 202:
         print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
         http_response.raise_for_status()
-
     result = http_response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id
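submit_request now renders the user text from a data dict through string.Template, keyed by annotation task, which is what lets groundedness annotations carry a context field alongside query and response. A standalone sketch of the substitution using only the standard library:

```python
from string import Template

# Mirrors USER_TEXT_TEMPLATE_DICT; "groundedness" is the Tasks.GROUNDEDNESS string.
templates = {
    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
    "groundedness": Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
}

data = {
    "query": "What is the capital of France?",
    "response": "Paris.",
    "context": "Paris is the capital and largest city of France.",
}

# Equivalent to USER_TEXT_TEMPLATE_DICT.get(annotation_task, ...).substitute(**data)
user_text = templates.get("groundedness", templates["DEFAULT"]).substitute(**data)
print(user_text)
# {"question": "What is the capital of France?", "answer": "Paris.", "context": "Paris is the capital and largest city of France."}
```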
@@ -208,7 +213,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
 
 
 def parse_response( # pylint: disable=too-many-branches,too-many-statements
-    batch_response: List[Dict], metric_name: str
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
 ) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
@@ -216,11 +221,20 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
     # non-numeric metrics
-    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    }:
         if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
@@ -230,42 +244,50 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
         result = {}
         # Use label instead of score since these are assumed to be boolean results.
         # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
-        result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+        result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
 
         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
             # if present else set them to math.nan
-            result[metric_name + "_manipulated_content"] = (
+            result[metric_display_name + "_manipulated_content"] = (
                 parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
             )
-            result[metric_name + "_intrusion"] = (
+            result[metric_display_name + "_intrusion"] = (
                 parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
             )
-            result[metric_name + "_information_gathering"] = (
+            result[metric_display_name + "_information_gathering"] = (
                 parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
            )
         return result
-    return _parse_content_harm_response(batch_response, metric_name)
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
 
 
-def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
-    key = metric_name
+    key = metric_name if metric_display_name is None else metric_display_name
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS
 
-    result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": ""}
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }
 
     response = batch_response[0]
     if metric_name not in response:
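parse_response and _parse_content_harm_response now accept an optional metric_display_name, so the keys of the parsed result can differ from the raw service metric name. A hypothetical call shape (the display name here is illustrative, not one the library defines):

```python
# Hypothetical: result keys are prefixed with the display name when one is given.
result = parse_response(
    annotation_response,                 # batch response returned by the RAI service
    metric_name=EvaluationMetrics.XPIA,
    metric_display_name="indirect_attack",
)
# Expected keys: "indirect_attack_label", "indirect_attack_reason",
# "indirect_attack_manipulated_content", "indirect_attack_intrusion",
# "indirect_attack_information_gathering"
```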
@@ -283,7 +305,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
 
     # get content harm metric_value
     if "label" in harm_response:
-        metric_value = harm_response["label"]
+        metric_value = float(harm_response["label"])
     elif "valid" in harm_response:
         metric_value = 0 if harm_response["valid"] else math.nan
     else:
@@ -314,6 +336,8 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         reason = ""
 
     harm_score = metric_value
+    if metric_value == "n/a":
+        return result
     if not math.isnan(metric_value):
         # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
@@ -412,14 +436,17 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str]
 
 
 async def evaluate_with_rai_service(
-    query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+    data: dict,
+    metric_name: str,
+    project_scope: AzureAIProject,
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
 ) -> Dict[str, Union[str, float]]:
     """ "Evaluate the content safety of the response using Responsible AI service
 
-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -427,6 +454,10 @@ async def evaluate_with_rai_service(
     :param credential: The Azure authentication credential.
     :type credential:
         ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
     :return: The parsed annotation result.
     :rtype: Dict[str, Union[str, float]]
     """
@@ -434,11 +465,117 @@
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)
 
     # Submit annotation request and fetch result
-    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
     annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name)
+    result = parse_response(annotation_response, metric_name, metric_display_name)
+
+    return result
+
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## handle json payload and payload from inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## calling rai service for annotation
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    result = response.json()
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
 
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """ "Evaluate the content safety of the response using Responsible AI service
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+    # Submit annotation request and fetch result
+    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name)
     return result
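The new multimodal path takes a list of chat-style messages (dicts, or azure-ai-inference ChatRequestMessage objects), drops system messages, and derives the content type from the assistant messages before building the payload. A sketch of the message shape and the resulting payload; the image content block follows the OpenAI-style schema and its values are placeholders:

```python
messages = [
    {"role": "system", "content": "You are a helpful assistant."},  # filtered out before submission
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
        ],
    },
    {"role": "assistant", "content": "The image shows a cat sitting on a windowsill."},
]

# generate_payload_multimodal(content_type, filtered_messages, "violence") then yields:
# {
#     "ContentType": content_type,              # from retrieve_content_type(assistant_messages, metric)
#     "Contents": [{"messages": filtered_messages}],
#     "AnnotationTask": "content harm",         # Tasks.CONTENT_HARM
#     "MetricList": ["violence"],
# }
```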