azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This release of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (93)
  1. azure/ai/evaluation/__init__.py +23 -1
  2. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
  3. azure/ai/evaluation/_common/constants.py +9 -2
  4. azure/ai/evaluation/_common/math.py +29 -0
  5. azure/ai/evaluation/_common/rai_service.py +222 -93
  6. azure/ai/evaluation/_common/utils.py +328 -19
  7. azure/ai/evaluation/_constants.py +16 -8
  8. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  9. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
  10. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
  11. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  13. azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
  14. azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
  16. azure/ai/evaluation/_evaluate/_utils.py +77 -25
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
  19. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  20. azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
  21. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
  22. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
  24. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
  25. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
  26. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
  27. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
  28. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
  29. azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
  30. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  31. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
  32. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  33. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  34. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
  35. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  38. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  43. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  44. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  45. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  46. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
  47. azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
  48. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
  49. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  50. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
  51. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  52. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  53. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  55. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
  56. azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
  57. azure/ai/evaluation/_exceptions.py +26 -6
  58. azure/ai/evaluation/_http_utils.py +203 -132
  59. azure/ai/evaluation/_model_configurations.py +23 -6
  60. azure/ai/evaluation/_vendor/__init__.py +3 -0
  61. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  62. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  63. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  64. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  65. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  66. azure/ai/evaluation/_version.py +1 -1
  67. azure/ai/evaluation/simulator/__init__.py +2 -1
  68. azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  69. azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
  70. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  71. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  72. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  73. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  74. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  75. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  76. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
  78. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  80. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  81. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  82. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
  83. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  84. azure/ai/evaluation/simulator/_simulator.py +222 -169
  85. azure/ai/evaluation/simulator/_tracing.py +4 -4
  86. azure/ai/evaluation/simulator/_utils.py +6 -6
  87. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
  88. azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
  89. azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
  90. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
  91. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  92. azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
  93. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/rai_service.py
@@ -3,30 +3,32 @@
 # ---------------------------------------------------------
 import asyncio
 import importlib.metadata
+import math
 import re
 import time
 from ast import literal_eval
-from typing import Dict, List
+from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
+from string import Template

 import jwt
-import numpy as np

+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._http_utils import get_async_http_client
+from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
-from azure.identity import DefaultAzureCredential
+from azure.core.exceptions import HttpResponseError
+from azure.core.pipeline.policies import AsyncRetryPolicy

 from .constants import (
     CommonConstants,
     EvaluationMetrics,
     RAIService,
     Tasks,
-    _InternalAnnotationTasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type

 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -34,6 +36,11 @@ except importlib.metadata.PackageNotFoundError:
     version = "unknown"
 USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)

+USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+    Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+}
+

 def get_common_headers(token: str) -> Dict:
     """Get common headers for the HTTP request
@@ -53,7 +60,13 @@ def get_common_headers(token: str) -> Dict:
     }


-async def ensure_service_availability(rai_svc_url: str, token: str, capability: str = None) -> None:
+def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
+    return get_async_http_client().with_policies(
+        retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
+    )
+
+
+async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
     """Check if the Responsible AI service is available in the region and has the required capability, if relevant.

     :param rai_svc_url: The Responsible AI service URL.
@@ -68,9 +81,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     svc_liveness_url = rai_svc_url + "/checkannotation"

     async with get_async_http_client() as client:
-        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
-            svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+        response = await client.get(svc_liveness_url, headers=headers)

     if response.status_code != 200:
         msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
@@ -95,7 +106,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
         )


-def generate_payload(normalized_user_text: str, metric: str) -> Dict:
+def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
     """Generate the payload for the annotation request

     :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
@@ -103,19 +114,18 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
         in the payload.
     :type metric: str
+    :param annotation_task: The annotation task to be passed to service
+    :type annotation_task: str
     :return: The payload for the annotation request.
     :rtype: Dict
     """
     include_metric = True
-    task = Tasks.CONTENT_HARM
+    task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
-        task = Tasks.PROTECTED_MATERIAL
         include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
-        task = _InternalAnnotationTasks.ECI
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
-        task = Tasks.XPIA
         include_metric = False
     return (
         {
@@ -131,39 +141,36 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
     )


-async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
+async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID

-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
     :param metric: The evaluation metric to use.
     :type metric: str
     :param rai_svc_url: The Responsible AI service URL.
     :type rai_svc_url: str
     :param token: The Azure authentication token.
     :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
     :return: The operation ID.
     :rtype: str
     """
-    user_text = f"<Human>{query}</><System>{response}</>"
+    user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data)
     normalized_user_text = user_text.replace("'", '\\"')
-    payload = generate_payload(normalized_user_text, metric)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)

     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)

-    async with get_async_http_client() as client:
-        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
-            url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
-
-        if response.status_code != 202:
-            print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text))
-            response.raise_for_status()
+    async with get_async_http_client_with_timeout() as client:
+        http_response = await client.post(url, json=payload, headers=headers)

-        result = response.json()
+    if http_response.status_code != 202:
+        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
+        http_response.raise_for_status()
+    result = http_response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id

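Aside: the new USER_TEXT_TEMPLATE_DICT is what submit_request now uses to build the "UserTextList" entry. The groundedness task gets a JSON-shaped template, and every other annotation task falls back to the DEFAULT <Human>/<System> wrapper. A minimal standalone sketch of that lookup, where plain strings stand in for the SDK's Tasks constants and the sample data is made up:

from string import Template

# Standalone sketch of the template lookup shown in this diff; "groundedness"
# stands in for Tasks.GROUNDEDNESS and the sample values below are invented.
USER_TEXT_TEMPLATE_DICT = {
    "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
    "groundedness": Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
}

def build_user_text(annotation_task: str, data: dict) -> str:
    # Unknown tasks fall back to the DEFAULT wrapper, mirroring submit_request.
    template = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"])
    return template.substitute(**data)

print(build_user_text("content harm", {"query": "Hi", "response": "Hello there"}))
# -> <Human>{Hi}</><System>{Hello there}</>
print(build_user_text("groundedness", {"query": "Q", "response": "A", "context": "C"}))
# -> {"question": "Q", "answer": "A", "context": "C"}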
@@ -190,10 +197,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
     token = await fetch_or_reuse_token(credential, token)
     headers = get_common_headers(token)

-    async with get_async_http_client() as client:
-        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
-            url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(url, headers=headers)

         if response.status_code == 200:
             return response.json()
@@ -208,19 +213,28 @@


 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
-    batch_response: List[Dict], metric_name: str
-) -> Dict:
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.

     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
-    :rtype: List[List[Dict]]
+    :rtype: Dict[str, Union[str, float]]
     """
+    if metric_display_name is None:
+        metric_display_name = metric_name
+
     # non-numeric metrics
-    if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
+    if metric_name in {
+        EvaluationMetrics.PROTECTED_MATERIAL,
+        _InternalEvaluationMetrics.ECI,
+        EvaluationMetrics.XPIA,
+    }:
         if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
             return {}
         response = batch_response[0][metric_name]
@@ -229,43 +243,51 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         parsed_response = literal_eval(response)
         result = {}
         # Use label instead of score since these are assumed to be boolean results.
-        # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
-        result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+        # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+        result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
-            # if present else set them to np.nan
-            result[metric_name + "_manipulated_content"] = (
-                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
+            # if present else set them to math.nan
+            result[metric_display_name + "_manipulated_content"] = (
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
             )
-            result[metric_name + "_intrusion"] = (
-                parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
+            result[metric_display_name + "_intrusion"] = (
+                parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
             )
-            result[metric_name + "_information_gathering"] = (
-                parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
+            result[metric_display_name + "_information_gathering"] = (
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
             )
         return result
-    return _parse_content_harm_response(batch_response, metric_name)
+    return _parse_content_harm_response(batch_response, metric_name, metric_display_name)


-def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
+def _parse_content_harm_response(
+    batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.

     :param batch_response: The annotation response from Responsible AI service.
     :type batch_response: List[Dict]
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
+    :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+    :type metric_display_name: Optional[str]
     :return: The parsed annotation result.
-    :rtype: List[List[Dict]]
+    :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
-    key = metric_name
+    key = metric_name if metric_display_name is None else metric_display_name
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS

-    result = {key: np.nan, key + "_score": np.nan, key + "_reason": ""}
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }

     response = batch_response[0]
     if metric_name not in response:
@@ -283,11 +305,11 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -

     # get content harm metric_value
     if "label" in harm_response:
-        metric_value = harm_response["label"]
+        metric_value = float(harm_response["label"])
     elif "valid" in harm_response:
-        metric_value = 0 if harm_response["valid"] else np.nan
+        metric_value = 0 if harm_response["valid"] else math.nan
     else:
-        metric_value = np.nan
+        metric_value = math.nan

     # get reason
     if "reasoning" in harm_response:
@@ -301,21 +323,23 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         if metric_value_match:
             metric_value = int(metric_value_match[0])
         else:
-            metric_value = np.nan
+            metric_value = math.nan
         reason = harm_response
     elif harm_response != "" and isinstance(harm_response, (int, float)):
         if 0 < harm_response <= 7:
             metric_value = harm_response
         else:
-            metric_value = np.nan
+            metric_value = math.nan
         reason = ""
     else:
-        metric_value = np.nan
+        metric_value = math.nan
         reason = ""

     harm_score = metric_value
-    if not np.isnan(metric_value):
-        # int(np.nan) causes a value error, and np.nan is already handled
+    if metric_value == "n/a":
+        return result
+    if not math.isnan(metric_value):
+        # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
         harm_score = int(metric_value)
     result[key] = get_harm_severity_level(harm_score)
@@ -337,14 +361,13 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     """
     headers = get_common_headers(token)

-    async with get_async_http_client() as client:
-        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(
             f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
             f"resourceGroups/{azure_ai_project['resource_group_name']}/"
             f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
             f"api-version=2023-08-01-preview",
             headers=headers,
-            timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT,
         )

     if response.status_code != 200:
@@ -361,7 +384,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     return f"{base_url.scheme}://{base_url.netloc}"


-async def get_rai_svc_url(project_scope: dict, token: str) -> str:
+async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
     """Get the Responsible AI service URL

     :param project_scope: The Azure AI project scope details.
@@ -385,7 +408,7 @@ async def get_rai_svc_url(project_scope: dict, token: str) -> str:
     return rai_url


-async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str:
+async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
     """Get token. Fetch a new token if the current token is near expiry

     :param credential: The Azure authentication credential.
@@ -395,35 +418,147 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -
     :type token: str
     :return: The Azure authentication token.
     """
-    acquire_new_token = True
-    try:
-        if token:
-            # Decode the token to get its expiration time
+    if token:
+        # Decode the token to get its expiration time
+        try:
             decoded_token = jwt.decode(token, options={"verify_signature": False})
+        except jwt.PyJWTError:
+            pass
+        else:
             exp_time = decoded_token["exp"]
             current_time = time.time()

-            # Check if the token is near expiry
+            # Return current token if not near expiry
             if (exp_time - current_time) >= 300:
-                acquire_new_token = False
-    except Exception:  # pylint: disable=broad-exception-caught
-        pass
+                return token

-    if acquire_new_token:
-        token = credential.get_token("https://management.azure.com/.default").token
-
-    return token
+    return credential.get_token("https://management.azure.com/.default").token


 async def evaluate_with_rai_service(
-    query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
-):
+    data: dict,
+    metric_name: str,
+    project_scope: AzureAIProject,
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
+) -> Dict[str, Union[str, float]]:
     """ "Evaluate the content safety of the response using Responsible AI service

-    :param query: The query to evaluate.
-    :type query: str
-    :param response: The response to evaluate.
-    :type response: str
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: Dict
+    :param credential: The Azure authentication credential.
+    :type credential:
+        ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
+    """
+
+    # Get RAI service URL from discovery service and check service availability
+    token = await fetch_or_reuse_token(credential)
+    rai_svc_url = await get_rai_svc_url(project_scope, token)
+    await ensure_service_availability(rai_svc_url, token, annotation_task)
+
+    # Submit annotation request and fetch result
+    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+    result = parse_response(annotation_response, metric_name, metric_display_name)
+
+    return result
+
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Generate the payload for the annotation request
+    :param content_type: The type of the content representing multimodal or images.
+    :type content_type: str
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+        in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    include_metric = True
+    task = Tasks.CONTENT_HARM
+    if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+        task = Tasks.PROTECTED_MATERIAL
+        include_metric = False
+
+    if include_metric:
+        return {
+            "ContentType": content_type,
+            "Contents": [{"messages": messages}],
+            "AnnotationTask": task,
+            "MetricList": [metric],
+        }
+    return {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": task,
+    }
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+    :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+    :type messages: str
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID.
+    :rtype: str
+    """
+    ## handle json payload and payload from inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+    ## calling rai service for annotation
+    url = rai_svc_url + "/submitannotation"
+    headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            url, json=payload, headers=headers
+        )
+        if response.status_code != 202:
+            raise HttpResponseError(
+                message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+            )
+        result = response.json()
+        operation_id = result["location"].split("/")[-1]
+        return operation_id
+
+
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """ "Evaluate the content safety of the response using Responsible AI service
+    :param messages: The normalized list of messages.
+    :type messages: str
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :param project_scope: The Azure AI project scope details.
@@ -434,19 +569,13 @@ async def evaluate_with_rai_service(
     :return: The parsed annotation result.
     :rtype: List[List[Dict]]
     """
-    # Use DefaultAzureCredential if no credential is provided
-    # This is for the for batch run scenario as the credential cannot be serialized by promoptflow
-    if credential is None or credential == {}:
-        credential = DefaultAzureCredential()

     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
     rai_svc_url = await get_rai_svc_url(project_scope, token)
     await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
-
     # Submit annotation request and fetch result
-    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
-    annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
+    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
     result = parse_response(annotation_response, metric_name)
-
     return result
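For callers tracking the API change above: in 1.0.0b3, evaluate_with_rai_service took query and response strings and silently fell back to DefaultAzureCredential when no credential was passed; in 1.0.0b5 it takes a data dict plus an annotation_task and requires an explicit credential. A hypothetical caller, sketched against the private module path shown in this diff, with placeholder project values and an assumed metric string:

import asyncio

from azure.identity import DefaultAzureCredential

# Private module; imported here only to illustrate the new call shape.
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service

async def main() -> None:
    project_scope = {  # keys match the discovery-URL lookup earlier in this diff
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<ai-project-name>",
    }
    result = await evaluate_with_rai_service(
        data={"query": "How do I reset my password?", "response": "Open settings and choose Reset."},
        metric_name="violence",  # assumed metric string; the SDK passes EvaluationMetrics constants here
        project_scope=project_scope,
        credential=DefaultAzureCredential(),
        # annotation_task defaults to Tasks.CONTENT_HARM, preserving the old query/response behaviour
    )
    print(result)  # e.g. {"violence": <severity>, "violence_score": <0-7>, "violence_reason": "..."}

if __name__ == "__main__":
    asyncio.run(main())

The multimodal path added in this release (evaluate_with_rai_service_multimodal) follows the same flow but takes a list of chat messages instead of a data dict.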