azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b2__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (105)
  1. azure/ai/evaluation/__init__.py +5 -31
  2. azure/ai/evaluation/_common/constants.py +2 -9
  3. azure/ai/evaluation/_common/rai_service.py +120 -300
  4. azure/ai/evaluation/_common/utils.py +23 -381
  5. azure/ai/evaluation/_constants.py +6 -19
  6. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
  7. azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +7 -23
  8. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +17 -33
  9. azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/proxy_client.py +4 -32
  10. azure/ai/evaluation/_evaluate/_eval_run.py +24 -81
  11. azure/ai/evaluation/_evaluate/_evaluate.py +239 -393
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +17 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +28 -82
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +18 -17
  15. azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
  16. azure/ai/evaluation/_evaluators/_chat/_chat.py +357 -0
  17. azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
  18. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +157 -0
  19. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  20. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +88 -78
  21. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
  22. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +67 -105
  24. azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +34 -24
  25. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +301 -0
  26. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
  27. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
  28. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
  29. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
  30. azure/ai/evaluation/_evaluators/_eci/_eci.py +54 -44
  31. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +19 -34
  32. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +89 -76
  33. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
  34. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +16 -14
  35. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +87 -113
  36. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  37. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -20
  38. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
  39. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  40. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +30 -23
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +96 -84
  43. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -26
  45. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +38 -53
  46. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +105 -91
  48. azure/ai/evaluation/_exceptions.py +7 -28
  49. azure/ai/evaluation/_http_utils.py +132 -203
  50. azure/ai/evaluation/_model_configurations.py +8 -104
  51. azure/ai/evaluation/_version.py +1 -1
  52. azure/ai/evaluation/simulator/__init__.py +1 -2
  53. azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
  54. azure/ai/evaluation/simulator/_adversarial_simulator.py +92 -111
  55. azure/ai/evaluation/simulator/_constants.py +1 -11
  56. azure/ai/evaluation/simulator/_conversation/__init__.py +12 -13
  57. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  58. azure/ai/evaluation/simulator/_direct_attack_simulator.py +67 -33
  59. azure/ai/evaluation/simulator/_helpers/__init__.py +2 -1
  60. azure/ai/evaluation/{_common → simulator/_helpers}/_experimental.py +9 -24
  61. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +5 -26
  62. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +94 -107
  63. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
  64. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +11 -28
  65. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +4 -8
  66. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
  67. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  68. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
  69. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
  70. azure/ai/evaluation/simulator/_simulator.py +207 -277
  71. azure/ai/evaluation/simulator/_tracing.py +4 -4
  72. azure/ai/evaluation/simulator/_utils.py +13 -31
  73. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +449 -0
  74. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +99 -0
  75. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +1 -1
  76. azure/ai/evaluation/_common/math.py +0 -89
  77. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
  78. azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
  79. azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
  80. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
  81. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
  82. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
  83. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
  84. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  85. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  86. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  87. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  88. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  89. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  90. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  91. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
  92. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
  93. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
  94. azure/ai/evaluation/_vendor/__init__.py +0 -3
  95. azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
  96. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
  97. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
  98. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
  99. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
  100. azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
  101. azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
  102. azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
  103. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
  104. azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
  105. {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_common/rai_service.py
@@ -3,34 +3,30 @@
  # ---------------------------------------------------------
  import asyncio
  import importlib.metadata
- import math
  import re
  import time
- import json
- import html
  from ast import literal_eval
- from typing import Dict, List, Optional, Union, cast
+ from typing import Dict, List
  from urllib.parse import urlparse
- from string import Template

  import jwt
+ import numpy as np

- from promptflow.core._errors import MissingRequiredPackage
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
- from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
+ from azure.ai.evaluation._http_utils import get_async_http_client
  from azure.ai.evaluation._model_configurations import AzureAIProject
  from azure.core.credentials import TokenCredential
- from azure.core.exceptions import HttpResponseError
- from azure.core.pipeline.policies import AsyncRetryPolicy
+ from azure.identity import DefaultAzureCredential

  from .constants import (
  CommonConstants,
  EvaluationMetrics,
  RAIService,
  Tasks,
+ _InternalAnnotationTasks,
  _InternalEvaluationMetrics,
  )
- from .utils import get_harm_severity_level, retrieve_content_type
+ from .utils import get_harm_severity_level

  try:
  version = importlib.metadata.version("azure-ai-evaluation")
@@ -38,39 +34,6 @@ except importlib.metadata.PackageNotFoundError:
  version = "unknown"
  USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)

- USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
- "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
- }
-
-
- def get_formatted_template(data: dict, annotation_task: str) -> str:
- """Given the task and input data, produce a formatted string that will serve as the main
- payload for the RAI service. Requires specific per-task logic.
-
- :param data: The data to incorporate into the payload.
- :type data: dict
- :param annotation_task: The annotation task to use. This determines the template to use.
- :type annotation_task: str
- :return: The formatted based on the data and task template.
- :rtype: str
- """
- # Template class doesn't play nice with json dumping/loading, just handle groundedness'
- # JSON format manually.
- # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
- if annotation_task == Tasks.GROUNDEDNESS:
- as_dict = {
- "question": data.get("query", ""),
- "answer": data.get("response", ""),
- "context": data.get("context", ""),
- }
- return json.dumps(as_dict)
- as_dict = {
- "query": html.escape(data.get("query", "")),
- "response": html.escape(data.get("response", "")),
- }
- user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
- return user_text.replace("'", '\\"')
-

  def get_common_headers(token: str) -> Dict:
  """Get common headers for the HTTP request
@@ -90,13 +53,7 @@ def get_common_headers(token: str) -> Dict:
  }


- def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
- return get_async_http_client().with_policies(
- retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
- )
-
-
- async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
+ async def ensure_service_availability(rai_svc_url: str, token: str, capability: str = None) -> None:
  """Check if the Responsible AI service is available in the region and has the required capability, if relevant.

  :param rai_svc_url: The Responsible AI service URL.
@@ -111,36 +68,34 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
  svc_liveness_url = rai_svc_url + "/checkannotation"

  async with get_async_http_client() as client:
- response = await client.get(svc_liveness_url, headers=headers)
+ response = await client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
+ svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
+ )

- if response.status_code != 200:
- msg = (
- f"RAI service is unavailable in this region, or you lack the necessary permissions "
- f"to access the AI project. Status Code: {response.status_code}"
- )
- raise EvaluationException(
- message=msg,
- internal_message=msg,
- target=ErrorTarget.RAI_CLIENT,
- category=ErrorCategory.SERVICE_UNAVAILABLE,
- blame=ErrorBlame.USER_ERROR,
- tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
- )
+ if response.status_code != 200:
+ msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
+ raise EvaluationException(
+ message=msg,
+ internal_message=msg,
+ target=ErrorTarget.UNKNOWN,
+ category=ErrorCategory.SERVICE_UNAVAILABLE,
+ blame=ErrorBlame.USER_ERROR,
+ )

- capabilities = response.json()
- if capability and capability not in capabilities:
- msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
- raise EvaluationException(
- message=msg,
- internal_message=msg,
- target=ErrorTarget.RAI_CLIENT,
- category=ErrorCategory.SERVICE_UNAVAILABLE,
- blame=ErrorBlame.USER_ERROR,
- tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
- )
+ capabilities = response.json()

+ if capability and capability not in capabilities:
+ msg = f"Capability '{capability}' is not available in this region"
+ raise EvaluationException(
+ message=msg,
+ internal_message=msg,
+ target=ErrorTarget.RAI_CLIENT,
+ category=ErrorCategory.SERVICE_UNAVAILABLE,
+ blame=ErrorBlame.USER_ERROR,
+ )

- def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
+
+ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
  """Generate the payload for the annotation request

  :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
@@ -148,18 +103,19 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
  :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
  in the payload.
  :type metric: str
- :param annotation_task: The annotation task to be passed to service
- :type annotation_task: str
  :return: The payload for the annotation request.
  :rtype: Dict
  """
  include_metric = True
- task = annotation_task
+ task = Tasks.CONTENT_HARM
  if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+ task = Tasks.PROTECTED_MATERIAL
  include_metric = False
  elif metric == _InternalEvaluationMetrics.ECI:
+ task = _InternalAnnotationTasks.ECI
  include_metric = False
  elif metric == EvaluationMetrics.XPIA:
+ task = Tasks.XPIA
  include_metric = False
  return (
  {
@@ -175,35 +131,39 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
  )


- async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
+ async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
  """Submit request to Responsible AI service for evaluation and return operation ID

- :param data: The data to evaluate.
- :type data: dict
+ :param query: The query to evaluate.
+ :type query: str
+ :param response: The response to evaluate.
+ :type response: str
  :param metric: The evaluation metric to use.
  :type metric: str
  :param rai_svc_url: The Responsible AI service URL.
  :type rai_svc_url: str
  :param token: The Azure authentication token.
  :type token: str
- :param annotation_task: The annotation task to use.
- :type annotation_task: str
  :return: The operation ID.
  :rtype: str
  """
- normalized_user_text = get_formatted_template(data, annotation_task)
- payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
+ user_text = f"<Human>{query}</><System>{response}</>"
+ normalized_user_text = user_text.replace("'", '\\"')
+ payload = generate_payload(normalized_user_text, metric)

  url = rai_svc_url + "/submitannotation"
  headers = get_common_headers(token)

- async with get_async_http_client_with_timeout() as client:
- http_response = await client.post(url, json=payload, headers=headers)
+ async with get_async_http_client() as client:
+ response = await client.post( # pylint: disable=too-many-function-args,unexpected-keyword-arg
+ url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
+ )
+
+ if response.status_code != 202:
+ print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text))
+ response.raise_for_status()

- if http_response.status_code != 202:
- print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
- http_response.raise_for_status()
- result = http_response.json()
+ result = response.json()
  operation_id = result["location"].split("/")[-1]
  return operation_id

@@ -230,8 +190,10 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
  token = await fetch_or_reuse_token(credential, token)
  headers = get_common_headers(token)

- async with get_async_http_client_with_timeout() as client:
- response = await client.get(url, headers=headers)
+ async with get_async_http_client() as client:
+ response = await client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
+ url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
+ )

  if response.status_code == 200:
  return response.json()
@@ -246,95 +208,64 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre


  def parse_response( # pylint: disable=too-many-branches,too-many-statements
- batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
- ) -> Dict[str, Union[str, float]]:
+ batch_response: List[Dict], metric_name: str
+ ) -> Dict:
  """Parse the annotation response from Responsible AI service for a content harm evaluation.

  :param batch_response: The annotation response from Responsible AI service.
  :type batch_response: List[Dict]
  :param metric_name: The evaluation metric to use.
  :type metric_name: str
- :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
- :type metric_display_name: Optional[str]
  :return: The parsed annotation result.
- :rtype: Dict[str, Union[str, float]]
+ :rtype: List[List[Dict]]
  """
- if metric_display_name is None:
- metric_display_name = metric_name
-
  # non-numeric metrics
- if metric_name in {
- EvaluationMetrics.PROTECTED_MATERIAL,
- _InternalEvaluationMetrics.ECI,
- EvaluationMetrics.XPIA,
- }:
- result = {}
- if not batch_response or len(batch_response[0]) == 0:
- return {}
- if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
- pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
- for pm_metric_name in pm_metric_names:
- response = batch_response[0][pm_metric_name]
- response = response.replace("false", "False")
- response = response.replace("true", "True")
- parsed_response = literal_eval(response)
- result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
- result[pm_metric_name + "_reason"] = (
- parsed_response["reasoning"] if "reasoning" in parsed_response else ""
- )
- return result
- if metric_name not in batch_response[0]:
+ if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
+ if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
  return {}
  response = batch_response[0][metric_name]
  response = response.replace("false", "False")
  response = response.replace("true", "True")
  parsed_response = literal_eval(response)
+ result = {}
  # Use label instead of score since these are assumed to be boolean results.
- # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
- result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
- result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+ # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
+ result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+ result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

  if metric_name == EvaluationMetrics.XPIA:
  # Add "manipulated_content", "intrusion" and "information_gathering" to the result
- # if present else set them to math.nan
- result[metric_display_name + "_manipulated_content"] = (
- parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
+ # if present else set them to np.nan
+ result[metric_name + "_manipulated_content"] = (
+ parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
  )
- result[metric_display_name + "_intrusion"] = (
- parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
+ result[metric_name + "_intrusion"] = (
+ parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
  )
- result[metric_display_name + "_information_gathering"] = (
- parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
+ result[metric_name + "_information_gathering"] = (
+ parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
  )
  return result
- return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
+ return _parse_content_harm_response(batch_response, metric_name)


- def _parse_content_harm_response(
- batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
- ) -> Dict[str, Union[str, float]]:
+ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
  """Parse the annotation response from Responsible AI service for a content harm evaluation.

  :param batch_response: The annotation response from Responsible AI service.
  :type batch_response: List[Dict]
  :param metric_name: The evaluation metric to use.
  :type metric_name: str
- :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
- :type metric_display_name: Optional[str]
  :return: The parsed annotation result.
- :rtype: Dict[str, Union[str, float]]
+ :rtype: List[List[Dict]]
  """
  # Fix the metric name if it's "hate_fairness"
  # Eventually we will remove this fix once the RAI service is updated
- key = metric_name if metric_display_name is None else metric_display_name
+ key = metric_name
  if key == EvaluationMetrics.HATE_FAIRNESS:
  key = EvaluationMetrics.HATE_UNFAIRNESS

- result: Dict[str, Union[str, float]] = {
- (key.value if hasattr(key, "value") else key): math.nan,
- f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
- f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
- }
+ result = {key: np.nan, key + "_score": np.nan, key + "_reason": ""}

  response = batch_response[0]
  if metric_name not in response:
@@ -352,11 +283,11 @@ def _parse_content_harm_response(

  # get content harm metric_value
  if "label" in harm_response:
- metric_value = float(harm_response["label"])
+ metric_value = harm_response["label"]
  elif "valid" in harm_response:
- metric_value = 0 if harm_response["valid"] else math.nan
+ metric_value = 0 if harm_response["valid"] else np.nan
  else:
- metric_value = math.nan
+ metric_value = np.nan

  # get reason
  if "reasoning" in harm_response:
@@ -370,23 +301,21 @@ def _parse_content_harm_response(
  if metric_value_match:
  metric_value = int(metric_value_match[0])
  else:
- metric_value = math.nan
+ metric_value = np.nan
  reason = harm_response
  elif harm_response != "" and isinstance(harm_response, (int, float)):
  if 0 < harm_response <= 7:
  metric_value = harm_response
  else:
- metric_value = math.nan
+ metric_value = np.nan
  reason = ""
  else:
- metric_value = math.nan
+ metric_value = np.nan
  reason = ""

  harm_score = metric_value
- if metric_value == "n/a":
- return result
- if not math.isnan(metric_value):
- # int(math.nan) causes a value error, and math.nan is already handled
+ if not np.isnan(metric_value):
+ # int(np.nan) causes a value error, and np.nan is already handled
  # by get_harm_severity_level
  harm_score = int(metric_value)
  result[key] = get_harm_severity_level(harm_score)
@@ -408,34 +337,31 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
  """
  headers = get_common_headers(token)

- async with get_async_http_client_with_timeout() as client:
- response = await client.get(
+ async with get_async_http_client() as client:
+ response = await client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
  f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
  f"resourceGroups/{azure_ai_project['resource_group_name']}/"
  f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
  f"api-version=2023-08-01-preview",
  headers=headers,
+ timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT,
  )

  if response.status_code != 200:
- msg = (
- f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
- f"and make sure you have the necessary access permissions. "
- f"Status code: {response.status_code}."
- )
+ msg = "Failed to retrieve the discovery service URL."
  raise EvaluationException(
  message=msg,
+ internal_message=msg,
  target=ErrorTarget.RAI_CLIENT,
- blame=ErrorBlame.USER_ERROR,
- category=ErrorCategory.PROJECT_ACCESS_ERROR,
- tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+ category=ErrorCategory.SERVICE_UNAVAILABLE,
+ blame=ErrorBlame.UNKNOWN,
  )

  base_url = urlparse(response.json()["properties"]["discoveryUrl"])
  return f"{base_url.scheme}://{base_url.netloc}"


- async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
+ async def get_rai_svc_url(project_scope: dict, token: str) -> str:
  """Get the Responsible AI service URL

  :param project_scope: The Azure AI project scope details.
@@ -459,7 +385,7 @@ async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
  return rai_url


- async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
+ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str:
  """Get token. Fetch a new token if the current token is near expiry

  :param credential: The Azure authentication credential.
@@ -469,147 +395,35 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str]
  :type token: str
  :return: The Azure authentication token.
  """
- if token:
- # Decode the token to get its expiration time
- try:
+ acquire_new_token = True
+ try:
+ if token:
+ # Decode the token to get its expiration time
  decoded_token = jwt.decode(token, options={"verify_signature": False})
- except jwt.PyJWTError:
- pass
- else:
  exp_time = decoded_token["exp"]
  current_time = time.time()

- # Return current token if not near expiry
+ # Check if the token is near expiry
  if (exp_time - current_time) >= 300:
- return token
-
- return credential.get_token("https://management.azure.com/.default").token
-
-
- async def evaluate_with_rai_service(
- data: dict,
- metric_name: str,
- project_scope: AzureAIProject,
- credential: TokenCredential,
- annotation_task: str = Tasks.CONTENT_HARM,
- metric_display_name=None,
- ) -> Dict[str, Union[str, float]]:
- """ "Evaluate the content safety of the response using Responsible AI service
-
- :param data: The data to evaluate.
- :type data: dict
- :param metric_name: The evaluation metric to use.
- :type metric_name: str
- :param project_scope: The Azure AI project scope details.
- :type project_scope: Dict
- :param credential: The Azure authentication credential.
- :type credential:
- ~azure.core.credentials.TokenCredential
- :param annotation_task: The annotation task to use.
- :type annotation_task: str
- :param metric_display_name: The display name of metric to use.
- :type metric_display_name: str
- :return: The parsed annotation result.
- :rtype: Dict[str, Union[str, float]]
- """
-
- # Get RAI service URL from discovery service and check service availability
- token = await fetch_or_reuse_token(credential)
- rai_svc_url = await get_rai_svc_url(project_scope, token)
- await ensure_service_availability(rai_svc_url, token, annotation_task)
-
- # Submit annotation request and fetch result
- operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
- annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
- result = parse_response(annotation_response, metric_name, metric_display_name)
-
- return result
-
-
- def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
- """Generate the payload for the annotation request
- :param content_type: The type of the content representing multimodal or images.
- :type content_type: str
- :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
- :type messages: str
- :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
- in the payload.
- :type metric: str
- :return: The payload for the annotation request.
- :rtype: Dict
- """
- include_metric = True
- task = Tasks.CONTENT_HARM
- if metric == EvaluationMetrics.PROTECTED_MATERIAL:
- task = Tasks.PROTECTED_MATERIAL
- include_metric = False
-
- if include_metric:
- return {
- "ContentType": content_type,
- "Contents": [{"messages": messages}],
- "AnnotationTask": task,
- "MetricList": [metric],
- }
- return {
- "ContentType": content_type,
- "Contents": [{"messages": messages}],
- "AnnotationTask": task,
- }
+ acquire_new_token = False
+ except Exception: # pylint: disable=broad-exception-caught
+ pass

+ if acquire_new_token:
+ token = credential.get_token("https://management.azure.com/.default").token

- async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
- """Submit request to Responsible AI service for evaluation and return operation ID
- :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
- :type messages: str
- :param metric: The evaluation metric to use.
- :type metric: str
- :param rai_svc_url: The Responsible AI service URL.
- :type rai_svc_url: str
- :param token: The Azure authentication token.
- :type token: str
- :return: The operation ID.
- :rtype: str
- """
- ## handle json payload and payload from inference sdk strongly type messages
- if len(messages) > 0 and not isinstance(messages[0], dict):
- try:
- from azure.ai.inference.models import ChatRequestMessage
- except ImportError as ex:
- error_message = (
- "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
- )
- raise MissingRequiredPackage(message=error_message) from ex
- if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
- messages = [message.as_dict() for message in messages]
+ return token

- filtered_messages = [message for message in messages if message["role"] != "system"]
- assistant_messages = [message for message in messages if message["role"] == "assistant"]
- content_type = retrieve_content_type(assistant_messages, metric)
- payload = generate_payload_multimodal(content_type, filtered_messages, metric)

- ## calling rai service for annotation
- url = rai_svc_url + "/submitannotation"
- headers = get_common_headers(token)
- async with get_async_http_client() as client:
- response = await client.post( # pylint: disable=too-many-function-args,unexpected-keyword-arg
- url, json=payload, headers=headers
- )
- if response.status_code != 202:
- raise HttpResponseError(
- message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
- )
- result = response.json()
- operation_id = result["location"].split("/")[-1]
- return operation_id
-
-
- async def evaluate_with_rai_service_multimodal(
- messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+ async def evaluate_with_rai_service(
+ query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
  ):
  """ "Evaluate the content safety of the response using Responsible AI service
- :param messages: The normalized list of messages.
- :type messages: str
+
+ :param query: The query to evaluate.
+ :type query: str
+ :param response: The response to evaluate.
+ :type response: str
  :param metric_name: The evaluation metric to use.
  :type metric_name: str
  :param project_scope: The Azure AI project scope details.
@@ -620,13 +434,19 @@ async def evaluate_with_rai_service_multimodal(
  :return: The parsed annotation result.
  :rtype: List[List[Dict]]
  """
+ # Use DefaultAzureCredential if no credential is provided
+ # This is for the for batch run scenario as the credential cannot be serialized by promoptflow
+ if credential is None or credential == {}:
+ credential = DefaultAzureCredential()

  # Get RAI service URL from discovery service and check service availability
  token = await fetch_or_reuse_token(credential)
  rai_svc_url = await get_rai_svc_url(project_scope, token)
  await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+
  # Submit annotation request and fetch result
- operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
- annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+ operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
+ annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
  result = parse_response(annotation_response, metric_name)
+
  return result
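
Note on the signature change visible in the last two hunks: on the 1.0.0 side, evaluate_with_rai_service takes a data dict plus an annotation_task (and optional metric_display_name), while on the 1.0.0b2 side it takes query and response strings directly. The sketch below is only an illustration of that 1.0.0b2-era shape, assuming azure-ai-evaluation 1.0.0b2 and azure-identity are installed and the project-scope placeholders are replaced with real values; the module is internal ("_common"), so this is not a documented public API.

# Sketch only: 1.0.0b2-style call shape, inferred from the diff above.
import asyncio

from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
from azure.identity import DefaultAzureCredential

# Placeholder project scope; substitute real Azure AI project values.
project_scope = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

async def main():
    # 1.0.0b2: query/response strings are passed directly.
    # 1.0.0: the same helper instead takes a data dict plus an annotation_task.
    result = await evaluate_with_rai_service(
        query="example user query",
        response="example model response",
        metric_name=EvaluationMetrics.HATE_UNFAIRNESS,
        project_scope=project_scope,
        credential=DefaultAzureCredential(),
    )
    print(result)

asyncio.run(main())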