azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic.

Files changed (64)
  1. azure/ai/evaluation/__init__.py +1 -1
  2. azure/ai/evaluation/_common/constants.py +4 -2
  3. azure/ai/evaluation/_common/math.py +18 -0
  4. azure/ai/evaluation/_common/rai_service.py +54 -62
  5. azure/ai/evaluation/_common/utils.py +182 -12
  6. azure/ai/evaluation/_constants.py +10 -2
  7. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
  8. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
  10. azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
  11. azure/ai/evaluation/_evaluate/_evaluate.py +116 -62
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +44 -25
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  15. azure/ai/evaluation/_evaluators/_common/_base_eval.py +59 -30
  16. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +10 -13
  17. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -20
  18. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
  19. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
  20. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -4
  22. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +4 -4
  23. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -4
  24. azure/ai/evaluation/_evaluators/_eci/_eci.py +4 -4
  25. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  26. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  27. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +3 -2
  28. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +4 -4
  29. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
  30. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +3 -2
  31. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +11 -8
  32. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  33. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +21 -7
  34. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -5
  35. azure/ai/evaluation/_exceptions.py +9 -6
  36. azure/ai/evaluation/_http_utils.py +203 -132
  37. azure/ai/evaluation/_model_configurations.py +5 -5
  38. azure/ai/evaluation/_vendor/__init__.py +3 -0
  39. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  40. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  41. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  42. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  43. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  44. azure/ai/evaluation/_version.py +1 -1
  45. azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
  46. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  47. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  48. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  49. azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
  50. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
  51. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
  52. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  53. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  54. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  55. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  56. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -5
  57. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
  58. azure/ai/evaluation/simulator/_simulator.py +112 -113
  59. azure/ai/evaluation/simulator/_tracing.py +4 -4
  60. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +72 -44
  61. azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
  62. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/RECORD +64 -56
  63. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
  64. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/__init__.py
+++ b/azure/ai/evaluation/__init__.py
@@ -27,8 +27,8 @@ from ._evaluators._xpia import IndirectAttackEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
-    OpenAIModelConfiguration,
     EvaluatorConfig,
+    OpenAIModelConfiguration,
 )
 
 __all__ = [
--- a/azure/ai/evaluation/_common/constants.py
+++ b/azure/ai/evaluation/_common/constants.py
@@ -3,6 +3,8 @@
 # ---------------------------------------------------------
 from enum import Enum
 
+from azure.core import CaseInsensitiveEnumMeta
+
 
 class CommonConstants:
     """Define common constants."""
@@ -43,7 +45,7 @@ class _InternalAnnotationTasks:
     ECI = "eci"
 
 
-class EvaluationMetrics:
+class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
    """Evaluation metrics to aid the RAI service in determining what
    metrics to request, and how to present them back to the user."""
 
@@ -56,7 +58,7 @@ class EvaluationMetrics:
     XPIA = "xpia"
 
 
-class _InternalEvaluationMetrics:
+class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
    """Evaluation metrics that are not publicly supported.
    These metrics are experimental and subject to potential change or migration to the main
    enum over time.
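
Note: rebasing these constant classes on (str, Enum) with azure.core's CaseInsensitiveEnumMeta keeps plain-string comparisons working while making member lookup tolerant of casing. A small standalone sketch (the Metric enum below is illustrative, not the package's definition):

    from enum import Enum
    from azure.core import CaseInsensitiveEnumMeta

    class Metric(str, Enum, metaclass=CaseInsensitiveEnumMeta):  # illustrative stand-in
        VIOLENCE = "violence"

    assert Metric["violence"] is Metric.VIOLENCE  # name lookup ignores case
    assert Metric.VIOLENCE == "violence"          # str mixin keeps string comparisons working
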
--- /dev/null
+++ b/azure/ai/evaluation/_common/math.py
@@ -0,0 +1,18 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import math
+from typing import List
+
+
+def list_sum(lst: List[float]) -> float:
+    return sum(lst)
+
+
+def list_mean(lst: List[float]) -> float:
+    return list_sum(lst) / len(lst)
+
+
+def list_mean_nan_safe(lst: List[float]) -> float:
+    return list_mean([l for l in lst if not math.isnan(l)])
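
These helpers give the package a NaN-aware mean built on the standard library rather than numpy. A quick illustration of the behavior (the input values are made up; the import path mirrors the new module added above):

    import math
    from azure.ai.evaluation._common.math import list_mean_nan_safe

    scores = [4.0, math.nan, 6.0]
    # NaN entries are dropped before averaging, so the result is (4.0 + 6.0) / 2 = 5.0
    # instead of propagating NaN.
    print(list_mean_nan_safe(scores))  # 5.0

Note that, as written, an input containing only NaN values leaves an empty list and would raise ZeroDivisionError in list_mean.
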
--- a/azure/ai/evaluation/_common/rai_service.py
+++ b/azure/ai/evaluation/_common/rai_service.py
@@ -3,20 +3,20 @@
 # ---------------------------------------------------------
 import asyncio
 import importlib.metadata
+import math
 import re
 import time
 from ast import literal_eval
-from typing import Dict, List
+from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
 
 import jwt
-import numpy as np
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._http_utils import get_async_http_client
+from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
-from azure.identity import DefaultAzureCredential
+from azure.core.pipeline.policies import AsyncRetryPolicy
 
 from .constants import (
     CommonConstants,
@@ -53,7 +53,13 @@ def get_common_headers(token: str) -> Dict:
     }
 
 
-async def ensure_service_availability(rai_svc_url: str, token: str, capability: str = None) -> None:
+def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
+    return get_async_http_client().with_policies(
+        retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
+    )
+
+
+async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
     """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
 
     :param rai_svc_url: The Responsible AI service URL.
@@ -68,9 +74,7 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
     svc_liveness_url = rai_svc_url + "/checkannotation"
 
     async with get_async_http_client() as client:
-        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
-            svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+        response = await client.get(svc_liveness_url, headers=headers)
 
     if response.status_code != 200:
         msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
@@ -154,16 +158,14 @@ async def submit_request(query: str, response: str, metric: str, rai_svc_url: st
     url = rai_svc_url + "/submitannotation"
     headers = get_common_headers(token)
 
-    async with get_async_http_client() as client:
-        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
-            url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+    async with get_async_http_client_with_timeout() as client:
+        http_response = await client.post(url, json=payload, headers=headers)
 
-    if response.status_code != 202:
-        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text))
-        response.raise_for_status()
+    if http_response.status_code != 202:
+        print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
+        http_response.raise_for_status()
 
-    result = response.json()
+    result = http_response.json()
     operation_id = result["location"].split("/")[-1]
     return operation_id
 
@@ -190,10 +192,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
     token = await fetch_or_reuse_token(credential, token)
     headers = get_common_headers(token)
 
-    async with get_async_http_client() as client:
-        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
-            url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
-        )
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(url, headers=headers)
 
     if response.status_code == 200:
         return response.json()
@@ -209,7 +209,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
 
 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     batch_response: List[Dict], metric_name: str
-) -> Dict:
+) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
@@ -217,7 +217,7 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :return: The parsed annotation result.
-    :rtype: List[List[Dict]]
+    :rtype: Dict[str, Union[str, float]]
     """
     # non-numeric metrics
     if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
@@ -229,27 +229,27 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         parsed_response = literal_eval(response)
         result = {}
         # Use label instead of score since these are assumed to be boolean results.
-        # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
-        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
+        # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+        result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
         result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
 
         if metric_name == EvaluationMetrics.XPIA:
             # Add "manipulated_content", "intrusion" and "information_gathering" to the result
-            # if present else set them to np.nan
+            # if present else set them to math.nan
             result[metric_name + "_manipulated_content"] = (
-                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
+                parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
             )
             result[metric_name + "_intrusion"] = (
-                parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
+                parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
             )
             result[metric_name + "_information_gathering"] = (
-                parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
+                parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
             )
         return result
     return _parse_content_harm_response(batch_response, metric_name)
 
 
-def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
+def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict[str, Union[str, float]]:
     """Parse the annotation response from Responsible AI service for a content harm evaluation.
 
     :param batch_response: The annotation response from Responsible AI service.
@@ -257,7 +257,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
     :param metric_name: The evaluation metric to use.
     :type metric_name: str
     :return: The parsed annotation result.
-    :rtype: List[List[Dict]]
+    :rtype: Dict[str, Union[str, float]]
     """
     # Fix the metric name if it's "hate_fairness"
     # Eventually we will remove this fix once the RAI service is updated
@@ -265,7 +265,7 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS
 
-    result = {key: np.nan, key + "_score": np.nan, key + "_reason": ""}
+    result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": ""}
 
     response = batch_response[0]
     if metric_name not in response:
@@ -285,9 +285,9 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         if "label" in harm_response:
             metric_value = harm_response["label"]
         elif "valid" in harm_response:
-            metric_value = 0 if harm_response["valid"] else np.nan
+            metric_value = 0 if harm_response["valid"] else math.nan
         else:
-            metric_value = np.nan
+            metric_value = math.nan
 
         # get reason
         if "reasoning" in harm_response:
@@ -301,21 +301,21 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
         if metric_value_match:
             metric_value = int(metric_value_match[0])
         else:
-            metric_value = np.nan
+            metric_value = math.nan
         reason = harm_response
     elif harm_response != "" and isinstance(harm_response, (int, float)):
         if 0 < harm_response <= 7:
             metric_value = harm_response
         else:
-            metric_value = np.nan
+            metric_value = math.nan
         reason = ""
     else:
-        metric_value = np.nan
+        metric_value = math.nan
         reason = ""
 
     harm_score = metric_value
-    if not np.isnan(metric_value):
-        # int(np.nan) causes a value error, and np.nan is already handled
+    if not math.isnan(metric_value):
+        # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
         harm_score = int(metric_value)
     result[key] = get_harm_severity_level(harm_score)
@@ -337,14 +337,13 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     """
     headers = get_common_headers(token)
 
-    async with get_async_http_client() as client:
-        response = await client.get(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+    async with get_async_http_client_with_timeout() as client:
+        response = await client.get(
             f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
             f"resourceGroups/{azure_ai_project['resource_group_name']}/"
             f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
             f"api-version=2023-08-01-preview",
             headers=headers,
-            timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT,
         )
 
     if response.status_code != 200:
@@ -361,7 +360,7 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
     return f"{base_url.scheme}://{base_url.netloc}"
 
 
-async def get_rai_svc_url(project_scope: dict, token: str) -> str:
+async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
     """Get the Responsible AI service URL
 
     :param project_scope: The Azure AI project scope details.
@@ -385,7 +384,7 @@ async def get_rai_svc_url(project_scope: dict, token: str) -> str:
     return rai_url
 
 
-async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str:
+async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
     """Get token. Fetch a new token if the current token is near expiry
 
     :param credential: The Azure authentication credential.
@@ -395,29 +394,26 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -
     :type token: str
     :return: The Azure authentication token.
     """
-    acquire_new_token = True
-    try:
-        if token:
-            # Decode the token to get its expiration time
+    if token:
+        # Decode the token to get its expiration time
+        try:
             decoded_token = jwt.decode(token, options={"verify_signature": False})
+        except jwt.PyJWTError:
+            pass
+        else:
             exp_time = decoded_token["exp"]
             current_time = time.time()
 
-            # Check if the token is near expiry
+            # Return current token if not near expiry
             if (exp_time - current_time) >= 300:
-                acquire_new_token = False
-    except Exception:  # pylint: disable=broad-exception-caught
-        pass
-
-    if acquire_new_token:
-        token = credential.get_token("https://management.azure.com/.default").token
+                return token
 
-    return token
+    return credential.get_token("https://management.azure.com/.default").token
 
 
 async def evaluate_with_rai_service(
     query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
-):
+) -> Dict[str, Union[str, float]]:
     """ "Evaluate the content safety of the response using Responsible AI service
 
     :param query: The query to evaluate.
@@ -432,12 +428,8 @@ async def evaluate_with_rai_service(
     :type credential:
         ~azure.core.credentials.TokenCredential
     :return: The parsed annotation result.
-    :rtype: List[List[Dict]]
+    :rtype: Dict[str, Union[str, float]]
     """
-    # Use DefaultAzureCredential if no credential is provided
-    # This is for the for batch run scenario as the credential cannot be serialized by promoptflow
-    if credential is None or credential == {}:
-        credential = DefaultAzureCredential()
 
     # Get RAI service URL from discovery service and check service availability
     token = await fetch_or_reuse_token(credential)
@@ -446,7 +438,7 @@ async def evaluate_with_rai_service(
 
     # Submit annotation request and fetch result
    operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
-    annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
+    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
     result = parse_response(annotation_response, metric_name)
 
     return result
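
The reworked fetch_or_reuse_token path only re-acquires a credential when the cached token is within five minutes of expiry, reading the exp claim via an unverified decode. The same check can be reproduced with PyJWT alone; this sketch builds a throwaway token purely for demonstration:

    import time
    import jwt  # PyJWT

    # Illustrative only: a short-lived token with a known expiry.
    token = jwt.encode({"exp": int(time.time()) + 600}, key="secret", algorithm="HS256")
    claims = jwt.decode(token, options={"verify_signature": False})
    needs_refresh = (claims["exp"] - time.time()) < 300  # refresh when under 5 minutes remain
    print(needs_refresh)  # False for this 10-minute token
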
--- a/azure/ai/evaluation/_common/utils.py
+++ b/azure/ai/evaluation/_common/utils.py
@@ -2,21 +2,29 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import math
 import threading
-from typing import List, Union
+from typing import Any, List, Literal, Mapping, Type, TypeVar, Union, cast, get_args, get_origin
 
 import nltk
-import numpy as np
+from typing_extensions import NotRequired, Required, TypeGuard
 
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import (
+    AzureAIProject,
+    AzureOpenAIModelConfiguration,
+    OpenAIModelConfiguration,
+)
 
 from . import constants
 
 _nltk_data_download_lock = threading.Lock()
 
+T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])
 
-def get_harm_severity_level(harm_score: int) -> str:
+
+def get_harm_severity_level(harm_score: int) -> Union[str, float]:
     """Generate harm severity level based on harm score.
 
     :param harm_score: The harm score to be evaluated.
@@ -30,12 +38,12 @@ def get_harm_severity_level(harm_score: int) -> str:
         constants.HarmSeverityLevel.Medium: [4, 5],
         constants.HarmSeverityLevel.High: [6, 7],
     }
-    if harm_score == np.nan or harm_score is None:
-        return np.nan
+    if math.isnan(harm_score) or harm_score is None:
+        return math.nan
     for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
         if harm_score_range[0] <= harm_score <= harm_score_range[1]:
             return harm_level.value
-    return np.nan
+    return math.nan
 
 
 def ensure_nltk_data_downloaded():
@@ -71,12 +79,20 @@ def nltk_tokenize(text: str) -> List[str]:
     return list(tokens)
 
 
+def _is_aoi_model_config(val: object) -> TypeGuard[AzureOpenAIModelConfiguration]:
+    return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("azure_endpoint", "azure_deployment"))
+
+
+def _is_openai_model_config(val: object) -> TypeGuard[OpenAIModelConfiguration]:
+    return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("model"))
+
+
 def parse_model_config_type(
     model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
 ) -> None:
-    if "azure_endpoint" in model_config or "azure_deployment" in model_config:
+    if _is_aoi_model_config(model_config):
         model_config["type"] = AZURE_OPENAI_TYPE
-    else:
+    elif _is_openai_model_config(model_config):
         model_config["type"] = OPENAI_TYPE
 
 
@@ -87,16 +103,170 @@ def construct_prompty_model_config(
 ) -> dict:
     parse_model_config_type(model_config)
 
-    if model_config["type"] == AZURE_OPENAI_TYPE:
+    if _is_aoi_model_config(model_config):
         model_config["api_version"] = model_config.get("api_version", default_api_version)
 
-    prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+    prompty_model_config: dict = {"configuration": model_config, "parameters": {"extra_headers": {}}}
 
     # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
     # https://github.com/encode/httpx/discussions/2959
     prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
 
-    if model_config["type"] == AZURE_OPENAI_TYPE and user_agent:
+    if _is_aoi_model_config(model_config) and user_agent:
         prompty_model_config["parameters"]["extra_headers"].update({"x-ms-useragent": user_agent})
 
     return prompty_model_config
+
+
+def validate_azure_ai_project(o: object) -> AzureAIProject:
+    fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
+
+    if not isinstance(o, dict):
+        msg = "azure_ai_project must be a dictionary"
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    missing_fields = set(fields.keys()) - o.keys()
+
+    if missing_fields:
+        msg = "azure_ai_project must contain keys: " + ", ".join(f'"{field}"' for field in missing_fields)
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    for field_name, expected_type in fields.items():
+        if isinstance(o[field_name], expected_type):
+            continue
+
+        msg = f"Expected azure_ai_project field {field_name!r} to be of type {expected_type}."
+
+        raise EvaluationException(
+            message=f"{msg}. Got {type(o[field_name])}.",
+            internal_message=msg,
+            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    return cast(AzureAIProject, o)
+
+
+def validate_model_config(config: dict) -> Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]:
+    try:
+        return _validate_typed_dict(config, AzureOpenAIModelConfiguration)
+    except TypeError:
+        try:
+            return _validate_typed_dict(config, OpenAIModelConfiguration)
+        except TypeError as e:
+            msg = "Model config validation failed."
+            raise EvaluationException(
+                message=msg, internal_message=msg, category=ErrorCategory.MISSING_FIELD, blame=ErrorBlame.USER_ERROR
+            ) from e
+
+
+def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
+    """Do very basic runtime validation that an object is a typed dict
+
+    .. warning::
+
+        This validation is very basic, robust enough to cover some very simple TypedDicts.
+        Ideally, validation of this kind should be delegated to something more robust.
+
+        You will very quickly run into limitations trying to apply this function more broadly:
+        * Doesn't support stringized annotations at all
+        * Very limited support for generics, and "special form" (NoReturn, NotRequired, Required, etc...) types.
+        * Error messages are poor, especially if there is any nesting.
+
+    :param object o: The object to check
+    :param Type[T_TypedDict] t: The TypedDict to validate against
+    :raises NotImplementedError: Several forms of validation are unsupported
+        * Checking against stringized annotations
+        * Checking a generic that is not one of a few basic forms
+    :raises TypeError: If a value does not match the specified annotation
+    :raises ValueError: If t's annotation is not a string, type of a special form (e.g. NotRequired, Required, etc...)
+    :returns: The object passed in
+    :rtype: T_TypedDict
+    """
+    if not isinstance(o, dict):
+        raise TypeError(f"Expected type 'dict', got type '{type(object)}'.")
+
+    annotations = t.__annotations__
+    is_total = getattr(t, "__total__", False)
+    unknown_keys = set(o.keys()) - annotations.keys()
+
+    if unknown_keys:
+        raise TypeError(f"dict contains unknown keys: {list(unknown_keys)!r}")
+
+    required_keys = {
+        k
+        for k in annotations
+        if (is_total and get_origin(annotations[k]) is not NotRequired)
+        or (not is_total and get_origin(annotations[k]) is Required)
+    }
+
+    missing_keys = required_keys - o.keys()
+
+    if missing_keys:
+        raise TypeError(f"Missing required keys: {list(missing_keys)!r}.")
+
+    def validate_annotation(v: object, annotation: Union[str, type, object]) -> bool:
+        if isinstance(annotation, str):
+            raise NotImplementedError("Missing support for validating against stringized annotations.")
+
+        if (origin := get_origin(annotation)) is not None:
+            if origin is tuple:
+                validate_annotation(v, tuple)
+                tuple_args = get_args(annotation)
+                if len(cast(tuple, v)) != len(tuple_args):
+                    raise TypeError(f"Expected a {len(tuple_args)}-tuple, got a {len(cast(tuple, v))}-tuple.")
+                for tuple_val, tuple_args in zip(cast(tuple, v), tuple_args):
+                    validate_annotation(tuple_val, tuple_args)
+            elif origin is dict:
+                validate_annotation(v, dict)
+                dict_key_ann, dict_val_ann = get_args(annotation)
+                for dict_key, dict_val in cast(dict, v).items():
+                    validate_annotation(dict_val, dict_val_ann)
+                    validate_annotation(dict_key, dict_key_ann)
+            elif origin is list:
+                validate_annotation(v, list)
+                list_val_ann = get_args(annotation)[0]
+                for list_val in cast(list, v):
+                    validate_annotation(list_val, list_val_ann)
+            elif origin is Union:
+                for generic_arg in get_args(annotation):
+                    try:
+                        validate_annotation(v, generic_arg)
+                        return True
+                    except TypeError:
+                        pass
+                raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}")
+            elif origin is Literal:
+                literal_args = get_args(annotation)
+                if not any(type(literal) is type(v) and literal == v for literal in literal_args):
+                    raise TypeError(f"Expected value to be one of {list(literal_args)!r}. Received type {type(v)}")
+            elif any(origin is g for g in (NotRequired, Required)):
+                validate_annotation(v, get_args(annotation)[0])
+            else:
+                raise NotImplementedError(f"Validation not implemented for generic {origin}.")
+            return True
+
+        if isinstance(annotation, type):
+            if not isinstance(v, annotation):
+                raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}.")
+            return True
+
+        raise ValueError("Annotation to validate against should be a str, type, or generic.")
+
+    for k, v in o.items():
+        validate_annotation(v, annotations[k])
+
+    return cast(T_TypedDict, o)
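
_validate_typed_dict leans on typing introspection (get_origin, get_args, NotRequired/Required) to derive a TypedDict's required keys at runtime. A standalone illustration of those primitives on a hypothetical TypedDict, separate from the package's own config types:

    from typing_extensions import NotRequired, TypedDict, get_args, get_origin

    class ExampleConfig(TypedDict):  # hypothetical, for illustration only
        endpoint: str
        api_version: NotRequired[str]

    annotations = ExampleConfig.__annotations__
    # For a total TypedDict, every key not wrapped in NotRequired is required.
    required = {k for k, ann in annotations.items() if get_origin(ann) is not NotRequired}
    print(required)                              # {'endpoint'}
    print(get_args(annotations["api_version"]))  # (<class 'str'>,)
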
--- a/azure/ai/evaluation/_constants.py
+++ b/azure/ai/evaluation/_constants.py
@@ -1,6 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Literal
 
 
 class EvaluationMetrics:
@@ -48,6 +49,13 @@ class DefaultOpenEncoding:
     """SDK Default Encoding when writing a file"""
 
 
+class EvaluationRunProperties:
+    """Defines properties used to identify an evaluation run by UI"""
+
+    RUN_TYPE = "runType"
+    EVALUATION_RUN = "_azureml.evaluation_run"
+
+
 DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
 
 CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
@@ -58,6 +66,6 @@ PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
 OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
 
-AZURE_OPENAI_TYPE = "azure_openai"
+AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"
 
-OPENAI_TYPE = "openai"
+OPENAI_TYPE: Literal["openai"] = "openai"
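
Annotating these module constants with Literal[...] lets static type checkers carry the exact string value through call sites. An illustrative sketch (the describe function is hypothetical):

    from typing import Literal

    AZURE_OPENAI_TYPE: Literal["azure_openai"] = "azure_openai"

    def describe(kind: Literal["azure_openai", "openai"]) -> str:
        return f"configured for {kind}"

    print(describe(AZURE_OPENAI_TYPE))  # type-checks because the literal value is preserved
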
--- a/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py
+++ b/azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py
@@ -2,6 +2,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
+import types
+from typing import Optional, Type, Union
 
 from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
 from promptflow._utils.user_agent_utils import ClientUserAgentUtil
@@ -30,12 +32,12 @@ class BatchRunContext:
         ]
     """
 
-    def __init__(self, client) -> None:
+    def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
        self.client = client
        self._is_batch_timeout_set_by_system = False
        self._is_otel_timeout_set_by_system = False
 
-    def __enter__(self):
+    def __enter__(self) -> None:
        if isinstance(self.client, CodeClient):
            ClientUserAgentUtil.append_user_agent(USER_AGENT)
            inject_openai_api()
@@ -56,7 +58,12 @@ class BatchRunContext:
        # For addressing the issue of asyncio event loop closed on Windows
        set_event_loop_policy()
 
-    def __exit__(self, exc_type, exc_val, exc_tb):
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_tb: Optional[types.TracebackType],
+    ) -> None:
        if isinstance(self.client, CodeClient):
            recover_openai_api()