azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (144):
  1. azure/ai/evaluation/__init__.py +10 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +7 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +165 -34
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  59. azure/ai/evaluation/_converters/_models.py +76 -6
  60. azure/ai/evaluation/_eval_mapping.py +73 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  63. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  64. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  65. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  66. azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
  67. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
  68. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  69. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  70. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  71. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  72. azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  76. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  77. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  78. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  79. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  80. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  81. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
  82. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  83. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
  84. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  85. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
  86. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
  87. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  88. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  89. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  90. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
  91. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
  92. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  93. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  94. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  95. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  96. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
  97. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
  98. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
  99. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  100. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  101. azure/ai/evaluation/_exceptions.py +2 -0
  102. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  103. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  104. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  106. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  107. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  109. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  110. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  111. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  112. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  113. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  114. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  115. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  116. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  117. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  118. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  119. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
  120. azure/ai/evaluation/_version.py +1 -1
  121. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  122. azure/ai/evaluation/red_team/_red_team.py +976 -546
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  125. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  126. azure/ai/evaluation/simulator/_constants.py +1 -0
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  128. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  129. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
  140. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
  141. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  142. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  143. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  144. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_legacy/prompty/_prompty.py
@@ -2,20 +2,27 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import asyncio
 import re
+
+from logging import Logger
 from os import PathLike
 from pathlib import Path
-from typing import Any, AsyncGenerator, Dict, Final, List, Mapping, Optional, Sequence, Tuple, Union, cast
+from typing import Any, AsyncGenerator, Awaitable, Dict, Final, List, Mapping, Optional, Sequence, Tuple, Union, cast
 
-from openai import AsyncAzureOpenAI, AsyncOpenAI, NotGiven
+from openai import AsyncAzureOpenAI, AsyncOpenAI, NotGiven, OpenAIError
+from openai.lib.azure import AsyncAzureADTokenProvider
+from azure.core.credentials import TokenCredential
+from azure.core.credentials_async import AsyncTokenCredential
 
 from azure.ai.evaluation._exceptions import ErrorTarget
-from azure.ai.evaluation._constants import DefaultOpenEncoding
+from azure.ai.evaluation._constants import DefaultOpenEncoding, TokenScope
 from azure.ai.evaluation._legacy.prompty._exceptions import (
     InvalidInputError,
     PromptyException,
     MissingRequiredInputError,
     NotSupportedError,
+    WrappedOpenAIError,
 )
 from azure.ai.evaluation._legacy.prompty._connection import AzureOpenAIConnection, Connection, OpenAIConnection
 from azure.ai.evaluation._legacy.prompty._yaml_utils import load_yaml_string
@@ -25,10 +32,14 @@ from azure.ai.evaluation._legacy.prompty._utils import (
     OpenAIChatResponseType,
     build_messages,
     format_llm_response,
+    openai_error_retryable,
     prepare_open_ai_request_params,
     resolve_references,
     update_dict_recursively,
 )
+from azure.ai.evaluation._constants import DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
+from azure.ai.evaluation._legacy._common._logging import get_logger
+from azure.ai.evaluation._legacy._common._async_token_provider import AsyncAzureTokenProvider
 
 
 PROMPTY_EXTENSION: Final[str] = ".prompty"
@@ -124,10 +135,24 @@ class AsyncPrompty:
     def __init__(
         self,
         path: Union[str, PathLike],
+        *,
+        logger: Optional[Logger] = None,
+        token_credential: Optional[Union[TokenCredential, AsyncTokenCredential]] = None,
+        is_reasoning_model: bool = False,
         **kwargs: Any,
     ):
         path = Path(path)
         configs, self._template = self._parse_prompty(path)
+
+        if is_reasoning_model:
+            parameters = configs.get("model", {}).get("parameters", {})
+            if "max_tokens" in parameters:
+                parameters.pop("max_tokens", None)
+                parameters["max_completion_tokens"] = DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
+            # Remove unsupported parameters for reasoning models
+            for key in ["temperature", "top_p", "presence_penalty", "frequency_penalty"]:
+                parameters.pop(key, None)
+
         configs = resolve_references(configs, base_path=path.parent)
         configs = update_dict_recursively(configs, resolve_references(kwargs, base_path=path.parent))
 
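
The new `is_reasoning_model` flag rewrites the prompty's model parameters before references are resolved: `max_tokens` is swapped for `max_completion_tokens`, and the sampling parameters that reasoning models reject are dropped. A standalone sketch of the same rewrite, with an assumed cap standing in for `DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS`:

    # Sketch only; MAX_COMPLETION_TOKENS is an assumed stand-in value.
    MAX_COMPLETION_TOKENS = 60000

    def adapt_for_reasoning_model(parameters: dict) -> dict:
        params = dict(parameters)
        if "max_tokens" in params:
            params.pop("max_tokens", None)
            params["max_completion_tokens"] = MAX_COMPLETION_TOKENS
        # Reasoning models reject the classic sampling knobs
        for key in ("temperature", "top_p", "presence_penalty", "frequency_penalty"):
            params.pop(key, None)
        return params

    print(adapt_for_reasoning_model({"max_tokens": 800, "temperature": 0.0}))
    # {'max_completion_tokens': 60000}
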
@@ -142,6 +167,9 @@ class AsyncPrompty:
         self._inputs: Dict[str, Any] = configs.get("inputs", {})
         self._outputs: Dict[str, Any] = configs.get("outputs", {})
         self._name: str = configs.get("name", path.stem)
+        self._logger = logger or get_logger(__name__)
+        self._token_credential: Union[TokenCredential, AsyncTokenCredential] = \
+            token_credential or AsyncAzureTokenProvider()
 
     @property
     def path(self) -> Path:
@@ -234,9 +262,6 @@ class AsyncPrompty:
 
         return resolved_inputs
 
-    # TODO ralphe: error handling
-    # @trace
-    # @handle_openai_error()
     async def __call__(  # pylint: disable=docstring-keyword-should-match-keyword-only
         self,
         **kwargs: Any,
@@ -257,7 +282,7 @@ class AsyncPrompty:
         messages = build_messages(prompt=self._template, working_dir=self.path.parent, **inputs)
         params = prepare_open_ai_request_params(self._model, messages)
 
-        timeout: Union[NotGiven, float] = NotGiven()
+        timeout: Optional[float] = None
         if timeout_val := cast(Any, kwargs.get("timeout", None)):
             timeout = float(timeout_val)
 
@@ -273,6 +298,9 @@ class AsyncPrompty:
                 azure_deployment=connection.azure_deployment,
                 api_version=connection.api_version,
                 max_retries=max_retries,
+                azure_ad_token_provider=(self.get_token_provider(self._token_credential)
+                                         if not connection.api_key
+                                         else None),
             )
         elif isinstance(connection, OpenAIConnection):
             api_client = AsyncOpenAI(
@@ -286,8 +314,10 @@ class AsyncPrompty:
                 f"'{type(connection).__name__}' is not a supported connection type.", target=ErrorTarget.EVAL_RUN
             )
 
-        response: OpenAIChatResponseType = await api_client.with_options(timeout=timeout).chat.completions.create(
-            **params
+        response: OpenAIChatResponseType = await self._send_with_retries(
+            api_client=api_client,
+            params=params,
+            timeout=timeout,
         )
 
         return await format_llm_response(
@@ -311,3 +341,83 @@ class AsyncPrompty:
         inputs = self._resolve_inputs(kwargs)
         messages = build_messages(prompt=self._template, working_dir=self.path.parent, **inputs)
         return messages
+
+    async def _send_with_retries(
+        self,
+        api_client: Union[AsyncAzureOpenAI, AsyncOpenAI],
+        params: Mapping[str, Any],
+        timeout: Optional[float],
+        max_retries: int = 10,
+        max_entity_retries: int = 3,
+    ) -> OpenAIChatResponseType:
+        """Send the request with retries.
+
+        :param Union[AsyncAzureOpenAI, AsyncOpenAI] api_client: The OpenAI client.
+        :param Mapping[str, Any] params: The request parameters.
+        :param Optional[float] timeout: The timeout for the request.
+        :param int max_retries: The maximum number of retries.
+        :param int max_entity_retries: The maximum number of retries for entity errors.
+        :return: The response from OpenAI.
+        :rtype: OpenAIChatResponseType
+        """
+
+        client_name: str = api_client.__class__.__name__
+        client: Union[AsyncAzureOpenAI, AsyncOpenAI] = api_client.with_options(timeout=timeout or NotGiven())
+
+        entity_retries: List[int] = [0]
+        should_retry: bool = True
+        retry: int = 0
+        delay: Optional[float] = None
+
+        while should_retry:
+            try:
+                if delay:
+                    await asyncio.sleep(delay)
+
+                response = await client.chat.completions.create(**params)
+                return response
+            except OpenAIError as error:
+                if retry >= max_retries:
+                    should_retry = False
+                else:
+                    should_retry, delay = openai_error_retryable(error, retry, entity_retries, max_entity_retries)
+
+                if should_retry:
+                    self._logger.warning(
+                        "[%d/%d] %s request failed. %s: %s. Retrying in %f seconds.",
+                        retry,
+                        max_retries,
+                        client_name,
+                        type(error).__name__,
+                        str(error),
+                        delay or 0.0,
+                        exc_info=True,
+                    )
+                else:
+                    self._logger.exception(
+                        "[%d/%d] %s request failed. %s: %s",
+                        retry,
+                        max_retries,
+                        client_name,
+                        type(error).__name__,
+                        str(error),
+                    )
+                    raise WrappedOpenAIError(error=error) from error
+
+            retry += 1
+
+    @staticmethod
+    def get_token_provider(cred: Union[TokenCredential, AsyncTokenCredential]) -> AsyncAzureADTokenProvider:
+        """Get the token provider for the prompty.
+
+        :param Union[TokenCredential, AsyncTokenCredential] cred: The Azure authentication credential.
+        :return: An async token provider that yields a bearer token for the credential.
+        :rtype: AsyncAzureADTokenProvider
+        """
+        async def _wrapper() -> str:
+            token = cred.get_token(TokenScope.COGNITIVE_SERVICES_MANAGEMENT)
+            if isinstance(token, Awaitable):
+                token = await token
+            return token.token
+
+        return _wrapper
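
Taken together, the `AsyncPrompty` changes add keyword-only `logger`, `token_credential`, and `is_reasoning_model` arguments, route every chat-completion call through `_send_with_retries`, and fall back to an Entra ID token provider when the Azure connection has no API key. A hedged usage sketch of the new surface (the `.prompty` file and its inputs are hypothetical, and this is a private module shown for illustration):

    import asyncio
    from azure.identity.aio import DefaultAzureCredential
    from azure.ai.evaluation._legacy.prompty._prompty import AsyncPrompty

    async def main() -> None:
        prompty = AsyncPrompty(
            "coherence.prompty",  # hypothetical .prompty file
            token_credential=DefaultAzureCredential(),  # used only when no api_key is configured
        )
        # Input names depend on the prompty's declared inputs
        result = await prompty(query="What is the capital of France?", response="Paris")
        print(result)

    asyncio.run(main())
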

azure/ai/evaluation/_legacy/prompty/_utils.py
@@ -2,12 +2,15 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+# cspell:ignore apng, retriable
+
 import copy
-from dataclasses import dataclass, is_dataclass, fields
 import os
 import re
 import json
 import base64
+from dataclasses import dataclass, is_dataclass, fields
+from logging import Logger
 from pathlib import Path
 from typing import (
     Any,
@@ -30,6 +33,7 @@ from typing import (
 from jinja2 import Template
 from openai import AsyncStream
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai import APIConnectionError, APIStatusError, APITimeoutError, OpenAIError
 
 from azure.ai.evaluation._constants import DefaultOpenEncoding
 from azure.ai.evaluation._legacy.prompty._exceptions import (
@@ -217,7 +221,7 @@ DEFAULT_IMAGE_MIME_TYPE: Final[str] = "image/*"
 """The mime type to use when we don't know the image type"""
 
 FILE_EXT_TO_MIME: Final[Mapping[str, str]] = {
-    ".apng": "image/apng",  # cspell:ignore apng
+    ".apng": "image/apng",
     ".avif": "image/avif",
     ".bmp": "image/bmp",
     ".gif": "image/gif",
@@ -542,4 +546,70 @@ async def format_llm_response(
     return result
 
 
+def openai_error_retryable(
+    error: OpenAIError, retry: int, entity_retry: List[int], max_entity_retries: int
+) -> Tuple[bool, float]:
+    """
+    Determines if an OpenAI error is retryable, and determines the retry delay to use.
+
+    :param OpenAIError error: The error to handle
+    :param int retry: The current retry count (0 means we're on the first attempt and no retries have been made)
+    :param List[int] entity_retry: The current retry count for the unprocessable entity failures. This should be a
+        list containing only 1 element to mimic pass by reference semantics. A value of 0 means we're on the
+        first attempt and no retries have been made.
+    :param int max_entity_retries: The maximum number of retries to make for unprocessable entity failures
+    :return: A tuple containing whether the error is retryable and the delay to use
+    :rtype: Tuple[bool, float]
+    """
+
+    # Using https://platform.openai.com/docs/guides/error-codes/api-errors#python-library-error-types as a reference
+
+    should_retry: bool
+    delay: Optional[float] = None
+
+    if isinstance(error, APIConnectionError):
+        retriable_error_messages: Sequence[str] = [
+            "connection aborted",
+            # issue 2296
+            "server disconnected without sending a response",
+        ]
+        should_retry = (
+            isinstance(error, APITimeoutError)  # APITimeoutError is a subclass of APIConnectionError
+            or str(error).lower() in retriable_error_messages
+            or str(error.__cause__).lower() in retriable_error_messages
+        )
+    elif isinstance(error, APIStatusError):
+        status_code: int = error.response.status_code
+        if status_code == 422:
+            # As per the original legacy code, UnprocessableEntityError (HTTP 422) should be handled differently
+            # with a smaller retry count, as retrying more may not be beneficial.
+            should_retry = entity_retry[0] < max_entity_retries
+            entity_retry[0] += 1
+        elif status_code == 429:
+            # Two flavours: either you are throttled and should retry after a delay, or you have
+            # exceeded your quota and should not retry.
+            should_retry = (error.type or "").lower() != "insufficient_quota"
+        else:
+            should_retry = status_code >= 500
+
+        # Use what the service tells us to use for the delay if it's provided
+        if should_retry and not delay:
+            delay_str = error.response.headers.get("Retry-After", None)
+            if delay_str is not None:
+                delay = float(delay_str)
+    else:
+        should_retry = False
+
+    # Use exponential backoff for retries if the service doesn't provide a delay
+    if not delay:
+        delay = min(60, 2 + 2**retry)
+
+    return (should_retry, delay)
+
+
 # endregion
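
When the service does not send a `Retry-After` header, the fallback delay is `min(60, 2 + 2**retry)`, so the waits grow exponentially and clamp at 60 seconds. A quick check of that schedule:

    delays = [min(60, 2 + 2**retry) for retry in range(8)]
    print(delays)  # [3, 4, 6, 10, 18, 34, 60, 60]
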

azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
@@ -6,9 +6,10 @@ from enum import Enum
 import os
 import inspect
 import logging
+import asyncio
 from datetime import datetime
 from azure.ai.evaluation._common._experimental import experimental
-from typing import Any, Callable, Dict, List, Optional, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Union, cast, Coroutine, TypeVar, Awaitable
 from azure.ai.evaluation._common.math import list_mean_nan_safe
 from azure.ai.evaluation._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
 from azure.ai.evaluation._evaluators import (
@@ -20,6 +21,8 @@ from azure.ai.evaluation._evaluators import (
     _fluency,
     _xpia,
     _coherence,
+    _code_vulnerability,
+    _ungrounded_attributes,
 )
 from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
 from azure.ai.evaluation._evaluate import _evaluate
@@ -31,7 +34,7 @@ from azure.ai.evaluation.simulator import (
     AdversarialScenario,
     AdversarialScenarioJailbreak,
     IndirectAttackSimulator,
-    DirectAttackSimulator ,
+    DirectAttackSimulator,
 )
 from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario
 from azure.ai.evaluation.simulator._utils import JsonLineList
@@ -71,6 +74,7 @@ class _SafetyEvaluator(Enum):
     """
 
     CONTENT_SAFETY = "content_safety"
+    CODE_VULNERABILITY = "code_vulnerability"
     GROUNDEDNESS = "groundedness"
     PROTECTED_MATERIAL = "protected_material"
     RELEVANCE = "relevance"
@@ -80,21 +84,22 @@ class _SafetyEvaluator(Enum):
     INDIRECT_ATTACK = "indirect_attack"
     DIRECT_ATTACK = "direct_attack"
     ECI = "eci"
+    UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
 
 
 @experimental
 class _SafetyEvaluation:
     def __init__(
         self,
-        azure_ai_project: dict,
+        azure_ai_project: Union[str, dict],
         credential: TokenCredential,
         model_config: Optional[Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]] = None,
     ):
         """
         Initializes a SafetyEvaluation object.
 
-        :param azure_ai_project: A dictionary defining the Azure AI project. Required keys are 'subscription_id', 'resource_group_name', and 'project_name'.
-        :type azure_ai_project: Dict[str, str]
+        :param azure_ai_project: A string or dictionary defining the Azure AI project. When a dictionary is provided, the required keys are 'subscription_id', 'resource_group_name', and 'project_name'.
+        :type azure_ai_project: Union[str, Dict[str, str]]
         :param credential: The credential for connecting to Azure AI project.
         :type credential: ~azure.core.credentials.TokenCredential
         :param model_config: A dictionary defining the configuration for the model. Acceptable types are AzureOpenAIModelConfiguration and OpenAIModelConfiguration.
@@ -106,8 +111,7 @@ class _SafetyEvaluation:
             self.model_config = model_config
         else:
             self.model_config = None
-        validate_azure_ai_project(azure_ai_project)
-        self.azure_ai_project = AzureAIProject(**azure_ai_project)
+        self.azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self.credential = credential
         self.logger = _setup_logger()
 
@@ -157,6 +161,8 @@ class _SafetyEvaluation:
         adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]] = None,
         source_text: Optional[str] = None,
         direct_attack: bool = False,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
     ) -> Dict[str, str]:
         """
         Generates synthetic conversations based on provided parameters.
@@ -192,10 +198,17 @@ class _SafetyEvaluation:
             context = latest_message.get("context", None)
             latest_context = None
             try:
+                is_async = self._is_async_function(target)
                 if self._check_target_returns_context(target):
-                    response, latest_context = target(query=application_input)
+                    if is_async:
+                        response, latest_context = await target(query=application_input)
+                    else:
+                        response, latest_context = target(query=application_input)
                 else:
-                    response = target(query=application_input)
+                    if is_async:
+                        response = await target(query=application_input)
+                    else:
+                        response = target(query=application_input)
             except Exception as e:
                 response = f"Something went wrong {e!s}"
@@ -234,6 +247,8 @@ class _SafetyEvaluation:
                 conversation_turns=conversation_turns,
                 text=source_text,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks,
             )
 
         # if DirectAttack, run DirectAttackSimulator
@@ -247,6 +262,8 @@ class _SafetyEvaluation:
                 max_conversation_turns=max_conversation_turns,
                 max_simulation_results=max_simulation_results,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks,
             )
             jailbreak_outputs = simulator_outputs["jailbreak"]
             simulator_outputs = simulator_outputs["regular"]
@@ -264,6 +281,7 @@ class _SafetyEvaluation:
                 num_queries=max_simulation_results,
                 target=callback,
                 text=source_text if source_text else "",
+                concurrent_async_tasks=concurrent_async_tasks,
             )
 
         ## Run AdversarialSimulator
@@ -279,6 +297,8 @@ class _SafetyEvaluation:
                 conversation_turns=conversation_turns,
                 target=callback,
                 text=source_text,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks,
             )
 
         ## If no outputs are generated, raise an exception
@@ -372,6 +392,10 @@ class _SafetyEvaluation:
             )
         if evaluator == _SafetyEvaluator.ECI:
             return _UnstableAdversarialScenario.ECI
+        if evaluator == _SafetyEvaluator.CODE_VULNERABILITY:
+            return AdversarialScenario.ADVERSARIAL_CODE_VULNERABILITY
+        if evaluator == _SafetyEvaluator.UNGROUNDED_ATTRIBUTES:
+            return AdversarialScenario.ADVERSARIAL_UNGROUNDED_ATTRIBUTES
         if evaluator in [
             _SafetyEvaluator.GROUNDEDNESS,
             _SafetyEvaluator.RELEVANCE,
@@ -453,6 +477,14 @@ class _SafetyEvaluation:
             evaluators_dict["eci"] = ECIEvaluator(
                 azure_ai_project=self.azure_ai_project, credential=self.credential
             )
+        elif evaluator == _SafetyEvaluator.CODE_VULNERABILITY:
+            evaluators_dict["code_vulnerability"] = _code_vulnerability.CodeVulnerabilityEvaluator(
+                azure_ai_project=self.azure_ai_project, credential=self.credential
+            )
+        elif evaluator == _SafetyEvaluator.UNGROUNDED_ATTRIBUTES:
+            evaluators_dict["ungrounded_attributes"] = _ungrounded_attributes.UngroundedAttributesEvaluator(
+                azure_ai_project=self.azure_ai_project, credential=self.credential
+            )
         else:
             msg = (
                 f"Invalid evaluator: {evaluator}. Supported evaluators are: {_SafetyEvaluator.__members__.values()}"
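
The two new enum members are wired through both scenario selection and evaluator construction, so they are requested like any other safety evaluator. A hedged sketch of opting in, reusing the async `my_target` sketched earlier (project and credential values are placeholders, and `_SafetyEvaluation` is a private, experimental API):

    import asyncio
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation._safety_evaluation._safety_evaluation import (
        _SafetyEvaluation,
        _SafetyEvaluator,
    )

    async def main() -> None:
        safety_eval = _SafetyEvaluation(
            azure_ai_project={
                "subscription_id": "<subscription-id>",
                "resource_group_name": "<resource-group>",
                "project_name": "<project-name>",
            },
            credential=DefaultAzureCredential(),
        )
        results = await safety_eval(
            target=my_target,  # sync or async callable accepting query=...
            evaluators=[_SafetyEvaluator.CODE_VULNERABILITY, _SafetyEvaluator.UNGROUNDED_ATTRIBUTES],
            num_turns=1,  # both new evaluators support single-turn conversations only
        )
        print(results)

    asyncio.run(main())
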
@@ -465,7 +497,7 @@ class _SafetyEvaluation:
                 blame=ErrorBlame.USER_ERROR,
             )
         return evaluators_dict
-
+
     @staticmethod
     def _check_target_returns_context(target: Callable) -> bool:
         """
@@ -478,6 +510,15 @@ class _SafetyEvaluation:
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
+
+        # Check for Coroutine/Awaitable return types for async functions
+        origin = getattr(ret_type, "__origin__", None)
+        if origin is not None and (origin is Coroutine or origin is Awaitable):
+            args = getattr(ret_type, "__args__", None)
+            if args and len(args) > 0:
+                # For async functions, check the actual return type inside the Coroutine
+                ret_type = args[-1]
+
         if ret_type is tuple:
             return True
         return False
@@ -494,13 +535,33 @@ class _SafetyEvaluation:
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
+
+        # Check for Coroutine/Awaitable return types for async functions
+        origin = getattr(ret_type, "__origin__", None)
+        if origin is not None and (origin is Coroutine or origin is Awaitable):
+            args = getattr(ret_type, "__args__", None)
+            if args and len(args) > 0:
+                # For async functions, check the actual return type inside the Coroutine
+                ret_type = args[-1]
+
         if ret_type is str:
             return True
         return False
 
-
     @staticmethod
-    def _check_target_is_callback(target:Callable) -> bool:
+    def _is_async_function(target: Callable) -> bool:
+        """
+        Checks if the target function is an async function.
+
+        :param target: The target function to check.
+        :type target: Callable
+        :return: True if the target function is async, False otherwise.
+        :rtype: bool
+        """
+        return asyncio.iscoroutinefunction(target)
+
+    @staticmethod
+    def _check_target_is_callback(target: Callable) -> bool:
         sig = inspect.signature(target)
         param_names = list(sig.parameters.keys())
         return 'messages' in param_names and 'stream' in param_names and 'session_state' in param_names and 'context' in param_names
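
`_is_async_function` delegates to `asyncio.iscoroutinefunction`, which is what distinguishes the two call paths in the simulation callback:

    import asyncio

    async def async_target(query: str) -> str:
        return query

    def sync_target(query: str) -> str:
        return query

    print(asyncio.iscoroutinefunction(async_target))  # True
    print(asyncio.iscoroutinefunction(sync_target))   # False
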
@@ -560,7 +621,28 @@ class _SafetyEvaluation:
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
-
+
+        if _SafetyEvaluator.CODE_VULNERABILITY in evaluators and num_turns > 1:
+            self.logger.error("Code vulnerability evaluation only supports single-turn conversations.")
+            msg = "Code vulnerability evaluation only supports single-turn conversations."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.UNKNOWN,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+        if _SafetyEvaluator.UNGROUNDED_ATTRIBUTES in evaluators and num_turns > 1:
+            self.logger.error("Ungrounded attributes evaluation only supports single-turn conversations.")
+            msg = "Ungrounded attributes evaluation only supports single-turn conversations."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.UNKNOWN,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
         if _SafetyEvaluator.CONTENT_SAFETY in evaluators and scenario and num_turns > 1 and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION:
             self.logger.error(f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn.")
             msg = f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
@@ -604,7 +686,9 @@ class _SafetyEvaluation:
         violence_defects = []
         sexual_defects = []
         self_harm_defects = []
-        for i, row in enumerate(zip(evaluation_result_dict[jailbreak_cols[0]]['rows'], evaluation_result_dict[regular_cols[0]]['rows'])):
+        jailbreak_rows = sorted(evaluation_result_dict[jailbreak_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+        regular_rows = sorted(evaluation_result_dict[regular_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+        for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
             hate_unfairness_defects.append(1 if (row[0]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
             violence_defects.append(1 if (row[0]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
             sexual_defects.append(1 if (row[0]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
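
Sorting both row sets on `inputs.category` before zipping keeps the jailbreak and regular rows aligned by category, with rows that lack a category ordered last (the tuple key sorts on the `is None` check first). For example:

    rows = [{"inputs.category": "violence"}, {}, {"inputs.category": "hate"}]
    rows.sort(key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")))
    print([r.get("inputs.category") for r in rows])  # ['hate', 'violence', None]
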
@@ -630,7 +714,7 @@ class _SafetyEvaluation:
 
     async def __call__(
         self,
-        target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
         evaluators: List[_SafetyEvaluator] = [],
         evaluation_name: Optional[str] = None,
         num_turns: int = 1,
@@ -643,13 +727,15 @@ class _SafetyEvaluation:
         data_path: Optional[Union[str, os.PathLike]] = None,
         jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
         output_path: Optional[Union[str, os.PathLike]] = None,
-        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]] = None
+        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]] = None,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
     ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str, os.PathLike]]]:
         '''
         Evaluates the target function based on the provided parameters.
 
-        :param target: The target function to call during the evaluation.
-        :type target: Callable
+        :param target: The target function to call during the evaluation. This can be a synchronous or an asynchronous function.
+        :type target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
         :param evaluators: A list of SafetyEvaluator.
         :type evaluators: List[_SafetyEvaluator]
         :param evaluation_name: The display name of the evaluation.
@@ -671,12 +757,17 @@ class _SafetyEvaluation:
         :param data_path: The path to the data file generated by the Simulator. If None, the Simulator will be run.
         :type data_path: Optional[Union[str, os.PathLike]]
         :param jailbreak_data_path: The path to the data file generated by the Simulator for the jailbreak scenario. If None, the DirectAttackSimulator will be run.
         :type jailbreak_data_path: Optional[Union[str, os.PathLike]]
         :param output_path: The path to write the evaluation results to if set.
         :type output_path: Optional[Union[str, os.PathLike]]
+        :param data_paths: A dictionary of data paths to evaluate. If None, the Simulator will be run.
+        :type data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]]
+        :param randomization_seed: The seed used to randomize prompt selection. If unset, the system's default seed is used.
+        :type randomization_seed: Optional[int]
+        :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
+        :type concurrent_async_tasks: Optional[int]
         '''
         ## Log inputs
-        self.logger.info(f"User inputs: evaluators={evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario}, conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}")
+        self.logger.info(f"User inputs: evaluators={evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario}, conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}")
 
         ## Validate arguments
         self._validate_inputs(
@@ -706,6 +797,7 @@ class _SafetyEvaluation:
                 tasks=tasks,
                 source_text=source_text,
                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
+                randomization_seed=randomization_seed,
             )
         elif data_path:
             data_paths = {Path(data_path).stem: data_path}

azure/ai/evaluation/_version.py
@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version
 
-VERSION = "1.5.0"
+VERSION = "1.7.0"

azure/ai/evaluation/red_team/_attack_strategy.py
@@ -42,4 +42,4 @@ class AttackStrategy(Enum):
                 raise ValueError("All items must be instances of AttackStrategy")
         if len(items) > 2:
             raise ValueError("Composed strategies must have at most 2 items")
-        return items
+        return items