azure-ai-evaluation 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +9 -16
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +5 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +159 -29
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +80 -2
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +4 -4
- azure/ai/evaluation/_eval_mapping.py +71 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +17 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +372 -105
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +120 -7
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +9 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +2 -2
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +8 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +7 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/{_red_team/_utils → _legacy/_common}/__init__.py +1 -1
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_objective_generator.py +3 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_strategy.py +4 -1
- azure/ai/evaluation/{_red_team → red_team}/_red_team.py +885 -481
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/constants.py +2 -1
- azure/ai/evaluation/{_red_team → red_team}/_utils/formatting_utils.py +23 -22
- azure/ai/evaluation/{_red_team → red_team}/_utils/logging_utils.py +1 -1
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/strategy_utils.py +9 -5
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +36 -2
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +148 -80
- azure/ai/evaluation/_red_team/_red_team_result.py +0 -246
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_callback_chat_target.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_default_converter.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team/_utils}/__init__.py +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
|
@@ -2,20 +2,27 @@
|
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
|
|
5
|
+
import asyncio
|
|
5
6
|
import re
|
|
7
|
+
|
|
8
|
+
from logging import Logger
|
|
6
9
|
from os import PathLike
|
|
7
10
|
from pathlib import Path
|
|
8
|
-
from typing import Any, AsyncGenerator, Dict, Final, List, Mapping, Optional, Sequence, Tuple, Union, cast
|
|
11
|
+
from typing import Any, AsyncGenerator, Awaitable, Dict, Final, List, Mapping, Optional, Sequence, Tuple, Union, cast
|
|
9
12
|
|
|
10
|
-
from openai import AsyncAzureOpenAI, AsyncOpenAI, NotGiven
|
|
13
|
+
from openai import AsyncAzureOpenAI, AsyncOpenAI, NotGiven, OpenAIError
|
|
14
|
+
from openai.lib.azure import AsyncAzureADTokenProvider
|
|
15
|
+
from azure.core.credentials import TokenCredential
|
|
16
|
+
from azure.core.credentials_async import AsyncTokenCredential
|
|
11
17
|
|
|
12
18
|
from azure.ai.evaluation._exceptions import ErrorTarget
|
|
13
|
-
from azure.ai.evaluation._constants import DefaultOpenEncoding
|
|
19
|
+
from azure.ai.evaluation._constants import DefaultOpenEncoding, TokenScope
|
|
14
20
|
from azure.ai.evaluation._legacy.prompty._exceptions import (
|
|
15
21
|
InvalidInputError,
|
|
16
22
|
PromptyException,
|
|
17
23
|
MissingRequiredInputError,
|
|
18
24
|
NotSupportedError,
|
|
25
|
+
WrappedOpenAIError,
|
|
19
26
|
)
|
|
20
27
|
from azure.ai.evaluation._legacy.prompty._connection import AzureOpenAIConnection, Connection, OpenAIConnection
|
|
21
28
|
from azure.ai.evaluation._legacy.prompty._yaml_utils import load_yaml_string
|
|
@@ -25,10 +32,14 @@ from azure.ai.evaluation._legacy.prompty._utils import (
|
|
|
25
32
|
OpenAIChatResponseType,
|
|
26
33
|
build_messages,
|
|
27
34
|
format_llm_response,
|
|
35
|
+
openai_error_retryable,
|
|
28
36
|
prepare_open_ai_request_params,
|
|
29
37
|
resolve_references,
|
|
30
38
|
update_dict_recursively,
|
|
31
39
|
)
|
|
40
|
+
from azure.ai.evaluation._constants import DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
|
|
41
|
+
from azure.ai.evaluation._legacy._common._logging import get_logger
|
|
42
|
+
from azure.ai.evaluation._legacy._common._async_token_provider import AsyncAzureTokenProvider
|
|
32
43
|
|
|
33
44
|
|
|
34
45
|
PROMPTY_EXTENSION: Final[str] = ".prompty"
|
|
@@ -124,10 +135,24 @@ class AsyncPrompty:
|
|
|
124
135
|
def __init__(
|
|
125
136
|
self,
|
|
126
137
|
path: Union[str, PathLike],
|
|
138
|
+
*,
|
|
139
|
+
logger: Optional[Logger] = None,
|
|
140
|
+
token_credential: Optional[Union[TokenCredential, AsyncTokenCredential]] = None,
|
|
141
|
+
is_reasoning_model: bool = False,
|
|
127
142
|
**kwargs: Any,
|
|
128
143
|
):
|
|
129
144
|
path = Path(path)
|
|
130
145
|
configs, self._template = self._parse_prompty(path)
|
|
146
|
+
|
|
147
|
+
if is_reasoning_model:
|
|
148
|
+
parameters = configs.get("model", {}).get("parameters", {})
|
|
149
|
+
if "max_tokens" in parameters:
|
|
150
|
+
parameters.pop("max_tokens", None)
|
|
151
|
+
parameters["max_completion_tokens"] = DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
|
|
152
|
+
# Remove unsupported parameters for reasoning models
|
|
153
|
+
for key in ["temperature", "top_p", "presence_penalty", "frequency_penalty"]:
|
|
154
|
+
parameters.pop(key, None)
|
|
155
|
+
|
|
131
156
|
configs = resolve_references(configs, base_path=path.parent)
|
|
132
157
|
configs = update_dict_recursively(configs, resolve_references(kwargs, base_path=path.parent))
|
|
133
158
|
|
|
@@ -142,6 +167,9 @@ class AsyncPrompty:
|
|
|
142
167
|
self._inputs: Dict[str, Any] = configs.get("inputs", {})
|
|
143
168
|
self._outputs: Dict[str, Any] = configs.get("outputs", {})
|
|
144
169
|
self._name: str = configs.get("name", path.stem)
|
|
170
|
+
self._logger = logger or get_logger(__name__)
|
|
171
|
+
self._token_credential: Union[TokenCredential, AsyncTokenCredential] = \
|
|
172
|
+
token_credential or AsyncAzureTokenProvider()
|
|
145
173
|
|
|
146
174
|
@property
|
|
147
175
|
def path(self) -> Path:
|
|
@@ -234,9 +262,6 @@ class AsyncPrompty:
|
|
|
234
262
|
|
|
235
263
|
return resolved_inputs
|
|
236
264
|
|
|
237
|
-
# TODO ralphe: error handling
|
|
238
|
-
# @trace
|
|
239
|
-
# @handle_openai_error()
|
|
240
265
|
async def __call__( # pylint: disable=docstring-keyword-should-match-keyword-only
|
|
241
266
|
self,
|
|
242
267
|
**kwargs: Any,
|
|
@@ -257,7 +282,7 @@ class AsyncPrompty:
|
|
|
257
282
|
messages = build_messages(prompt=self._template, working_dir=self.path.parent, **inputs)
|
|
258
283
|
params = prepare_open_ai_request_params(self._model, messages)
|
|
259
284
|
|
|
260
|
-
timeout:
|
|
285
|
+
timeout: Optional[float] = None
|
|
261
286
|
if timeout_val := cast(Any, kwargs.get("timeout", None)):
|
|
262
287
|
timeout = float(timeout_val)
|
|
263
288
|
|
|
@@ -273,6 +298,9 @@ class AsyncPrompty:
|
|
|
273
298
|
azure_deployment=connection.azure_deployment,
|
|
274
299
|
api_version=connection.api_version,
|
|
275
300
|
max_retries=max_retries,
|
|
301
|
+
azure_ad_token_provider=(self.get_token_provider(self._token_credential)
|
|
302
|
+
if not connection.api_key
|
|
303
|
+
else None),
|
|
276
304
|
)
|
|
277
305
|
elif isinstance(connection, OpenAIConnection):
|
|
278
306
|
api_client = AsyncOpenAI(
|
|
@@ -286,8 +314,10 @@ class AsyncPrompty:
|
|
|
286
314
|
f"'{type(connection).__name__}' is not a supported connection type.", target=ErrorTarget.EVAL_RUN
|
|
287
315
|
)
|
|
288
316
|
|
|
289
|
-
response: OpenAIChatResponseType = await
|
|
290
|
-
|
|
317
|
+
response: OpenAIChatResponseType = await self._send_with_retries(
|
|
318
|
+
api_client=api_client,
|
|
319
|
+
params=params,
|
|
320
|
+
timeout=timeout,
|
|
291
321
|
)
|
|
292
322
|
|
|
293
323
|
return await format_llm_response(
|
|
@@ -311,3 +341,83 @@ class AsyncPrompty:
|
|
|
311
341
|
inputs = self._resolve_inputs(kwargs)
|
|
312
342
|
messages = build_messages(prompt=self._template, working_dir=self.path.parent, **inputs)
|
|
313
343
|
return messages
|
|
344
|
+
|
|
345
|
+
async def _send_with_retries(
    self,
    api_client: Union[AsyncAzureOpenAI, AsyncOpenAI],
    params: Mapping[str, Any],
    timeout: Optional[float],
    max_retries: int = 10,
    max_entity_retries: int = 3,
) -> OpenAIChatResponseType:
    """Issue a chat-completions request, retrying on retryable OpenAI errors.

    Each failure is classified by ``openai_error_retryable``, which also supplies the
    delay to sleep before the next attempt. Once ``max_retries`` failures have been
    seen, or the error is classified as not retryable, the failure is logged and
    re-raised wrapped in ``WrappedOpenAIError``.

    :param Union[AsyncAzureOpenAI, AsyncOpenAI] api_client: The OpenAI client.
    :param Mapping[str, Any] params: Keyword arguments forwarded to ``chat.completions.create``.
    :param Optional[float] timeout: Per-request timeout; a falsy value leaves the client default.
    :param int max_retries: The maximum number of retries.
    :param int max_entity_retries: The maximum number of retries for entity errors.
    :return: The response from OpenAI.
    :rtype: OpenAIChatResponseType
    :raises WrappedOpenAIError: If the request fails and will not be retried.
    """
    sdk_name: str = api_client.__class__.__name__
    scoped_client: Union[AsyncAzureOpenAI, AsyncOpenAI] = api_client.with_options(timeout=timeout or NotGiven())

    # Single-element list so openai_error_retryable can update the counter in place.
    entity_attempts: List[int] = [0]
    attempt: int = 0
    backoff: Optional[float] = None

    while True:
        try:
            # No wait before the first attempt; later attempts sleep for the computed delay.
            if backoff:
                await asyncio.sleep(backoff)

            return await scoped_client.chat.completions.create(**params)
        except OpenAIError as error:
            can_retry: bool = False
            if attempt < max_retries:
                can_retry, backoff = openai_error_retryable(error, attempt, entity_attempts, max_entity_retries)

            if not can_retry:
                self._logger.exception(
                    "[%d/%d] %s request failed. %s: %s",
                    attempt,
                    max_retries,
                    sdk_name,
                    type(error).__name__,
                    str(error),
                )
                raise WrappedOpenAIError(error=error) from error

            self._logger.warning(
                "[%d/%d] %s request failed. %s: %s. Retrying in %f seconds.",
                attempt,
                max_retries,
                sdk_name,
                type(error).__name__,
                str(error),
                backoff or 0.0,
                exc_info=True,
            )

        attempt += 1
|
|
408
|
+
|
|
409
|
+
@staticmethod
|
|
410
|
+
def get_token_provider(cred: Union[TokenCredential, AsyncTokenCredential]) -> AsyncAzureADTokenProvider:
    """Build an async token provider backed by the given credential.

    The returned coroutine function requests a token for
    ``TokenScope.COGNITIVE_SERVICES_MANAGEMENT`` every time it is called and resolves
    to the token string. A provider is always returned; this function never returns None.

    :param Union[TokenCredential, AsyncTokenCredential] cred: The Azure authentication credential.
    :return: A zero-argument coroutine function that resolves to the access token string.
    :rtype: AsyncAzureADTokenProvider
    """

    async def _wrapper() -> str:
        token = cred.get_token(TokenScope.COGNITIVE_SERVICES_MANAGEMENT)
        # Async credentials hand back an awaitable; sync credentials return the token directly.
        # NOTE(review): a sync credential's get_token runs inline on the event loop here - confirm
        # that any blocking it does is acceptable for callers.
        if isinstance(token, Awaitable):
            token = await token
        return token.token

    return _wrapper
|
|
@@ -2,12 +2,15 @@
|
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
|
|
5
|
+
# cspell:ignore apng, retriable
|
|
6
|
+
|
|
5
7
|
import copy
|
|
6
|
-
from dataclasses import dataclass, is_dataclass, fields
|
|
7
8
|
import os
|
|
8
9
|
import re
|
|
9
10
|
import json
|
|
10
11
|
import base64
|
|
12
|
+
from dataclasses import dataclass, is_dataclass, fields
|
|
13
|
+
from logging import Logger
|
|
11
14
|
from pathlib import Path
|
|
12
15
|
from typing import (
|
|
13
16
|
Any,
|
|
@@ -30,6 +33,7 @@ from typing import (
|
|
|
30
33
|
from jinja2 import Template
|
|
31
34
|
from openai import AsyncStream
|
|
32
35
|
from openai.types.chat import ChatCompletion, ChatCompletionChunk
|
|
36
|
+
from openai import APIConnectionError, APIStatusError, APITimeoutError, OpenAIError
|
|
33
37
|
|
|
34
38
|
from azure.ai.evaluation._constants import DefaultOpenEncoding
|
|
35
39
|
from azure.ai.evaluation._legacy.prompty._exceptions import (
|
|
@@ -217,7 +221,7 @@ DEFAULT_IMAGE_MIME_TYPE: Final[str] = "image/*"
|
|
|
217
221
|
"""The mime type to use when we don't know the image type"""
|
|
218
222
|
|
|
219
223
|
FILE_EXT_TO_MIME: Final[Mapping[str, str]] = {
|
|
220
|
-
".apng": "image/apng",
|
|
224
|
+
".apng": "image/apng",
|
|
221
225
|
".avif": "image/avif",
|
|
222
226
|
".bmp": "image/bmp",
|
|
223
227
|
".gif": "image/gif",
|
|
@@ -542,4 +546,70 @@ async def format_llm_response(
|
|
|
542
546
|
return result
|
|
543
547
|
|
|
544
548
|
|
|
549
|
+
def openai_error_retryable(
    error: OpenAIError, retry: int, entity_retry: List[int], max_entity_retries: int
) -> Tuple[bool, float]:
    """
    Determines if an OpenAI error is retryable and the delay to wait before the next attempt.

    A delay is always returned: the service-provided ``Retry-After`` value when it is a usable
    number of seconds, otherwise an exponential backoff capped at 60 seconds.

    :param OpenAIError error: The error to handle
    :param int retry: The current retry count (0 means we're on the first attempt and no retries have been made)
    :param List[int] entity_retry: The current retry count for the unprocessable entity failures. This should be a
        list containing only 1 element to mimic pass by reference semantics. A value of 0 means we're on the
        first attempt and no retries have been made. The element is incremented in place on every HTTP 422.
    :param int max_entity_retries: The maximum number of retries to make for unprocessable entity failures
    :return: A tuple containing whether the error is retryable and the delay to use
    :rtype: Tuple[bool, float]
    """

    # Using https://platform.openai.com/docs/guides/error-codes/api-errors#python-library-error-types as a reference

    # Any error type not matched below is treated as not retryable.
    should_retry: bool = False
    delay: Optional[float] = None

    if isinstance(error, APIConnectionError):
        retriable_error_messages: Sequence[str] = [
            "connection aborted",
            # issue 2296
            "server disconnected without sending a response",
        ]
        should_retry = (
            isinstance(error, APITimeoutError)  # APITimeoutError is a subclass of APIConnectionError
            or str(error).lower() in retriable_error_messages
            or str(error.__cause__).lower() in retriable_error_messages
        )
    elif isinstance(error, APIStatusError):
        status_code: int = error.response.status_code
        if status_code == 422:
            # As per the original legacy code, UnprocessableEntityError (HTTP 422) should be handled differently
            # with a smaller retry count, as retrying more may not be beneficial.
            should_retry = entity_retry[0] < max_entity_retries
            entity_retry[0] += 1
        elif status_code == 429:
            # Two types, one is you are throttled and so should retry after a delay, the other is you have exceeded
            # your quota and should not retry. The comparison is case-insensitive and tolerates a missing type.
            should_retry = (error.type or "").lower() != "insufficient_quota"
        else:
            should_retry = status_code >= 500

        # Use what the service tells us to use for the delay if it's provided
        if should_retry:
            delay_str = error.response.headers.get("Retry-After", None)
            if delay_str is not None:
                try:
                    requested = float(delay_str)
                except (TypeError, ValueError):
                    # Retry-After may also be an HTTP-date; fall back to backoff instead of crashing.
                    requested = 0.0
                # Rejects zero, negative, infinite and NaN values (NaN fails both comparisons).
                if 0.0 < requested < float("inf"):
                    delay = requested

    # Use exponential backoff for retries if the service doesn't provide a delay
    if not delay:
        delay = min(60, 2 + 2**retry)

    return (should_retry, delay)
|
|
613
|
+
|
|
614
|
+
|
|
545
615
|
# endregion
|
|
@@ -6,9 +6,10 @@ from enum import Enum
|
|
|
6
6
|
import os
|
|
7
7
|
import inspect
|
|
8
8
|
import logging
|
|
9
|
+
import asyncio
|
|
9
10
|
from datetime import datetime
|
|
10
11
|
from azure.ai.evaluation._common._experimental import experimental
|
|
11
|
-
from typing import Any, Callable, Dict, List, Optional, Union, cast
|
|
12
|
+
from typing import Any, Callable, Dict, List, Optional, Union, cast, Coroutine, TypeVar, Awaitable
|
|
12
13
|
from azure.ai.evaluation._common.math import list_mean_nan_safe
|
|
13
14
|
from azure.ai.evaluation._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
|
|
14
15
|
from azure.ai.evaluation._evaluators import (
|
|
@@ -20,6 +21,8 @@ from azure.ai.evaluation._evaluators import (
|
|
|
20
21
|
_fluency,
|
|
21
22
|
_xpia,
|
|
22
23
|
_coherence,
|
|
24
|
+
_code_vulnerability,
|
|
25
|
+
_ungrounded_attributes,
|
|
23
26
|
)
|
|
24
27
|
from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
|
|
25
28
|
from azure.ai.evaluation._evaluate import _evaluate
|
|
@@ -31,7 +34,7 @@ from azure.ai.evaluation.simulator import (
|
|
|
31
34
|
AdversarialScenario,
|
|
32
35
|
AdversarialScenarioJailbreak,
|
|
33
36
|
IndirectAttackSimulator,
|
|
34
|
-
DirectAttackSimulator
|
|
37
|
+
DirectAttackSimulator,
|
|
35
38
|
)
|
|
36
39
|
from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario
|
|
37
40
|
from azure.ai.evaluation.simulator._utils import JsonLineList
|
|
@@ -71,6 +74,7 @@ class _SafetyEvaluator(Enum):
|
|
|
71
74
|
"""
|
|
72
75
|
|
|
73
76
|
CONTENT_SAFETY = "content_safety"
|
|
77
|
+
CODE_VULNERABILITY = "code_vulnerability"
|
|
74
78
|
GROUNDEDNESS = "groundedness"
|
|
75
79
|
PROTECTED_MATERIAL = "protected_material"
|
|
76
80
|
RELEVANCE = "relevance"
|
|
@@ -80,21 +84,22 @@ class _SafetyEvaluator(Enum):
|
|
|
80
84
|
INDIRECT_ATTACK = "indirect_attack"
|
|
81
85
|
DIRECT_ATTACK = "direct_attack"
|
|
82
86
|
ECI = "eci"
|
|
87
|
+
UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
|
|
83
88
|
|
|
84
89
|
|
|
85
90
|
@experimental
|
|
86
91
|
class _SafetyEvaluation:
|
|
87
92
|
def __init__(
|
|
88
93
|
self,
|
|
89
|
-
azure_ai_project: dict,
|
|
94
|
+
azure_ai_project: Union[str, dict],
|
|
90
95
|
credential: TokenCredential,
|
|
91
96
|
model_config: Optional[Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]] = None,
|
|
92
97
|
):
|
|
93
98
|
"""
|
|
94
99
|
Initializes a SafetyEvaluation object.
|
|
95
100
|
|
|
96
|
-
:param azure_ai_project: A dictionary defining the Azure AI project. Required keys are 'subscription_id', 'resource_group_name', and 'project_name'.
|
|
97
|
-
:type azure_ai_project: Dict[str, str]
|
|
101
|
+
:param azure_ai_project: A string or dictionary defining the Azure AI project. Required keys are 'subscription_id', 'resource_group_name', and 'project_name'.
|
|
102
|
+
:type azure_ai_project: Union[str, Dict[str, str]]
|
|
98
103
|
:param credential: The credential for connecting to Azure AI project.
|
|
99
104
|
:type credential: ~azure.core.credentials.TokenCredential
|
|
100
105
|
:param model_config: A dictionary defining the configuration for the model. Acceptable types are AzureOpenAIModelConfiguration and OpenAIModelConfiguration.
|
|
@@ -106,8 +111,7 @@ class _SafetyEvaluation:
|
|
|
106
111
|
self.model_config = model_config
|
|
107
112
|
else:
|
|
108
113
|
self.model_config = None
|
|
109
|
-
validate_azure_ai_project(azure_ai_project)
|
|
110
|
-
self.azure_ai_project = AzureAIProject(**azure_ai_project)
|
|
114
|
+
self.azure_ai_project = validate_azure_ai_project(azure_ai_project)
|
|
111
115
|
self.credential = credential
|
|
112
116
|
self.logger = _setup_logger()
|
|
113
117
|
|
|
@@ -192,10 +196,17 @@ class _SafetyEvaluation:
|
|
|
192
196
|
context = latest_message.get("context", None)
|
|
193
197
|
latest_context = None
|
|
194
198
|
try:
|
|
199
|
+
is_async = self._is_async_function(target)
|
|
195
200
|
if self._check_target_returns_context(target):
|
|
196
|
-
|
|
201
|
+
if is_async:
|
|
202
|
+
response, latest_context = await target(query=application_input)
|
|
203
|
+
else:
|
|
204
|
+
response, latest_context = target(query=application_input)
|
|
197
205
|
else:
|
|
198
|
-
|
|
206
|
+
if is_async:
|
|
207
|
+
response = await target(query=application_input)
|
|
208
|
+
else:
|
|
209
|
+
response = target(query=application_input)
|
|
199
210
|
except Exception as e:
|
|
200
211
|
response = f"Something went wrong {e!s}"
|
|
201
212
|
|
|
@@ -372,6 +383,10 @@ class _SafetyEvaluation:
|
|
|
372
383
|
)
|
|
373
384
|
if evaluator == _SafetyEvaluator.ECI:
|
|
374
385
|
return _UnstableAdversarialScenario.ECI
|
|
386
|
+
if evaluator == _SafetyEvaluator.CODE_VULNERABILITY:
|
|
387
|
+
return AdversarialScenario.ADVERSARIAL_CODE_VULNERABILITY
|
|
388
|
+
if evaluator == _SafetyEvaluator.UNGROUNDED_ATTRIBUTES:
|
|
389
|
+
return AdversarialScenario.ADVERSARIAL_UNGROUNDED_ATTRIBUTES
|
|
375
390
|
if evaluator in [
|
|
376
391
|
_SafetyEvaluator.GROUNDEDNESS,
|
|
377
392
|
_SafetyEvaluator.RELEVANCE,
|
|
@@ -453,6 +468,14 @@ class _SafetyEvaluation:
|
|
|
453
468
|
evaluators_dict["eci"] = ECIEvaluator(
|
|
454
469
|
azure_ai_project=self.azure_ai_project, credential=self.credential
|
|
455
470
|
)
|
|
471
|
+
elif evaluator == _SafetyEvaluator.CODE_VULNERABILITY:
|
|
472
|
+
evaluators_dict["code_vulnerability"] = _code_vulnerability.CodeVulnerabilityEvaluator(
|
|
473
|
+
azure_ai_project=self.azure_ai_project, credential=self.credential
|
|
474
|
+
)
|
|
475
|
+
elif evaluator == _SafetyEvaluator.UNGROUNDED_ATTRIBUTES:
|
|
476
|
+
evaluators_dict["ungrounded_attributes"] = _ungrounded_attributes.UngroundedAttributesEvaluator(
|
|
477
|
+
azure_ai_project=self.azure_ai_project, credential=self.credential
|
|
478
|
+
)
|
|
456
479
|
else:
|
|
457
480
|
msg = (
|
|
458
481
|
f"Invalid evaluator: {evaluator}. Supported evaluators are: {_SafetyEvaluator.__members__.values()}"
|
|
@@ -465,7 +488,7 @@ class _SafetyEvaluation:
|
|
|
465
488
|
blame=ErrorBlame.USER_ERROR,
|
|
466
489
|
)
|
|
467
490
|
return evaluators_dict
|
|
468
|
-
|
|
491
|
+
|
|
469
492
|
@staticmethod
|
|
470
493
|
def _check_target_returns_context(target: Callable) -> bool:
|
|
471
494
|
"""
|
|
@@ -478,6 +501,15 @@ class _SafetyEvaluation:
|
|
|
478
501
|
ret_type = sig.return_annotation
|
|
479
502
|
if ret_type == inspect.Signature.empty:
|
|
480
503
|
return False
|
|
504
|
+
|
|
505
|
+
# Check for Coroutine/Awaitable return types for async functions
|
|
506
|
+
origin = getattr(ret_type, "__origin__", None)
|
|
507
|
+
if origin is not None and (origin is Coroutine or origin is Awaitable):
|
|
508
|
+
args = getattr(ret_type, "__args__", None)
|
|
509
|
+
if args and len(args) > 0:
|
|
510
|
+
# For async functions, check the actual return type inside the Coroutine
|
|
511
|
+
ret_type = args[-1]
|
|
512
|
+
|
|
481
513
|
if ret_type is tuple:
|
|
482
514
|
return True
|
|
483
515
|
return False
|
|
@@ -494,13 +526,33 @@ class _SafetyEvaluation:
|
|
|
494
526
|
ret_type = sig.return_annotation
|
|
495
527
|
if ret_type == inspect.Signature.empty:
|
|
496
528
|
return False
|
|
529
|
+
|
|
530
|
+
# Check for Coroutine/Awaitable return types for async functions
|
|
531
|
+
origin = getattr(ret_type, "__origin__", None)
|
|
532
|
+
if origin is not None and (origin is Coroutine or origin is Awaitable):
|
|
533
|
+
args = getattr(ret_type, "__args__", None)
|
|
534
|
+
if args and len(args) > 0:
|
|
535
|
+
# For async functions, check the actual return type inside the Coroutine
|
|
536
|
+
ret_type = args[-1]
|
|
537
|
+
|
|
497
538
|
if ret_type is str:
|
|
498
539
|
return True
|
|
499
540
|
return False
|
|
500
541
|
|
|
501
|
-
|
|
502
542
|
@staticmethod
|
|
503
|
-
def
|
|
543
|
+
def _is_async_function(target: Callable) -> bool:
|
|
544
|
+
"""
|
|
545
|
+
Checks if the target function is an async function.
|
|
546
|
+
|
|
547
|
+
:param target: The target function to check.
|
|
548
|
+
:type target: Callable
|
|
549
|
+
:return: True if the target function is async, False otherwise.
|
|
550
|
+
:rtype: bool
|
|
551
|
+
"""
|
|
552
|
+
return asyncio.iscoroutinefunction(target)
|
|
553
|
+
|
|
554
|
+
@staticmethod
|
|
555
|
+
def _check_target_is_callback(target: Callable) -> bool:
|
|
504
556
|
sig = inspect.signature(target)
|
|
505
557
|
param_names = list(sig.parameters.keys())
|
|
506
558
|
return 'messages' in param_names and 'stream' in param_names and 'session_state' in param_names and 'context' in param_names
|
|
@@ -560,7 +612,28 @@ class _SafetyEvaluation:
|
|
|
560
612
|
category=ErrorCategory.INVALID_VALUE,
|
|
561
613
|
blame=ErrorBlame.USER_ERROR,
|
|
562
614
|
)
|
|
563
|
-
|
|
615
|
+
|
|
616
|
+
if _SafetyEvaluator.CODE_VULNERABILITY in evaluators and num_turns > 1:
|
|
617
|
+
self.logger.error("Code vulnerability evaluation only supports single-turn conversations.")
|
|
618
|
+
msg = "Code vulnerability evaluation only supports single-turn conversations."
|
|
619
|
+
raise EvaluationException(
|
|
620
|
+
message=msg,
|
|
621
|
+
internal_message=msg,
|
|
622
|
+
target=ErrorTarget.UNKNOWN,
|
|
623
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
624
|
+
blame=ErrorBlame.USER_ERROR,
|
|
625
|
+
)
|
|
626
|
+
if _SafetyEvaluator.UNGROUNDED_ATTRIBUTES in evaluators and num_turns > 1:
|
|
627
|
+
self.logger.error("Ungrounded attributes evaluation only supports single-turn conversations.")
|
|
628
|
+
msg = "Ungrounded attributes evaluation only supports single-turn conversations."
|
|
629
|
+
raise EvaluationException(
|
|
630
|
+
message=msg,
|
|
631
|
+
internal_message=msg,
|
|
632
|
+
target=ErrorTarget.UNKNOWN,
|
|
633
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
634
|
+
blame=ErrorBlame.USER_ERROR,
|
|
635
|
+
)
|
|
636
|
+
|
|
564
637
|
if _SafetyEvaluator.CONTENT_SAFETY in evaluators and scenario and num_turns > 1 and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION:
|
|
565
638
|
self.logger.error(f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn.")
|
|
566
639
|
msg = f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
|
|
@@ -630,7 +703,7 @@ class _SafetyEvaluation:
|
|
|
630
703
|
|
|
631
704
|
async def __call__(
|
|
632
705
|
self,
|
|
633
|
-
target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
706
|
+
target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
634
707
|
evaluators: List[_SafetyEvaluator] = [],
|
|
635
708
|
evaluation_name: Optional[str] = None,
|
|
636
709
|
num_turns : int = 1,
|
|
@@ -644,12 +717,12 @@ class _SafetyEvaluation:
|
|
|
644
717
|
jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
|
|
645
718
|
output_path: Optional[Union[str, os.PathLike]] = None,
|
|
646
719
|
data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None
|
|
647
|
-
) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
|
|
720
|
+
) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
|
|
648
721
|
'''
|
|
649
722
|
Evaluates the target function based on the provided parameters.
|
|
650
723
|
|
|
651
|
-
:param target: The target function to call during the evaluation.
|
|
652
|
-
:type target: Callable
|
|
724
|
+
:param target: The target function to call during the evaluation. This can be a synchronous or asynchronous function.
|
|
725
|
+
:type target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
|
|
653
726
|
:param evaluators: A list of SafetyEvaluator.
|
|
654
727
|
:type evaluators: List[_SafetyEvaluator]
|
|
655
728
|
:param evaluation_name: The display name of the evaluation.
|
azure/ai/evaluation/_version.py
CHANGED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from ._red_team import RedTeam
|
|
7
|
+
from ._attack_strategy import AttackStrategy
|
|
8
|
+
from ._attack_objective_generator import RiskCategory
|
|
9
|
+
from ._red_team_result import RedTeamResult
|
|
10
|
+
except ImportError:
|
|
11
|
+
print("[INFO] Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`.")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"RedTeam",
|
|
16
|
+
"AttackStrategy",
|
|
17
|
+
"RiskCategory",
|
|
18
|
+
"RedTeamResult",
|
|
19
|
+
]
|
|
@@ -7,7 +7,10 @@ import os
|
|
|
7
7
|
from enum import Enum
|
|
8
8
|
from typing import List, Optional, Dict, Any
|
|
9
9
|
from pathlib import Path
|
|
10
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
10
11
|
|
|
12
|
+
|
|
13
|
+
@experimental
|
|
11
14
|
class RiskCategory(str, Enum):
|
|
12
15
|
"""Risk categories for attack objectives."""
|
|
13
16
|
HateUnfairness = "hate_unfairness"
|
|
@@ -3,7 +3,10 @@
|
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
from enum import Enum
|
|
5
5
|
from typing import List
|
|
6
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
6
7
|
|
|
8
|
+
|
|
9
|
+
@experimental
|
|
7
10
|
class AttackStrategy(Enum):
|
|
8
11
|
"""Strategies for attacks."""
|
|
9
12
|
EASY = "easy"
|
|
@@ -39,4 +42,4 @@ class AttackStrategy(Enum):
|
|
|
39
42
|
raise ValueError("All items must be instances of AttackStrategy")
|
|
40
43
|
if len(items) > 2:
|
|
41
44
|
raise ValueError("Composed strategies must have at most 2 items")
|
|
42
|
-
return items
|
|
45
|
+
return items
|