azure-ai-evaluation 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +10 -0
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +7 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +165 -34
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +79 -1
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +73 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
- azure/ai/evaluation/_evaluate/_utils.py +117 -4
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +976 -546
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_legacy/prompty/_prompty.py CHANGED

@@ -2,20 +2,27 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import asyncio
 import re
+
+from logging import Logger
 from os import PathLike
 from pathlib import Path
-from typing import Any, AsyncGenerator, Dict, Final, List, Mapping, Optional, Sequence, Tuple, Union, cast
+from typing import Any, AsyncGenerator, Awaitable, Dict, Final, List, Mapping, Optional, Sequence, Tuple, Union, cast
 
-from openai import AsyncAzureOpenAI, AsyncOpenAI, NotGiven
+from openai import AsyncAzureOpenAI, AsyncOpenAI, NotGiven, OpenAIError
+from openai.lib.azure import AsyncAzureADTokenProvider
+from azure.core.credentials import TokenCredential
+from azure.core.credentials_async import AsyncTokenCredential
 
 from azure.ai.evaluation._exceptions import ErrorTarget
-from azure.ai.evaluation._constants import DefaultOpenEncoding
+from azure.ai.evaluation._constants import DefaultOpenEncoding, TokenScope
 from azure.ai.evaluation._legacy.prompty._exceptions import (
     InvalidInputError,
     PromptyException,
     MissingRequiredInputError,
     NotSupportedError,
+    WrappedOpenAIError,
 )
 from azure.ai.evaluation._legacy.prompty._connection import AzureOpenAIConnection, Connection, OpenAIConnection
 from azure.ai.evaluation._legacy.prompty._yaml_utils import load_yaml_string

@@ -25,10 +32,14 @@ from azure.ai.evaluation._legacy.prompty._utils import (
     OpenAIChatResponseType,
     build_messages,
     format_llm_response,
+    openai_error_retryable,
     prepare_open_ai_request_params,
     resolve_references,
     update_dict_recursively,
 )
+from azure.ai.evaluation._constants import DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
+from azure.ai.evaluation._legacy._common._logging import get_logger
+from azure.ai.evaluation._legacy._common._async_token_provider import AsyncAzureTokenProvider
 
 
 PROMPTY_EXTENSION: Final[str] = ".prompty"
@@ -124,10 +135,24 @@ class AsyncPrompty:
     def __init__(
         self,
         path: Union[str, PathLike],
+        *,
+        logger: Optional[Logger] = None,
+        token_credential: Optional[Union[TokenCredential, AsyncTokenCredential]] = None,
+        is_reasoning_model: bool = False,
         **kwargs: Any,
     ):
         path = Path(path)
         configs, self._template = self._parse_prompty(path)
+
+        if is_reasoning_model:
+            parameters = configs.get("model", {}).get("parameters", {})
+            if "max_tokens" in parameters:
+                parameters.pop("max_tokens", None)
+                parameters["max_completion_tokens"] = DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
+            # Remove unsupported parameters for reasoning models
+            for key in ["temperature", "top_p", "presence_penalty", "frequency_penalty"]:
+                parameters.pop(key, None)
+
         configs = resolve_references(configs, base_path=path.parent)
         configs = update_dict_recursively(configs, resolve_references(kwargs, base_path=path.parent))
 
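The new is_reasoning_model flag rewrites the prompty's model parameters before any request is built: max_tokens becomes max_completion_tokens, and the sampling knobs that reasoning models reject are dropped. A minimal standalone sketch of that rewrite; the constant's value here is an assumed placeholder (the real one lives in azure.ai.evaluation._constants):

from typing import Any, Dict

DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS = 60000  # assumed placeholder value

def rewrite_for_reasoning_model(parameters: Dict[str, Any]) -> Dict[str, Any]:
    # Reasoning models take max_completion_tokens instead of max_tokens...
    if "max_tokens" in parameters:
        parameters.pop("max_tokens", None)
        parameters["max_completion_tokens"] = DEFAULT_MAX_COMPLETION_TOKENS_REASONING_MODELS
    # ...and reject the classic sampling parameters outright.
    for key in ["temperature", "top_p", "presence_penalty", "frequency_penalty"]:
        parameters.pop(key, None)
    return parameters

print(rewrite_for_reasoning_model({"max_tokens": 512, "temperature": 0.7, "top_p": 0.95}))
# -> {'max_completion_tokens': 60000}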
@@ -142,6 +167,9 @@ class AsyncPrompty:
         self._inputs: Dict[str, Any] = configs.get("inputs", {})
         self._outputs: Dict[str, Any] = configs.get("outputs", {})
         self._name: str = configs.get("name", path.stem)
+        self._logger = logger or get_logger(__name__)
+        self._token_credential: Union[TokenCredential, AsyncTokenCredential] = \
+            token_credential or AsyncAzureTokenProvider()
 
     @property
     def path(self) -> Path:

@@ -234,9 +262,6 @@ class AsyncPrompty:
 
         return resolved_inputs
 
-    # TODO ralphe: error handling
-    # @trace
-    # @handle_openai_error()
     async def __call__(  # pylint: disable=docstring-keyword-should-match-keyword-only
         self,
         **kwargs: Any,
@@ -257,7 +282,7 @@ class AsyncPrompty:
         messages = build_messages(prompt=self._template, working_dir=self.path.parent, **inputs)
         params = prepare_open_ai_request_params(self._model, messages)
 
-        timeout:
+        timeout: Optional[float] = None
         if timeout_val := cast(Any, kwargs.get("timeout", None)):
             timeout = float(timeout_val)
 

@@ -273,6 +298,9 @@ class AsyncPrompty:
                 azure_deployment=connection.azure_deployment,
                 api_version=connection.api_version,
                 max_retries=max_retries,
+                azure_ad_token_provider=(self.get_token_provider(self._token_credential)
+                                         if not connection.api_key
+                                         else None),
             )
         elif isinstance(connection, OpenAIConnection):
             api_client = AsyncOpenAI(

@@ -286,8 +314,10 @@ class AsyncPrompty:
                 f"'{type(connection).__name__}' is not a supported connection type.", target=ErrorTarget.EVAL_RUN
             )
 
-        response: OpenAIChatResponseType = await
-
+        response: OpenAIChatResponseType = await self._send_with_retries(
+            api_client=api_client,
+            params=params,
+            timeout=timeout,
         )
 
         return await format_llm_response(
@@ -311,3 +341,83 @@ class AsyncPrompty:
         inputs = self._resolve_inputs(kwargs)
         messages = build_messages(prompt=self._template, working_dir=self.path.parent, **inputs)
         return messages
+
+    async def _send_with_retries(
+        self,
+        api_client: Union[AsyncAzureOpenAI, AsyncOpenAI],
+        params: Mapping[str, Any],
+        timeout: Optional[float],
+        max_retries: int = 10,
+        max_entity_retries: int = 3,
+    ) -> OpenAIChatResponseType:
+        """Send the request with retries.
+
+        :param Union[AsyncAzureOpenAI, AsyncOpenAI] api_client: The OpenAI client.
+        :param Mapping[str, Any] params: The request parameters.
+        :param Optional[float] timeout: The timeout for the request.
+        :param int max_retries: The maximum number of retries.
+        :param int max_entity_retries: The maximum number of retries for entity errors.
+        :return: The response from OpenAI.
+        :rtype: OpenAIChatResponseType
+        """
+
+        client_name: str = api_client.__class__.__name__
+        client: Union[AsyncAzureOpenAI, AsyncOpenAI] = api_client.with_options(timeout=timeout or NotGiven())
+
+        entity_retries: List[int] = [0]
+        should_retry: bool = True
+        retry: int = 0
+        delay: Optional[float] = None
+
+        while should_retry:
+            try:
+                if delay:
+                    await asyncio.sleep(delay)
+
+                response = await client.chat.completions.create(**params)
+                return response
+            except OpenAIError as error:
+                if retry >= max_retries:
+                    should_retry = False
+                else:
+                    should_retry, delay = openai_error_retryable(error, retry, entity_retries, max_entity_retries)
+
+                if should_retry:
+                    self._logger.warning(
+                        "[%d/%d] %s request failed. %s: %s. Retrying in %f seconds.",
+                        retry,
+                        max_retries,
+                        client_name,
+                        type(error).__name__,
+                        str(error),
+                        delay or 0.0,
+                        exc_info=True,
+                    )
+                else:
+                    self._logger.exception(
+                        "[%d/%d] %s request failed. %s: %s",
+                        retry,
+                        max_retries,
+                        client_name,
+                        type(error).__name__,
+                        str(error),
+                    )
+                    raise WrappedOpenAIError(error=error) from error
+
+                retry += 1
+
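_send_with_retries threads a single-element list (entity_retries) through openai_error_retryable so the helper can advance the HTTP 422 retry count across calls, mimicking pass-by-reference. A short standalone sketch of that idiom (bump_entity_retry is a hypothetical name for illustration):

from typing import List

def bump_entity_retry(counter: List[int], limit: int) -> bool:
    """Return whether another 422 retry is allowed, then advance the shared count."""
    allowed = counter[0] < limit
    counter[0] += 1  # the mutation is visible to the caller because the list is shared
    return allowed

entity_retries = [0]  # one element, as in the diff above
print([bump_entity_retry(entity_retries, 3) for _ in range(4)])
# -> [True, True, True, False]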
+    @staticmethod
+    def get_token_provider(cred: Union[TokenCredential, AsyncTokenCredential]) -> AsyncAzureADTokenProvider:
+        """Get the token provider for the prompty.
+
+        :param Union[TokenCredential, AsyncTokenCredential] cred: The Azure authentication credential.
+        :return: The token provider if a credential is provided, otherwise None.
+        :rtype: Optional[AsyncAzureADTokenProvider]
+        """
+        async def _wrapper() -> str:
+            token = cred.get_token(TokenScope.COGNITIVE_SERVICES_MANAGEMENT)
+            if isinstance(token, Awaitable):
+                token = await token
+            return token.token
+
+        return _wrapper
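get_token_provider adapts either a sync or an async Azure credential into the async callable that AsyncAzureOpenAI accepts as azure_ad_token_provider: call get_token, await the result if it is awaitable, and return the bare token string. A sketch of the same pattern built directly on azure-identity; the scope string is an assumption standing in for TokenScope.COGNITIVE_SERVICES_MANAGEMENT:

from azure.identity.aio import DefaultAzureCredential

SCOPE = "https://cognitiveservices.azure.com/.default"  # assumed stand-in for the TokenScope value

async def token_provider() -> str:
    # An async credential's get_token returns an AccessToken; .token is the
    # bearer string AsyncAzureOpenAI sends when no API key is configured.
    async with DefaultAzureCredential() as credential:
        access_token = await credential.get_token(SCOPE)
        return access_token.token

# Usage sketch: AsyncAzureOpenAI(..., azure_ad_token_provider=token_provider)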
azure/ai/evaluation/_legacy/prompty/_utils.py CHANGED

@@ -2,12 +2,15 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+# cspell:ignore apng, retriable
+
 import copy
-from dataclasses import dataclass, is_dataclass, fields
 import os
 import re
 import json
 import base64
+from dataclasses import dataclass, is_dataclass, fields
+from logging import Logger
 from pathlib import Path
 from typing import (
     Any,

@@ -30,6 +33,7 @@ from typing import (
 from jinja2 import Template
 from openai import AsyncStream
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai import APIConnectionError, APIStatusError, APITimeoutError, OpenAIError
 
 from azure.ai.evaluation._constants import DefaultOpenEncoding
 from azure.ai.evaluation._legacy.prompty._exceptions import (

@@ -217,7 +221,7 @@ DEFAULT_IMAGE_MIME_TYPE: Final[str] = "image/*"
 """The mime type to use when we don't know the image type"""
 
 FILE_EXT_TO_MIME: Final[Mapping[str, str]] = {
-    ".apng": "image/apng",
+    ".apng": "image/apng",
     ".avif": "image/avif",
     ".bmp": "image/bmp",
     ".gif": "image/gif",
@@ -542,4 +546,70 @@ async def format_llm_response(
     return result
 
 
+def openai_error_retryable(
+    error: OpenAIError, retry: int, entity_retry: List[int], max_entity_retries: int
+) -> Tuple[bool, float]:
+    """
+    Determines if an OpenAI error is retryable, and optionally determines the min retry delay to use.
+    If none is returned, the caller will determine the delay to use.
+
+    :param OpenAIError error: The error to handle
+    :param int retry: The current retry count (0 means we're on the first attempt and no retries have been made)
+    :param List[int] entity_retry: The current retry count for the unprocessable entity failures. This should be a
+        list containing only 1 element to mimic pass by reference semantics. A value of 0 means we're on the
+        first attempt and no retries have been made.
+    :param int max_entity_retries: The maximum number of retries to make for unprocessable entity failures
+    :return: A tuple containing whether the error is retryable and the min delay to use if any
+    :rtype: Tuple[bool, Optional[float]]
+    """
+
+    # Using https://platform.openai.com/docs/guides/error-codes/api-errors#python-library-error-types as a reference
+
+    should_retry: bool
+    delay: Optional[float] = None
+
+    if isinstance(error, APIConnectionError):
+        retriable_error_messages: Sequence[str] = [
+            "connection aborted",
+            # issue 2296
+            "server disconnected without sending a response",
+        ]
+        should_retry = (
+            isinstance(error, APITimeoutError)  # APITimeoutError is a subclass of APIConnectionError
+            or str(error).lower() in retriable_error_messages
+            or str(error.__cause__).lower() in retriable_error_messages
+        )
+    elif isinstance(error, APIStatusError):
+        status_code: int = error.response.status_code
+        if status_code == 422:
+            # As per the original legacy code, UnprocessableEntityError (HTTP 422) should be handled differently
+            # with a smaller retry count, as retrying more may not be beneficial.
+            should_retry = entity_retry[0] < max_entity_retries
+            entity_retry[0] += 1
+        elif status_code == 429:
+            # Two types, one is you are throttled and so should retry after a delay, the other is you have exceeded
+            # your quota and should not retry.
+            if (error.type or "").lower() == "insufficient_quota":
+                should_retry = False
+            else:
+                should_retry = True
+            should_retry = error.type != "insufficient_quota"
+        else:
+            should_retry = status_code >= 500
+
+        # Use what the service tells us to use for the delay if it's provided
+        if should_retry and not delay:
+            delay_str = error.response.headers.get("Retry-After", None)
+            if delay_str is not None:
+                delay = float(delay_str)
+    else:
+        should_retry = False
+
+    # Use exponential backoff for retries if the service doesn't provide a delay
+    if not delay:
+        delay = min(60, 2 + 2**retry)
+
+    return (should_retry, delay)
+
+
 # endregion
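When the service supplies no Retry-After header, the helper falls back to capped exponential backoff, delay = min(60, 2 + 2**retry). A quick sketch of the schedule that formula produces:

# Capped exponential backoff, as in openai_error_retryable above.
for retry in range(8):
    delay = min(60, 2 + 2 ** retry)
    print(f"retry {retry}: wait {delay}s")
# retries 0..7 -> 3, 4, 6, 10, 18, 34, 60, 60 seconds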
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py CHANGED

@@ -6,9 +6,10 @@ from enum import Enum
 import os
 import inspect
 import logging
+import asyncio
 from datetime import datetime
 from azure.ai.evaluation._common._experimental import experimental
-from typing import Any, Callable, Dict, List, Optional, Union, cast
+from typing import Any, Callable, Dict, List, Optional, Union, cast, Coroutine, TypeVar, Awaitable
 from azure.ai.evaluation._common.math import list_mean_nan_safe
 from azure.ai.evaluation._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
 from azure.ai.evaluation._evaluators import (

@@ -20,6 +21,8 @@ from azure.ai.evaluation._evaluators import (
     _fluency,
     _xpia,
     _coherence,
+    _code_vulnerability,
+    _ungrounded_attributes,
 )
 from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
 from azure.ai.evaluation._evaluate import _evaluate

@@ -31,7 +34,7 @@ from azure.ai.evaluation.simulator import (
     AdversarialScenario,
     AdversarialScenarioJailbreak,
     IndirectAttackSimulator,
-    DirectAttackSimulator
+    DirectAttackSimulator,
 )
 from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario
 from azure.ai.evaluation.simulator._utils import JsonLineList
@@ -71,6 +74,7 @@ class _SafetyEvaluator(Enum):
     """
 
     CONTENT_SAFETY = "content_safety"
+    CODE_VULNERABILITY = "code_vulnerability"
     GROUNDEDNESS = "groundedness"
     PROTECTED_MATERIAL = "protected_material"
     RELEVANCE = "relevance"

@@ -80,21 +84,22 @@ class _SafetyEvaluator(Enum):
     INDIRECT_ATTACK = "indirect_attack"
     DIRECT_ATTACK = "direct_attack"
     ECI = "eci"
+    UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
 
 
 @experimental
 class _SafetyEvaluation:
     def __init__(
         self,
-        azure_ai_project: dict,
+        azure_ai_project: Union[str, dict],
         credential: TokenCredential,
         model_config: Optional[Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]] = None,
     ):
         """
         Initializes a SafetyEvaluation object.
 
-        :param azure_ai_project: A dictionary defining the Azure AI project. Required keys are 'subscription_id', 'resource_group_name', and 'project_name'.
-        :type azure_ai_project: Dict[str, str]
+        :param azure_ai_project: A string or dictionary defining the Azure AI project. Required keys are 'subscription_id', 'resource_group_name', and 'project_name'.
+        :type azure_ai_project: Union[str, Dict[str, str]]
         :param credential: The credential for connecting to Azure AI project.
         :type credential: ~azure.core.credentials.TokenCredential
         :param model_config: A dictionary defining the configuration for the model. Acceptable types are AzureOpenAIModelConfiguration and OpenAIModelConfiguration.
@@ -106,8 +111,7 @@ class _SafetyEvaluation:
             self.model_config = model_config
         else:
             self.model_config = None
-        validate_azure_ai_project(azure_ai_project)
-        self.azure_ai_project = AzureAIProject(**azure_ai_project)
+        self.azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self.credential = credential
         self.logger = _setup_logger()
 

@@ -157,6 +161,8 @@ class _SafetyEvaluation:
         adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]] = None,
         source_text: Optional[str] = None,
         direct_attack: bool = False,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
     ) -> Dict[str, str]:
         """
         Generates synthetic conversations based on provided parameters.
@@ -192,10 +198,17 @@ class _SafetyEvaluation:
                 context = latest_message.get("context", None)
                 latest_context = None
                 try:
+                    is_async = self._is_async_function(target)
                     if self._check_target_returns_context(target):
-                        response, latest_context = target(query=application_input)
+                        if is_async:
+                            response, latest_context = await target(query=application_input)
+                        else:
+                            response, latest_context = target(query=application_input)
                     else:
-                        response = target(query=application_input)
+                        if is_async:
+                            response = await target(query=application_input)
+                        else:
+                            response = target(query=application_input)
                 except Exception as e:
                     response = f"Something went wrong {e!s}"
 

@@ -234,6 +247,8 @@ class _SafetyEvaluation:
                 conversation_turns=conversation_turns,
                 text=source_text,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks
             )
 
         # if DirectAttack, run DirectAttackSimulator

@@ -247,6 +262,8 @@ class _SafetyEvaluation:
                 max_conversation_turns=max_conversation_turns,
                 max_simulation_results=max_simulation_results,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks,
             )
             jailbreak_outputs = simulator_outputs["jailbreak"]
             simulator_outputs = simulator_outputs["regular"]

@@ -264,6 +281,7 @@ class _SafetyEvaluation:
                 num_queries=max_simulation_results,
                 target=callback,
                 text=source_text if source_text else "",
+                concurrent_async_tasks=concurrent_async_tasks,
             )
 
         ## Run AdversarialSimulator

@@ -279,6 +297,8 @@ class _SafetyEvaluation:
                 conversation_turns=conversation_turns,
                 target=callback,
                 text=source_text,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks
             )
 
         ## If no outputs are generated, raise an exception
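The callback now probes the target once with _is_async_function (asyncio.iscoroutinefunction under the hood) and awaits it only when needed, so synchronous and asynchronous app targets share one code path. A condensed standalone sketch of that dispatch, assuming a target that accepts a query keyword and returns a string:

import asyncio
from typing import Callable

async def call_target(target: Callable, query: str) -> str:
    # Await coroutine functions; call plain functions directly.
    if asyncio.iscoroutinefunction(target):
        return await target(query=query)
    return target(query=query)

def sync_app(query: str) -> str:
    return f"sync: {query}"

async def async_app(query: str) -> str:
    return f"async: {query}"

async def main() -> None:
    print(await call_target(sync_app, "hi"))   # sync: hi
    print(await call_target(async_app, "hi"))  # async: hi

asyncio.run(main())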
@@ -372,6 +392,10 @@ class _SafetyEvaluation:
             )
         if evaluator == _SafetyEvaluator.ECI:
             return _UnstableAdversarialScenario.ECI
+        if evaluator == _SafetyEvaluator.CODE_VULNERABILITY:
+            return AdversarialScenario.ADVERSARIAL_CODE_VULNERABILITY
+        if evaluator == _SafetyEvaluator.UNGROUNDED_ATTRIBUTES:
+            return AdversarialScenario.ADVERSARIAL_UNGROUNDED_ATTRIBUTES
         if evaluator in [
             _SafetyEvaluator.GROUNDEDNESS,
             _SafetyEvaluator.RELEVANCE,

@@ -453,6 +477,14 @@ class _SafetyEvaluation:
                 evaluators_dict["eci"] = ECIEvaluator(
                     azure_ai_project=self.azure_ai_project, credential=self.credential
                 )
+            elif evaluator == _SafetyEvaluator.CODE_VULNERABILITY:
+                evaluators_dict["code_vulnerability"] = _code_vulnerability.CodeVulnerabilityEvaluator(
+                    azure_ai_project=self.azure_ai_project, credential=self.credential
+                )
+            elif evaluator == _SafetyEvaluator.UNGROUNDED_ATTRIBUTES:
+                evaluators_dict["ungrounded_attributes"] = _ungrounded_attributes.UngroundedAttributesEvaluator(
+                    azure_ai_project=self.azure_ai_project, credential=self.credential
+                )
             else:
                 msg = (
                     f"Invalid evaluator: {evaluator}. Supported evaluators are: {_SafetyEvaluator.__members__.values()}"

@@ -465,7 +497,7 @@ class _SafetyEvaluation:
                     blame=ErrorBlame.USER_ERROR,
                 )
         return evaluators_dict
-
+
     @staticmethod
     def _check_target_returns_context(target: Callable) -> bool:
         """
@@ -478,6 +510,15 @@ class _SafetyEvaluation:
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
+
+        # Check for Coroutine/Awaitable return types for async functions
+        origin = getattr(ret_type, "__origin__", None)
+        if origin is not None and (origin is Coroutine or origin is Awaitable):
+            args = getattr(ret_type, "__args__", None)
+            if args and len(args) > 0:
+                # For async functions, check the actual return type inside the Coroutine
+                ret_type = args[-1]
+
         if ret_type is tuple:
             return True
         return False

@@ -494,13 +535,33 @@ class _SafetyEvaluation:
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
+
+        # Check for Coroutine/Awaitable return types for async functions
+        origin = getattr(ret_type, "__origin__", None)
+        if origin is not None and (origin is Coroutine or origin is Awaitable):
+            args = getattr(ret_type, "__args__", None)
+            if args and len(args) > 0:
+                # For async functions, check the actual return type inside the Coroutine
+                ret_type = args[-1]
+
         if ret_type is str:
             return True
         return False
 
-
     @staticmethod
-    def _check_target_is_callback(target: Callable) -> bool:
+    def _is_async_function(target: Callable) -> bool:
+        """
+        Checks if the target function is an async function.
+
+        :param target: The target function to check.
+        :type target: Callable
+        :return: True if the target function is async, False otherwise.
+        :rtype: bool
+        """
+        return asyncio.iscoroutinefunction(target)
+
+    @staticmethod
+    def _check_target_is_callback(target: Callable) -> bool:
         sig = inspect.signature(target)
         param_names = list(sig.parameters.keys())
         return 'messages' in param_names and 'stream' in param_names and 'session_state' in param_names and 'context' in param_names
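The return-annotation inspection only sees what the signature declares, so a target explicitly annotated as Coroutine[..., T] must be unwrapped to reach T. A standalone sketch of that unwrapping, using typing.get_origin/get_args in place of the raw __origin__/__args__ attributes the diff reads:

import collections.abc
import inspect
from typing import Any, Coroutine, get_args, get_origin

def target(query: str) -> Coroutine[Any, Any, tuple]:
    """Stands in for an async app target; only the annotation matters here."""
    raise NotImplementedError

ret_type = inspect.signature(target).return_annotation
if get_origin(ret_type) in (collections.abc.Coroutine, collections.abc.Awaitable):
    # Coroutine annotations are Coroutine[YieldType, SendType, ReturnType];
    # the value the caller awaits is the last type argument.
    ret_type = get_args(ret_type)[-1]
print(ret_type is tuple)  # True -> treated like a (response, context) target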
@@ -560,7 +621,28 @@ class _SafetyEvaluation:
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
-
+
+        if _SafetyEvaluator.CODE_VULNERABILITY in evaluators and num_turns > 1:
+            self.logger.error("Code vulnerability evaluation only supports single-turn conversations.")
+            msg = "Code vulnerability evaluation only supports single-turn conversations."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.UNKNOWN,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+        if _SafetyEvaluator.UNGROUNDED_ATTRIBUTES in evaluators and num_turns > 1:
+            self.logger.error("Ungrounded attributes evaluation only supports single-turn conversations.")
+            msg = "Ungrounded attributes evaluation only supports single-turn conversations."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.UNKNOWN,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
         if _SafetyEvaluator.CONTENT_SAFETY in evaluators and scenario and num_turns > 1 and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION:
             self.logger.error(f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn.")
             msg = f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
@@ -604,7 +686,9 @@ class _SafetyEvaluation:
         violence_defects = []
         sexual_defects = []
         self_harm_defects = []
-
+        jailbreak_rows = sorted(evaluation_result_dict[jailbreak_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+        regular_rows = sorted(evaluation_result_dict[regular_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+        for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
             hate_unfairness_defects.append(1 if (row[0]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
             violence_defects.append(1 if (row[0]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
             sexual_defects.append(1 if (row[0]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
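The jailbreak and regular result rows are now sorted by inputs.category so the zip pairs like with like; a pair then counts as a defect only when the jailbreak score crosses the severity threshold while the paired regular score does not. A toy sketch of that comparison; the threshold value is an assumed stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT:

THRESHOLD = 3  # assumed stand-in value

jailbreak_rows = [
    {"inputs.category": "violence", "outputs.content_safety.violence_score": 5},
    {"inputs.category": "sexual", "outputs.content_safety.violence_score": 1},
]
regular_rows = [
    {"inputs.category": "violence", "outputs.content_safety.violence_score": 1},
    {"inputs.category": "sexual", "outputs.content_safety.violence_score": 1},
]

defects = [
    1 if jb["outputs.content_safety.violence_score"] > THRESHOLD
    and not reg["outputs.content_safety.violence_score"] > THRESHOLD
    else 0
    for jb, reg in zip(jailbreak_rows, regular_rows)
]
print(sum(defects) / len(defects))  # 0.5 -> violence defect rate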
@@ -630,7 +714,7 @@ class _SafetyEvaluation:
 
     async def __call__(
         self,
-        target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
         evaluators: List[_SafetyEvaluator] = [],
         evaluation_name: Optional[str] = None,
         num_turns : int = 1,
@@ -643,13 +727,15 @@ class _SafetyEvaluation:
         data_path: Optional[Union[str, os.PathLike]] = None,
         jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
         output_path: Optional[Union[str, os.PathLike]] = None,
-        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None
+        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
     ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
         '''
         Evaluates the target function based on the provided parameters.
 
-        :param target: The target function to call during the evaluation.
-        :type target: Callable
+        :param target: The target function to call during the evaluation. This can be a synchronous or asynchronous function.
+        :type target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
         :param evaluators: A list of SafetyEvaluator.
         :type evaluators: List[_SafetyEvaluator]
         :param evaluation_name: The display name name of the evaluation.
@@ -671,12 +757,17 @@ class _SafetyEvaluation:
         :param data_path: The path to the data file generated by the Simulator. If None, the Simulator will be run.
         :type data_path: Optional[Union[str, os.PathLike]]
         :param jailbreak_data_path: The path to the data file generated by the Simulator for jailbreak scenario. If None, the DirectAttackSimulator will be run.
-        :type jailbreak_data_path: Optional[Union[str, os.PathLike]]
-        :param output_path: The path to write the evaluation results to if set.
+        :type jailbreak_data_path: Optional[Union[str, os.PathLike]] :param output_path: The path to write the evaluation results to if set.
         :type output_path: Optional[Union[str, os.PathLike]]
+        :param data_paths: A dictionary of data paths to evaluate. If None, the Simulator will be run.
+        :type data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]]
+        :param randomization_seed: The seed used to randomize prompt selection. If unset, the system's default seed is used.
+        :type randomization_seed: Optional[int]
+        :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
+        :type concurrent_async_tasks: Optional[int]
         '''
-        ## Log inputs
-        self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}")
+        ## Log inputs
+        self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}")
 
         ## Validate arguments
         self._validate_inputs(
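A hedged usage sketch of the extended __call__ signature; _SafetyEvaluation is an experimental, private class, so treat this as illustrative only, and note the target name and project identifiers are placeholders:

import asyncio
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation._safety_evaluation._safety_evaluation import (
    _SafetyEvaluation,
    _SafetyEvaluator,
)

def my_target(query: str) -> str:  # an async def target works too, per the new dispatch
    return "echo: " + query

async def main() -> None:
    evaluation = _SafetyEvaluation(
        azure_ai_project={  # placeholder identifiers
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<project>",
        },
        credential=DefaultAzureCredential(),
    )
    results = await evaluation(
        target=my_target,
        evaluators=[_SafetyEvaluator.CONTENT_SAFETY],
        randomization_seed=42,      # reproducible prompt selection
        concurrent_async_tasks=5,   # matches the new default
    )
    print(results)

asyncio.run(main())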
@@ -706,6 +797,7 @@ class _SafetyEvaluation:
                 tasks=tasks,
                 source_text=source_text,
                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
+                randomization_seed=randomization_seed,
             )
         elif data_path:
             data_paths = {Path(data_path).stem: data_path}
azure/ai/evaluation/_version.py CHANGED