azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
|
@@ -3,20 +3,16 @@
|
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
-
import inspect
|
|
7
6
|
import logging
|
|
8
7
|
import os
|
|
9
8
|
import time
|
|
10
9
|
from abc import ABC, abstractmethod
|
|
11
10
|
from enum import Enum
|
|
12
|
-
from typing import Optional, Union
|
|
11
|
+
from typing import Dict, Optional, Union
|
|
13
12
|
|
|
14
|
-
from azure.core.credentials import AccessToken, TokenCredential
|
|
15
13
|
from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
|
|
16
14
|
|
|
17
|
-
AZURE_TOKEN_REFRESH_INTERVAL =
|
|
18
|
-
os.getenv("AZURE_TOKEN_REFRESH_INTERVAL", "600")
|
|
19
|
-
) # token refresh interval in seconds
|
|
15
|
+
AZURE_TOKEN_REFRESH_INTERVAL = 600 # seconds
|
|
20
16
|
|
|
21
17
|
|
|
22
18
|
class TokenScope(Enum):
|
|
@@ -33,24 +29,24 @@ class APITokenManager(ABC):
|
|
|
33
29
|
:param auth_header: Authorization header prefix. Defaults to "Bearer"
|
|
34
30
|
:type auth_header: str
|
|
35
31
|
:param credential: Azure credential object
|
|
36
|
-
:type credential: Optional[
|
|
32
|
+
:type credential: Optional[Union[azure.identity.DefaultAzureCredential, azure.identity.ManagedIdentityCredential]
|
|
37
33
|
"""
|
|
38
34
|
|
|
39
35
|
def __init__(
|
|
40
36
|
self,
|
|
41
37
|
logger: logging.Logger,
|
|
42
38
|
auth_header: str = "Bearer",
|
|
43
|
-
credential: Optional[
|
|
39
|
+
credential: Optional[Union[DefaultAzureCredential, ManagedIdentityCredential]] = None,
|
|
44
40
|
) -> None:
|
|
45
41
|
self.logger = logger
|
|
46
42
|
self.auth_header = auth_header
|
|
47
|
-
self._lock
|
|
43
|
+
self._lock = None
|
|
48
44
|
if credential is not None:
|
|
49
45
|
self.credential = credential
|
|
50
46
|
else:
|
|
51
47
|
self.credential = self.get_aad_credential()
|
|
52
|
-
self.token
|
|
53
|
-
self.last_refresh_time
|
|
48
|
+
self.token = None
|
|
49
|
+
self.last_refresh_time = None
|
|
54
50
|
|
|
55
51
|
@property
|
|
56
52
|
def lock(self) -> asyncio.Lock:
|
|
@@ -77,26 +73,20 @@ class APITokenManager(ABC):
|
|
|
77
73
|
identity_client_id = os.environ.get("DEFAULT_IDENTITY_CLIENT_ID", None)
|
|
78
74
|
if identity_client_id is not None:
|
|
79
75
|
self.logger.info(f"Using DEFAULT_IDENTITY_CLIENT_ID: {identity_client_id}")
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
@abstractmethod
|
|
86
|
-
def get_token(self) -> str:
|
|
87
|
-
"""Async method to get the API token. Subclasses should implement this method.
|
|
88
|
-
|
|
89
|
-
:return: API token
|
|
90
|
-
:rtype: str
|
|
91
|
-
"""
|
|
76
|
+
credential = ManagedIdentityCredential(client_id=identity_client_id)
|
|
77
|
+
else:
|
|
78
|
+
self.logger.info("Environment variable DEFAULT_IDENTITY_CLIENT_ID is not set, using DefaultAzureCredential")
|
|
79
|
+
credential = DefaultAzureCredential()
|
|
80
|
+
return credential
|
|
92
81
|
|
|
93
82
|
@abstractmethod
|
|
94
|
-
async def
|
|
83
|
+
async def get_token(self) -> str:
|
|
95
84
|
"""Async method to get the API token. Subclasses should implement this method.
|
|
96
85
|
|
|
97
86
|
:return: API token
|
|
98
87
|
:rtype: str
|
|
99
88
|
"""
|
|
89
|
+
pass # pylint: disable=unnecessary-pass
|
|
100
90
|
|
|
101
91
|
|
|
102
92
|
class ManagedIdentityAPITokenManager(APITokenManager):
|
|
@@ -110,18 +100,12 @@ class ManagedIdentityAPITokenManager(APITokenManager):
|
|
|
110
100
|
:paramtype kwargs: Dict
|
|
111
101
|
"""
|
|
112
102
|
|
|
113
|
-
def __init__(
|
|
114
|
-
|
|
115
|
-
token_scope: TokenScope,
|
|
116
|
-
logger: logging.Logger,
|
|
117
|
-
*,
|
|
118
|
-
auth_header: str = "Bearer",
|
|
119
|
-
credential: Optional[TokenCredential] = None,
|
|
120
|
-
):
|
|
121
|
-
super().__init__(logger, auth_header=auth_header, credential=credential)
|
|
103
|
+
def __init__(self, token_scope: TokenScope, logger: logging.Logger, **kwargs: Dict):
|
|
104
|
+
super().__init__(logger, **kwargs)
|
|
122
105
|
self.token_scope = token_scope
|
|
123
106
|
|
|
124
|
-
|
|
107
|
+
# Bug 3353724: This get_token is sync method, but it is defined as async method in the base class
|
|
108
|
+
def get_token(self) -> str: # pylint: disable=invalid-overridden-method
|
|
125
109
|
"""Get the API token. If the token is not available or has expired, refresh the token.
|
|
126
110
|
|
|
127
111
|
:return: API token
|
|
@@ -138,31 +122,6 @@ class ManagedIdentityAPITokenManager(APITokenManager):
|
|
|
138
122
|
|
|
139
123
|
return self.token
|
|
140
124
|
|
|
141
|
-
async def get_token_async(self) -> str:
|
|
142
|
-
"""Get the API token synchronously. If the token is not available or has expired, refresh it.
|
|
143
|
-
|
|
144
|
-
:return: API token
|
|
145
|
-
:rtype: str
|
|
146
|
-
"""
|
|
147
|
-
if (
|
|
148
|
-
self.token is None
|
|
149
|
-
or self.last_refresh_time is None
|
|
150
|
-
or time.time() - self.last_refresh_time > AZURE_TOKEN_REFRESH_INTERVAL
|
|
151
|
-
):
|
|
152
|
-
self.last_refresh_time = time.time()
|
|
153
|
-
get_token_method = self.credential.get_token(self.token_scope.value)
|
|
154
|
-
if inspect.isawaitable(get_token_method):
|
|
155
|
-
# If it's awaitable, await it
|
|
156
|
-
token_response: AccessToken = await get_token_method
|
|
157
|
-
else:
|
|
158
|
-
# Otherwise, call it synchronously
|
|
159
|
-
token_response = get_token_method
|
|
160
|
-
|
|
161
|
-
self.token = token_response.token
|
|
162
|
-
self.logger.info("Refreshed Azure endpoint token.")
|
|
163
|
-
|
|
164
|
-
return self.token
|
|
165
|
-
|
|
166
125
|
|
|
167
126
|
class PlainTokenManager(APITokenManager):
|
|
168
127
|
"""Plain API Token Manager
|
|
@@ -175,18 +134,11 @@ class PlainTokenManager(APITokenManager):
|
|
|
175
134
|
:paramtype kwargs: Dict
|
|
176
135
|
"""
|
|
177
136
|
|
|
178
|
-
def __init__(
|
|
179
|
-
|
|
180
|
-
openapi_key
|
|
181
|
-
logger: logging.Logger,
|
|
182
|
-
*,
|
|
183
|
-
auth_header: str = "Bearer",
|
|
184
|
-
credential: Optional[TokenCredential] = None,
|
|
185
|
-
) -> None:
|
|
186
|
-
super().__init__(logger, auth_header=auth_header, credential=credential)
|
|
187
|
-
self.token: str = openapi_key
|
|
137
|
+
def __init__(self, openapi_key: str, logger: logging.Logger, **kwargs: Dict):
|
|
138
|
+
super().__init__(logger, **kwargs)
|
|
139
|
+
self.token = openapi_key
|
|
188
140
|
|
|
189
|
-
def get_token(self) -> str:
|
|
141
|
+
async def get_token(self) -> str:
|
|
190
142
|
"""Get the API token
|
|
191
143
|
|
|
192
144
|
:return: API token
|
|
@@ -6,14 +6,14 @@ import copy
|
|
|
6
6
|
import json
|
|
7
7
|
import time
|
|
8
8
|
import uuid
|
|
9
|
-
from typing import
|
|
9
|
+
from typing import Dict, List
|
|
10
10
|
|
|
11
|
-
from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
|
|
12
|
-
from azure.ai.evaluation._user_agent import USER_AGENT
|
|
13
11
|
from azure.core.exceptions import HttpResponseError
|
|
14
12
|
from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
|
|
15
13
|
|
|
16
|
-
from
|
|
14
|
+
from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
|
|
15
|
+
from azure.ai.evaluation._user_agent import USER_AGENT
|
|
16
|
+
|
|
17
17
|
from .models import OpenAIChatCompletionsModel
|
|
18
18
|
|
|
19
19
|
|
|
@@ -34,15 +34,7 @@ class SimulationRequestDTO:
|
|
|
34
34
|
:type template_parameters: Dict
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
|
-
def __init__(
|
|
38
|
-
self,
|
|
39
|
-
url: str,
|
|
40
|
-
headers: Dict[str, str],
|
|
41
|
-
payload: Dict[str, Any],
|
|
42
|
-
params: Dict[str, str],
|
|
43
|
-
templatekey: str,
|
|
44
|
-
template_parameters: Optional[TemplateParameters],
|
|
45
|
-
):
|
|
37
|
+
def __init__(self, url, headers, payload, params, templatekey, template_parameters):
|
|
46
38
|
self.url = url
|
|
47
39
|
self.headers = headers
|
|
48
40
|
self.json = json.dumps(payload)
|
|
@@ -56,12 +48,9 @@ class SimulationRequestDTO:
|
|
|
56
48
|
:return: The DTO as a dictionary.
|
|
57
49
|
:rtype: Dict
|
|
58
50
|
"""
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
toReturn["templateParameters"] = {str(k): str(v) for k, v in toReturn["templateParameters"].items()}
|
|
63
|
-
|
|
64
|
-
return toReturn
|
|
51
|
+
if self.templateParameters is not None:
|
|
52
|
+
self.templateParameters = {str(k): str(v) for k, v in self.templateParameters.items()}
|
|
53
|
+
return self.__dict__
|
|
65
54
|
|
|
66
55
|
def to_json(self):
|
|
67
56
|
"""Convert the DTO to a JSON string.
|
|
@@ -85,12 +74,12 @@ class ProxyChatCompletionsModel(OpenAIChatCompletionsModel):
|
|
|
85
74
|
:keyword kwargs: Additional keyword arguments to pass to the parent class.
|
|
86
75
|
"""
|
|
87
76
|
|
|
88
|
-
def __init__(self, name: str, template_key: str, template_parameters
|
|
77
|
+
def __init__(self, name: str, template_key: str, template_parameters, *args, **kwargs) -> None:
|
|
89
78
|
self.tkey = template_key
|
|
90
79
|
self.tparam = template_parameters
|
|
91
|
-
self.result_url
|
|
80
|
+
self.result_url = None
|
|
92
81
|
|
|
93
|
-
super().__init__(name=name, **kwargs)
|
|
82
|
+
super().__init__(name=name, *args, **kwargs)
|
|
94
83
|
|
|
95
84
|
def format_request_data(self, messages: List[Dict], **request_params) -> Dict: # type: ignore[override]
|
|
96
85
|
"""Format the request data to query the model with.
|
|
@@ -172,6 +161,7 @@ class ProxyChatCompletionsModel(OpenAIChatCompletionsModel):
|
|
|
172
161
|
}
|
|
173
162
|
# add all additional headers
|
|
174
163
|
headers.update(self.additional_headers) # type: ignore[arg-type]
|
|
164
|
+
|
|
175
165
|
params = {}
|
|
176
166
|
if self.api_version:
|
|
177
167
|
params["api-version"] = self.api_version
|
|
@@ -195,8 +185,8 @@ class ProxyChatCompletionsModel(OpenAIChatCompletionsModel):
|
|
|
195
185
|
message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
|
|
196
186
|
)
|
|
197
187
|
|
|
198
|
-
|
|
199
|
-
self.result_url =
|
|
188
|
+
response = response.json()
|
|
189
|
+
self.result_url = response["location"]
|
|
200
190
|
|
|
201
191
|
retry_policy = AsyncRetryPolicy( # set up retry configuration
|
|
202
192
|
retry_on_status_codes=[202], # on which statuses to retry
|
|
@@ -213,12 +203,6 @@ class ProxyChatCompletionsModel(OpenAIChatCompletionsModel):
|
|
|
213
203
|
time.sleep(15)
|
|
214
204
|
|
|
215
205
|
async with get_async_http_client().with_policies(retry_policy=retry_policy) as exp_retry_client:
|
|
216
|
-
token = await self.token_manager.get_token_async()
|
|
217
|
-
proxy_headers = {
|
|
218
|
-
"Authorization": f"Bearer {token}",
|
|
219
|
-
"Content-Type": "application/json",
|
|
220
|
-
"User-Agent": USER_AGENT,
|
|
221
|
-
}
|
|
222
206
|
response = await exp_retry_client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
|
|
223
207
|
self.result_url, headers=proxy_headers
|
|
224
208
|
)
|
|
@@ -2,14 +2,15 @@
|
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
import os
|
|
5
|
-
from typing import Any
|
|
5
|
+
from typing import Any, Dict
|
|
6
6
|
from urllib.parse import urljoin, urlparse
|
|
7
7
|
|
|
8
|
-
from azure.
|
|
8
|
+
from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
|
|
9
|
+
|
|
9
10
|
from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client, get_http_client
|
|
10
|
-
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
11
11
|
from azure.ai.evaluation._user_agent import USER_AGENT
|
|
12
|
-
from azure.
|
|
12
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
13
|
+
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
13
14
|
|
|
14
15
|
from ._identity_manager import APITokenManager
|
|
15
16
|
|
|
@@ -20,7 +21,7 @@ if "RAI_SVC_URL" in os.environ:
|
|
|
20
21
|
print(f"Found RAI_SVC_URL in environment variable, using {api_url} for the service endpoint.")
|
|
21
22
|
|
|
22
23
|
|
|
23
|
-
class RAIClient:
|
|
24
|
+
class RAIClient:
|
|
24
25
|
"""Client for the Responsible AI Service
|
|
25
26
|
|
|
26
27
|
:param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project
|
|
@@ -30,9 +31,7 @@ class RAIClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
30
31
|
:type token_manage: ~azure.ai.evaluation.simulator._model_tools._identity_manager.APITokenManager
|
|
31
32
|
"""
|
|
32
33
|
|
|
33
|
-
def __init__(
|
|
34
|
-
self, azure_ai_project: AzureAIProject, token_manager: APITokenManager
|
|
35
|
-
) -> None:
|
|
34
|
+
def __init__(self, azure_ai_project: AzureAIProject, token_manager: APITokenManager) -> None:
|
|
36
35
|
self.azure_ai_project = azure_ai_project
|
|
37
36
|
self.token_manager = token_manager
|
|
38
37
|
|
|
@@ -74,18 +73,14 @@ class RAIClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
74
73
|
timeout=5,
|
|
75
74
|
)
|
|
76
75
|
if response.status_code != 200:
|
|
77
|
-
msg =
|
|
78
|
-
f"Failed to connect to your Azure AI project. Please check if the project scope is configured "
|
|
79
|
-
f"correctly, and make sure you have the necessary access permissions. "
|
|
80
|
-
f"Status code: {response.status_code}."
|
|
81
|
-
)
|
|
76
|
+
msg = f"Failed to retrieve the discovery service URL."
|
|
82
77
|
raise EvaluationException(
|
|
83
78
|
message=msg,
|
|
79
|
+
internal_message=msg,
|
|
84
80
|
target=ErrorTarget.RAI_CLIENT,
|
|
85
|
-
category=ErrorCategory.
|
|
86
|
-
blame=ErrorBlame.
|
|
81
|
+
category=ErrorCategory.SERVICE_UNAVAILABLE,
|
|
82
|
+
blame=ErrorBlame.UNKNOWN,
|
|
87
83
|
)
|
|
88
|
-
|
|
89
84
|
base_url = urlparse(response.json()["properties"]["discoveryUrl"])
|
|
90
85
|
return f"{base_url.scheme}://{base_url.netloc}"
|
|
91
86
|
|
|
@@ -109,11 +104,7 @@ class RAIClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
109
104
|
return self.contentharm_parameters
|
|
110
105
|
|
|
111
106
|
async def get_jailbreaks_dataset(self, type: str) -> Any:
|
|
112
|
-
"
|
|
113
|
-
|
|
114
|
-
:param type: The dataset type. Should be one of 'xpia' or 'upia'
|
|
115
|
-
:type type: str
|
|
116
|
-
"""
|
|
107
|
+
"Get the jailbreaks dataset, if exists"
|
|
117
108
|
if self.jailbreaks_dataset is None:
|
|
118
109
|
if type == "xpia":
|
|
119
110
|
self.jailbreaks_dataset = await self.get(self.xpia_jailbreaks_json_endpoint)
|
|
@@ -155,10 +146,8 @@ class RAIClient: # pylint: disable=client-accepts-api-version-keyword
|
|
|
155
146
|
if response.status_code == 200:
|
|
156
147
|
return response.json()
|
|
157
148
|
|
|
158
|
-
msg =
|
|
159
|
-
|
|
160
|
-
+ "please go to https://aka.ms/azureaistudiosafetyeval to see which regions are supported"
|
|
161
|
-
)
|
|
149
|
+
msg = "Azure safety evaluation service is not available in your current region, "
|
|
150
|
+
"please go to https://aka.ms/azureaistudiosafetyeval to see which regions are supported"
|
|
162
151
|
raise EvaluationException(
|
|
163
152
|
message=msg,
|
|
164
153
|
internal_message=msg,
|
|
@@ -2,66 +2,25 @@
|
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
|
|
5
|
-
from typing import
|
|
6
|
-
|
|
7
|
-
from typing_extensions import NotRequired
|
|
5
|
+
from typing import Any, Dict, Optional
|
|
8
6
|
|
|
9
7
|
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
10
8
|
|
|
11
9
|
from ._rai_client import RAIClient
|
|
12
10
|
|
|
13
|
-
CONTENT_HARM_TEMPLATES_COLLECTION_KEY =
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
class TemplateParameters(TypedDict):
|
|
27
|
-
"""Parameters used in Templates
|
|
28
|
-
|
|
29
|
-
.. note::
|
|
30
|
-
|
|
31
|
-
This type is good enough to type check, but is incorrect. It's meant to represent a dictionary with a known
|
|
32
|
-
`metadata` key (Dict[str, str]), a known `ch_template_placeholder` key (str), and an unknown number of keys
|
|
33
|
-
that map to `str` values.
|
|
34
|
-
|
|
35
|
-
In typescript, this type would be spelled:
|
|
36
|
-
|
|
37
|
-
.. code-block:: typescript
|
|
38
|
-
|
|
39
|
-
type AdversarialTemplateParameters = {
|
|
40
|
-
[key: string]: string
|
|
41
|
-
ch_template_placeholder: string
|
|
42
|
-
metadata: {[index: string]: string} # Doesn't typecheck but gets the point across
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
At time of writing, this isn't possible to express with a TypedDict. TypedDicts must be "closed" in that
|
|
46
|
-
they fully specify all the keys they can contain.
|
|
47
|
-
|
|
48
|
-
`PEP 728 – TypedDict with Typed Extra Items <https://peps.python.org/pep-0728/>` is a proposal to support
|
|
49
|
-
this, but would only be available in Python 3.13 at the earliest.
|
|
50
|
-
"""
|
|
51
|
-
|
|
52
|
-
metadata: Dict[str, str]
|
|
53
|
-
conversation_starter: str
|
|
54
|
-
ch_template_placeholder: str
|
|
55
|
-
group_of_people: NotRequired[str]
|
|
56
|
-
category: NotRequired[str]
|
|
57
|
-
target_population: NotRequired[str]
|
|
58
|
-
topic: NotRequired[str]
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
class _CategorizedParameter(TypedDict):
|
|
62
|
-
parameters: List[TemplateParameters]
|
|
63
|
-
category: str
|
|
64
|
-
parameters_key: str
|
|
11
|
+
CONTENT_HARM_TEMPLATES_COLLECTION_KEY = set(
|
|
12
|
+
[
|
|
13
|
+
"adv_qa",
|
|
14
|
+
"adv_conversation",
|
|
15
|
+
"adv_summarization",
|
|
16
|
+
"adv_search",
|
|
17
|
+
"adv_rewrite",
|
|
18
|
+
"adv_content_gen_ungrounded",
|
|
19
|
+
"adv_content_gen_grounded",
|
|
20
|
+
"adv_content_protected_material",
|
|
21
|
+
"adv_politics",
|
|
22
|
+
]
|
|
23
|
+
)
|
|
65
24
|
|
|
66
25
|
|
|
67
26
|
class ContentHarmTemplatesUtils:
|
|
@@ -126,19 +85,13 @@ class AdversarialTemplate:
|
|
|
126
85
|
:param template_parameters: The template parameters.
|
|
127
86
|
"""
|
|
128
87
|
|
|
129
|
-
def __init__(
|
|
130
|
-
self,
|
|
131
|
-
template_name: str,
|
|
132
|
-
text: Optional[str],
|
|
133
|
-
context_key: List,
|
|
134
|
-
template_parameters: Optional[List[TemplateParameters]] = None,
|
|
135
|
-
) -> None:
|
|
88
|
+
def __init__(self, template_name, text, context_key, template_parameters=None) -> None:
|
|
136
89
|
self.text = text
|
|
137
90
|
self.context_key = context_key
|
|
138
91
|
self.template_name = template_name
|
|
139
|
-
self.template_parameters = template_parameters
|
|
92
|
+
self.template_parameters = template_parameters
|
|
140
93
|
|
|
141
|
-
def __str__(self)
|
|
94
|
+
def __str__(self):
|
|
142
95
|
return "{{ch_template_placeholder}}"
|
|
143
96
|
|
|
144
97
|
|
|
@@ -153,13 +106,16 @@ class AdversarialTemplateHandler:
|
|
|
153
106
|
"""
|
|
154
107
|
|
|
155
108
|
def __init__(self, azure_ai_project: AzureAIProject, rai_client: RAIClient) -> None:
|
|
109
|
+
self.cached_templates_source = {}
|
|
110
|
+
# self.template_env = JinjaEnvironment(loader=JinjaFileSystemLoader(searchpath=template_dir))
|
|
156
111
|
self.azure_ai_project = azure_ai_project
|
|
157
|
-
self.categorized_ch_parameters
|
|
112
|
+
self.categorized_ch_parameters = None
|
|
158
113
|
self.rai_client = rai_client
|
|
159
114
|
|
|
160
|
-
async def _get_content_harm_template_collections(self, collection_key
|
|
115
|
+
async def _get_content_harm_template_collections(self, collection_key):
|
|
116
|
+
|
|
161
117
|
if self.categorized_ch_parameters is None:
|
|
162
|
-
categorized_parameters
|
|
118
|
+
categorized_parameters = {}
|
|
163
119
|
util = ContentHarmTemplatesUtils
|
|
164
120
|
|
|
165
121
|
parameters = await self.rai_client.get_contentharm_parameters()
|
|
@@ -167,7 +123,7 @@ class AdversarialTemplateHandler:
|
|
|
167
123
|
for k in parameters.keys():
|
|
168
124
|
template_key = util.get_template_key(k)
|
|
169
125
|
categorized_parameters[template_key] = {
|
|
170
|
-
"parameters":
|
|
126
|
+
"parameters": parameters[k],
|
|
171
127
|
"category": util.get_template_category(k),
|
|
172
128
|
"parameters_key": k,
|
|
173
129
|
}
|
|
@@ -12,9 +12,10 @@ from abc import ABC, abstractmethod
|
|
|
12
12
|
from collections import deque
|
|
13
13
|
from typing import Deque, Dict, List, Optional, Union
|
|
14
14
|
from urllib.parse import urlparse
|
|
15
|
+
import ast
|
|
15
16
|
|
|
16
|
-
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
17
17
|
from azure.ai.evaluation._http_utils import AsyncHttpPipeline
|
|
18
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
18
19
|
|
|
19
20
|
from ._identity_manager import APITokenManager
|
|
20
21
|
|
|
@@ -28,15 +29,16 @@ def get_model_class_from_url(endpoint_url: str):
|
|
|
28
29
|
|
|
29
30
|
if endpoint_path.endswith("chat/completions"):
|
|
30
31
|
return OpenAIChatCompletionsModel
|
|
31
|
-
|
|
32
|
+
elif endpoint_path.endswith("completions"):
|
|
32
33
|
return OpenAICompletionsModel
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
34
|
+
else:
|
|
35
|
+
raise EvaluationException(
|
|
36
|
+
message=f"Unknown API type for endpoint {endpoint_url}",
|
|
37
|
+
internal_message="Unknown API type",
|
|
38
|
+
error_category=ErrorCategory.UNKNOWN_FIELD,
|
|
39
|
+
error_blame=ErrorBlame.USER_ERROR,
|
|
40
|
+
error_target=ErrorTarget.MODELS,
|
|
41
|
+
)
|
|
40
42
|
|
|
41
43
|
|
|
42
44
|
# ===========================================================
|
|
@@ -49,10 +51,10 @@ class LLMBase(ABC):
|
|
|
49
51
|
Base class for all LLM models.
|
|
50
52
|
"""
|
|
51
53
|
|
|
52
|
-
def __init__(self, endpoint_url: str, name: str = "unknown", additional_headers: Optional[
|
|
54
|
+
def __init__(self, endpoint_url: str, name: str = "unknown", additional_headers: Optional[dict] = {}):
|
|
53
55
|
self.endpoint_url = endpoint_url
|
|
54
56
|
self.name = name
|
|
55
|
-
self.additional_headers = additional_headers
|
|
57
|
+
self.additional_headers = additional_headers
|
|
56
58
|
self.logger = logging.getLogger(repr(self))
|
|
57
59
|
|
|
58
60
|
# Metric tracking
|
|
@@ -208,7 +210,7 @@ class OpenAICompletionsModel(LLMBase):
|
|
|
208
210
|
*,
|
|
209
211
|
endpoint_url: str,
|
|
210
212
|
name: str = "OpenAICompletionsModel",
|
|
211
|
-
additional_headers: Optional[
|
|
213
|
+
additional_headers: Optional[dict] = {},
|
|
212
214
|
api_version: Optional[str] = "2023-03-15-preview",
|
|
213
215
|
token_manager: APITokenManager,
|
|
214
216
|
azureml_model_deployment: Optional[str] = None,
|
|
@@ -220,7 +222,7 @@ class OpenAICompletionsModel(LLMBase):
|
|
|
220
222
|
frequency_penalty: Optional[float] = 0,
|
|
221
223
|
presence_penalty: Optional[float] = 0,
|
|
222
224
|
stop: Optional[Union[List[str], str]] = None,
|
|
223
|
-
image_captions:
|
|
225
|
+
image_captions: Dict[str, str] = {},
|
|
224
226
|
images_dir: Optional[str] = None, # Note: unused, kept for class compatibility
|
|
225
227
|
):
|
|
226
228
|
super().__init__(endpoint_url=endpoint_url, name=name, additional_headers=additional_headers)
|
|
@@ -234,7 +236,7 @@ class OpenAICompletionsModel(LLMBase):
|
|
|
234
236
|
self.n = n
|
|
235
237
|
self.frequency_penalty = frequency_penalty
|
|
236
238
|
self.presence_penalty = presence_penalty
|
|
237
|
-
self.image_captions = image_captions
|
|
239
|
+
self.image_captions = image_captions
|
|
238
240
|
|
|
239
241
|
# Default stop to end token if not provided
|
|
240
242
|
if not stop:
|
|
@@ -263,7 +265,7 @@ class OpenAICompletionsModel(LLMBase):
|
|
|
263
265
|
def get_model_params(self):
|
|
264
266
|
return {param: getattr(self, param) for param in self.model_param_names if getattr(self, param) is not None}
|
|
265
267
|
|
|
266
|
-
def format_request_data(self, prompt:
|
|
268
|
+
def format_request_data(self, prompt: str, **request_params) -> Dict[str, str]:
|
|
267
269
|
"""
|
|
268
270
|
Format the request data for the OpenAI API.
|
|
269
271
|
"""
|
|
@@ -328,7 +330,7 @@ class OpenAICompletionsModel(LLMBase):
|
|
|
328
330
|
# Format prompts and tag with index
|
|
329
331
|
request_datas: List[Dict] = []
|
|
330
332
|
for idx, prompt in enumerate(prompts):
|
|
331
|
-
prompt = self.format_request_data(prompt, **request_params)
|
|
333
|
+
prompt: Dict[str, str] = self.format_request_data(prompt, **request_params)
|
|
332
334
|
prompt[self.prompt_idx_key] = idx # type: ignore[assignment]
|
|
333
335
|
request_datas.append(prompt)
|
|
334
336
|
|
|
@@ -447,7 +449,7 @@ class OpenAICompletionsModel(LLMBase):
|
|
|
447
449
|
|
|
448
450
|
self._log_request(request_data)
|
|
449
451
|
|
|
450
|
-
token = self.token_manager.get_token()
|
|
452
|
+
token = await self.token_manager.get_token()
|
|
451
453
|
|
|
452
454
|
headers = {
|
|
453
455
|
"Content-Type": "application/json",
|
|
@@ -522,8 +524,8 @@ class OpenAIChatCompletionsModel(OpenAICompletionsModel):
|
|
|
522
524
|
formats the prompt for chat completion.
|
|
523
525
|
"""
|
|
524
526
|
|
|
525
|
-
def __init__(self, name="OpenAIChatCompletionsModel", **kwargs):
|
|
526
|
-
super().__init__(name=name, **kwargs)
|
|
527
|
+
def __init__(self, name="OpenAIChatCompletionsModel", *args, **kwargs):
|
|
528
|
+
super().__init__(name=name, *args, **kwargs)
|
|
527
529
|
|
|
528
530
|
def format_request_data(self, messages: List[dict], **request_params): # type: ignore[override]
|
|
529
531
|
request_data = {"messages": messages, **self.get_model_params()}
|
|
@@ -3,6 +3,11 @@ name: TaskSimulatorQueryResponse
|
|
|
3
3
|
description: Gets queries and responses from a blob of text
|
|
4
4
|
model:
|
|
5
5
|
api: chat
|
|
6
|
+
configuration:
|
|
7
|
+
type: azure_openai
|
|
8
|
+
azure_deployment: ${env:AZURE_DEPLOYMENT}
|
|
9
|
+
api_key: ${env:AZURE_OPENAI_API_KEY}
|
|
10
|
+
azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
|
|
6
11
|
parameters:
|
|
7
12
|
temperature: 0.0
|
|
8
13
|
top_p: 1.0
|
|
@@ -28,16 +33,15 @@ Answer must not be more than 5 words
|
|
|
28
33
|
Answer must be picked from Text as is
|
|
29
34
|
Question should be as descriptive as possible and must include as much context as possible from Text
|
|
30
35
|
Output must always have the provided number of QnAs
|
|
31
|
-
Output must be in JSON format
|
|
32
|
-
Output must have {{num_queries}} objects in the format specified below. Any other count is unacceptable.
|
|
36
|
+
Output must be in JSON format
|
|
33
37
|
Text:
|
|
34
38
|
<|text_start|>
|
|
35
39
|
On January 24, 1984, former Apple CEO Steve Jobs introduced the first Macintosh. In late 2003, Apple had 2.06 percent of the desktop share in the United States.
|
|
36
40
|
Some years later, research firms IDC and Gartner reported that Apple's market share in the U.S. had increased to about 6%.
|
|
37
41
|
<|text_end|>
|
|
38
42
|
Output with 5 QnAs:
|
|
39
|
-
|
|
40
|
-
|
|
43
|
+
[
|
|
44
|
+
{
|
|
41
45
|
"q": "When did the former Apple CEO Steve Jobs introduced the first Macintosh?",
|
|
42
46
|
"r": "January 24, 1984"
|
|
43
47
|
},
|
|
@@ -56,8 +60,8 @@ Output with 5 QnAs:
|
|
|
56
60
|
{
|
|
57
61
|
"q": "What was the percentage increase of Apple's market share in the U.S., as reported by research firms IDC and Gartner?",
|
|
58
62
|
"r": "6%"
|
|
59
|
-
}
|
|
60
|
-
|
|
63
|
+
}
|
|
64
|
+
]
|
|
61
65
|
Text:
|
|
62
66
|
<|text_start|>
|
|
63
67
|
{{ text }}
|
|
@@ -3,6 +3,10 @@ name: TaskSimulatorWithPersona
|
|
|
3
3
|
description: Simulates a user to complete a conversation
|
|
4
4
|
model:
|
|
5
5
|
api: chat
|
|
6
|
+
configuration:
|
|
7
|
+
type: azure_openai
|
|
8
|
+
azure_deployment: ${env:AZURE_DEPLOYMENT}
|
|
9
|
+
azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
|
|
6
10
|
parameters:
|
|
7
11
|
temperature: 0.0
|
|
8
12
|
top_p: 1.0
|
|
@@ -16,9 +20,6 @@ inputs:
|
|
|
16
20
|
type: string
|
|
17
21
|
conversation_history:
|
|
18
22
|
type: dict
|
|
19
|
-
action:
|
|
20
|
-
type: string
|
|
21
|
-
default: continue the converasation and make sure the task is completed by asking relevant questions
|
|
22
23
|
|
|
23
24
|
---
|
|
24
25
|
system:
|
|
@@ -28,10 +29,8 @@ Output must be in JSON format
|
|
|
28
29
|
Here's a sample output:
|
|
29
30
|
{
|
|
30
31
|
"content": "Here is my follow-up question.",
|
|
31
|
-
"
|
|
32
|
+
"user": "user"
|
|
32
33
|
}
|
|
33
34
|
|
|
34
35
|
Output with a json object that continues the conversation, given the conversation history:
|
|
35
36
|
{{ conversation_history }}
|
|
36
|
-
|
|
37
|
-
{{ action }}
|