azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
|
@@ -6,23 +6,19 @@
|
|
|
6
6
|
import asyncio
|
|
7
7
|
import logging
|
|
8
8
|
import random
|
|
9
|
-
from typing import Any, Callable, Dict, List,
|
|
10
|
-
from itertools import zip_longest
|
|
9
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
11
10
|
|
|
11
|
+
from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
|
|
12
|
+
from azure.identity import DefaultAzureCredential
|
|
12
13
|
from tqdm import tqdm
|
|
13
14
|
|
|
14
|
-
from azure.ai.evaluation._common._experimental import experimental
|
|
15
|
-
from azure.ai.evaluation._common.utils import validate_azure_ai_project
|
|
16
|
-
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
17
15
|
from azure.ai.evaluation._http_utils import get_async_http_client
|
|
16
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
18
17
|
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
19
18
|
from azure.ai.evaluation.simulator import AdversarialScenario
|
|
20
19
|
from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario
|
|
21
|
-
from azure.core.credentials import TokenCredential
|
|
22
|
-
from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
|
|
23
20
|
|
|
24
|
-
from .
|
|
25
|
-
from ._conversation import CallbackConversationBot, ConversationBot, ConversationRole, ConversationTurn
|
|
21
|
+
from ._conversation import CallbackConversationBot, ConversationBot, ConversationRole
|
|
26
22
|
from ._conversation._conversation import simulate_conversation
|
|
27
23
|
from ._model_tools import (
|
|
28
24
|
AdversarialTemplateHandler,
|
|
@@ -31,13 +27,13 @@ from ._model_tools import (
|
|
|
31
27
|
RAIClient,
|
|
32
28
|
TokenScope,
|
|
33
29
|
)
|
|
34
|
-
from .
|
|
30
|
+
from ._tracing import monitor_adversarial_scenario
|
|
35
31
|
from ._utils import JsonLineList
|
|
32
|
+
from ._constants import SupportedLanguages
|
|
36
33
|
|
|
37
34
|
logger = logging.getLogger(__name__)
|
|
38
35
|
|
|
39
36
|
|
|
40
|
-
@experimental
|
|
41
37
|
class AdversarialSimulator:
|
|
42
38
|
"""
|
|
43
39
|
Initializes the adversarial simulator with a project scope.
|
|
@@ -47,40 +43,43 @@ class AdversarialSimulator:
|
|
|
47
43
|
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
|
|
48
44
|
:param credential: The credential for connecting to Azure AI project.
|
|
49
45
|
:type credential: ~azure.core.credentials.TokenCredential
|
|
50
|
-
|
|
51
|
-
.. admonition:: Example:
|
|
52
|
-
|
|
53
|
-
.. literalinclude:: ../samples/evaluation_samples_simulate.py
|
|
54
|
-
:start-after: [START adversarial_scenario]
|
|
55
|
-
:end-before: [END adversarial_scenario]
|
|
56
|
-
:language: python
|
|
57
|
-
:dedent: 8
|
|
58
|
-
:caption: Run the AdversarialSimulator with an AdversarialConversation scenario to produce 2 results with
|
|
59
|
-
2 conversation turns each (4 messages per result).
|
|
60
46
|
"""
|
|
61
47
|
|
|
62
|
-
def __init__(self, *, azure_ai_project: AzureAIProject, credential
|
|
48
|
+
def __init__(self, *, azure_ai_project: AzureAIProject, credential=None):
|
|
63
49
|
"""Constructor."""
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
except EvaluationException as e:
|
|
50
|
+
# check if azure_ai_project has the keys: subscription_id, resource_group_name and project_name
|
|
51
|
+
if not all(key in azure_ai_project for key in ["subscription_id", "resource_group_name", "project_name"]):
|
|
52
|
+
msg = "azure_ai_project must contain keys: subscription_id, resource_group_name, project_name"
|
|
68
53
|
raise EvaluationException(
|
|
69
|
-
message=
|
|
70
|
-
internal_message=
|
|
54
|
+
message=msg,
|
|
55
|
+
internal_message=msg,
|
|
71
56
|
target=ErrorTarget.ADVERSARIAL_SIMULATOR,
|
|
72
|
-
category=
|
|
73
|
-
blame=
|
|
74
|
-
)
|
|
75
|
-
|
|
57
|
+
category=ErrorCategory.MISSING_FIELD,
|
|
58
|
+
blame=ErrorBlame.USER_ERROR,
|
|
59
|
+
)
|
|
60
|
+
# check the value of the keys in azure_ai_project is not none
|
|
61
|
+
if not all(azure_ai_project[key] for key in ["subscription_id", "resource_group_name", "project_name"]):
|
|
62
|
+
msg = "subscription_id, resource_group_name and project_name cannot be None"
|
|
63
|
+
raise EvaluationException(
|
|
64
|
+
message=msg,
|
|
65
|
+
internal_message=msg,
|
|
66
|
+
target=ErrorTarget.ADVERSARIAL_SIMULATOR,
|
|
67
|
+
category=ErrorCategory.MISSING_FIELD,
|
|
68
|
+
blame=ErrorBlame.USER_ERROR,
|
|
69
|
+
)
|
|
70
|
+
if "credential" not in azure_ai_project and not credential:
|
|
71
|
+
credential = DefaultAzureCredential()
|
|
72
|
+
elif "credential" in azure_ai_project:
|
|
73
|
+
credential = azure_ai_project["credential"]
|
|
74
|
+
self.azure_ai_project = azure_ai_project
|
|
76
75
|
self.token_manager = ManagedIdentityAPITokenManager(
|
|
77
76
|
token_scope=TokenScope.DEFAULT_AZURE_MANAGEMENT,
|
|
78
77
|
logger=logging.getLogger("AdversarialSimulator"),
|
|
79
|
-
credential=
|
|
78
|
+
credential=credential,
|
|
80
79
|
)
|
|
81
|
-
self.rai_client = RAIClient(azure_ai_project=
|
|
80
|
+
self.rai_client = RAIClient(azure_ai_project=azure_ai_project, token_manager=self.token_manager)
|
|
82
81
|
self.adversarial_template_handler = AdversarialTemplateHandler(
|
|
83
|
-
azure_ai_project=
|
|
82
|
+
azure_ai_project=azure_ai_project, rai_client=self.rai_client
|
|
84
83
|
)
|
|
85
84
|
|
|
86
85
|
def _ensure_service_dependencies(self):
|
|
@@ -94,7 +93,7 @@ class AdversarialSimulator:
|
|
|
94
93
|
blame=ErrorBlame.USER_ERROR,
|
|
95
94
|
)
|
|
96
95
|
|
|
97
|
-
#
|
|
96
|
+
# @monitor_adversarial_scenario
|
|
98
97
|
async def __call__(
|
|
99
98
|
self,
|
|
100
99
|
*,
|
|
@@ -108,10 +107,10 @@ class AdversarialSimulator:
|
|
|
108
107
|
api_call_retry_sleep_sec: int = 1,
|
|
109
108
|
api_call_delay_sec: int = 0,
|
|
110
109
|
concurrent_async_task: int = 3,
|
|
110
|
+
_jailbreak_type: Optional[str] = None,
|
|
111
111
|
language: SupportedLanguages = SupportedLanguages.English,
|
|
112
112
|
randomize_order: bool = True,
|
|
113
113
|
randomization_seed: Optional[int] = None,
|
|
114
|
-
**kwargs,
|
|
115
114
|
):
|
|
116
115
|
"""
|
|
117
116
|
Executes the adversarial simulation against a specified target function asynchronously.
|
|
@@ -161,6 +160,28 @@ class AdversarialSimulator:
|
|
|
161
160
|
|
|
162
161
|
The 'content' for 'assistant' role messages may include the messages that your callback returned.
|
|
163
162
|
:rtype: List[Dict[str, Any]]
|
|
163
|
+
|
|
164
|
+
**Output format**
|
|
165
|
+
|
|
166
|
+
.. code-block:: python
|
|
167
|
+
|
|
168
|
+
return_value = [
|
|
169
|
+
{
|
|
170
|
+
'template_parameters': {},
|
|
171
|
+
'messages': [
|
|
172
|
+
{
|
|
173
|
+
'content': '<jailbreak prompt> <adversarial query>',
|
|
174
|
+
'role': 'user'
|
|
175
|
+
},
|
|
176
|
+
{
|
|
177
|
+
'content': "<response from endpoint>",
|
|
178
|
+
'role': 'assistant',
|
|
179
|
+
'context': None
|
|
180
|
+
}
|
|
181
|
+
],
|
|
182
|
+
'$schema': 'http://azureml/sdk-2-0/ChatConversation.json'
|
|
183
|
+
}
|
|
184
|
+
]
|
|
164
185
|
"""
|
|
165
186
|
|
|
166
187
|
# validate the inputs
|
|
@@ -196,7 +217,6 @@ class AdversarialSimulator:
|
|
|
196
217
|
total_tasks,
|
|
197
218
|
)
|
|
198
219
|
total_tasks = min(total_tasks, max_simulation_results)
|
|
199
|
-
_jailbreak_type = kwargs.get("_jailbreak_type", None)
|
|
200
220
|
if _jailbreak_type:
|
|
201
221
|
jailbreak_dataset = await self.rai_client.get_jailbreaks_dataset(type=_jailbreak_type)
|
|
202
222
|
progress_bar = tqdm(
|
|
@@ -205,18 +225,17 @@ class AdversarialSimulator:
|
|
|
205
225
|
ncols=100,
|
|
206
226
|
unit="simulations",
|
|
207
227
|
)
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
for template, parameter in zip(templates, param_group):
|
|
228
|
+
for template in templates:
|
|
229
|
+
parameter_order = list(range(len(template.template_parameters)))
|
|
230
|
+
if randomize_order:
|
|
231
|
+
# The template parameter lists are persistent across sim runs within a session,
|
|
232
|
+
# so randomize the selection order instead of the parameter list directly,
|
|
233
|
+
# or a potentially large deep copy.
|
|
234
|
+
if randomization_seed is not None:
|
|
235
|
+
random.seed(randomization_seed)
|
|
236
|
+
random.shuffle(parameter_order)
|
|
237
|
+
for index in parameter_order:
|
|
238
|
+
parameter = template.template_parameters[index].copy()
|
|
220
239
|
if _jailbreak_type == "upia":
|
|
221
240
|
parameter = self._join_conversation_starter(parameter, random.choice(jailbreak_dataset))
|
|
222
241
|
tasks.append(
|
|
@@ -245,21 +264,16 @@ class AdversarialSimulator:
|
|
|
245
264
|
|
|
246
265
|
return JsonLineList(sim_results)
|
|
247
266
|
|
|
248
|
-
def _to_chat_protocol(
|
|
249
|
-
self,
|
|
250
|
-
*,
|
|
251
|
-
conversation_history: List[ConversationTurn],
|
|
252
|
-
template_parameters: Optional[Dict[str, Union[str, Dict[str, str]]]] = None,
|
|
253
|
-
):
|
|
267
|
+
def _to_chat_protocol(self, *, conversation_history, template_parameters: Dict = None):
|
|
254
268
|
if template_parameters is None:
|
|
255
269
|
template_parameters = {}
|
|
256
270
|
messages = []
|
|
257
271
|
for _, m in enumerate(conversation_history):
|
|
258
272
|
message = {"content": m.message, "role": m.role.value}
|
|
259
|
-
if
|
|
273
|
+
if "context" in m.full_response:
|
|
260
274
|
message["context"] = m.full_response["context"]
|
|
261
275
|
messages.append(message)
|
|
262
|
-
conversation_category =
|
|
276
|
+
conversation_category = template_parameters.pop("metadata", {}).get("Category")
|
|
263
277
|
template_parameters["metadata"] = {}
|
|
264
278
|
for key in (
|
|
265
279
|
"conversation_starter",
|
|
@@ -267,9 +281,6 @@ class AdversarialSimulator:
|
|
|
267
281
|
"target_population",
|
|
268
282
|
"topic",
|
|
269
283
|
"ch_template_placeholder",
|
|
270
|
-
"chatbot_name",
|
|
271
|
-
"name",
|
|
272
|
-
"group",
|
|
273
284
|
):
|
|
274
285
|
template_parameters.pop(key, None)
|
|
275
286
|
if conversation_category:
|
|
@@ -284,14 +295,14 @@ class AdversarialSimulator:
|
|
|
284
295
|
self,
|
|
285
296
|
*,
|
|
286
297
|
target: Callable,
|
|
287
|
-
template
|
|
288
|
-
parameters
|
|
289
|
-
max_conversation_turns
|
|
290
|
-
api_call_retry_limit
|
|
291
|
-
api_call_retry_sleep_sec
|
|
292
|
-
api_call_delay_sec
|
|
293
|
-
language
|
|
294
|
-
semaphore
|
|
298
|
+
template,
|
|
299
|
+
parameters,
|
|
300
|
+
max_conversation_turns,
|
|
301
|
+
api_call_retry_limit,
|
|
302
|
+
api_call_retry_sleep_sec,
|
|
303
|
+
api_call_delay_sec,
|
|
304
|
+
language,
|
|
305
|
+
semaphore,
|
|
295
306
|
) -> List[Dict]:
|
|
296
307
|
user_bot = self._setup_bot(role=ConversationRole.USER, template=template, parameters=parameters)
|
|
297
308
|
system_bot = self._setup_bot(
|
|
@@ -314,15 +325,9 @@ class AdversarialSimulator:
|
|
|
314
325
|
api_call_delay_sec=api_call_delay_sec,
|
|
315
326
|
language=language,
|
|
316
327
|
)
|
|
328
|
+
return self._to_chat_protocol(conversation_history=conversation_history, template_parameters=parameters)
|
|
317
329
|
|
|
318
|
-
|
|
319
|
-
conversation_history=conversation_history,
|
|
320
|
-
template_parameters=cast(Dict[str, Union[str, Dict[str, str]]], parameters),
|
|
321
|
-
)
|
|
322
|
-
|
|
323
|
-
def _get_user_proxy_completion_model(
|
|
324
|
-
self, template_key: str, template_parameters: TemplateParameters
|
|
325
|
-
) -> ProxyChatCompletionsModel:
|
|
330
|
+
def _get_user_proxy_completion_model(self, template_key, template_parameters):
|
|
326
331
|
return ProxyChatCompletionsModel(
|
|
327
332
|
name="raisvc_proxy_model",
|
|
328
333
|
template_key=template_key,
|
|
@@ -334,15 +339,8 @@ class AdversarialSimulator:
|
|
|
334
339
|
temperature=0.0,
|
|
335
340
|
)
|
|
336
341
|
|
|
337
|
-
def _setup_bot(
|
|
338
|
-
|
|
339
|
-
*,
|
|
340
|
-
role: ConversationRole,
|
|
341
|
-
template: AdversarialTemplate,
|
|
342
|
-
parameters: TemplateParameters,
|
|
343
|
-
target: Optional[Callable] = None,
|
|
344
|
-
) -> ConversationBot:
|
|
345
|
-
if role is ConversationRole.USER:
|
|
342
|
+
def _setup_bot(self, *, role, template, parameters, target: Callable = None):
|
|
343
|
+
if role == ConversationRole.USER:
|
|
346
344
|
model = self._get_user_proxy_completion_model(
|
|
347
345
|
template_key=template.template_name, template_parameters=parameters
|
|
348
346
|
)
|
|
@@ -353,46 +351,27 @@ class AdversarialSimulator:
|
|
|
353
351
|
instantiation_parameters=parameters,
|
|
354
352
|
)
|
|
355
353
|
|
|
356
|
-
if role
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
raise EvaluationException(
|
|
361
|
-
message=msg,
|
|
362
|
-
internal_message=msg,
|
|
363
|
-
target=ErrorTarget.ADVERSARIAL_SIMULATOR,
|
|
364
|
-
error_category=ErrorCategory.INVALID_VALUE,
|
|
365
|
-
blame=ErrorBlame.SYSTEM_ERROR,
|
|
366
|
-
)
|
|
367
|
-
|
|
368
|
-
class DummyModel:
|
|
369
|
-
def __init__(self):
|
|
370
|
-
self.name = "dummy_model"
|
|
371
|
-
|
|
372
|
-
def __call__(self) -> None:
|
|
373
|
-
pass
|
|
374
|
-
|
|
354
|
+
if role == ConversationRole.ASSISTANT:
|
|
355
|
+
dummy_model = lambda: None # noqa: E731
|
|
356
|
+
dummy_model.name = "dummy_model"
|
|
375
357
|
return CallbackConversationBot(
|
|
376
358
|
callback=target,
|
|
377
359
|
role=role,
|
|
378
|
-
model=
|
|
360
|
+
model=dummy_model,
|
|
379
361
|
user_template=str(template),
|
|
380
362
|
user_template_parameters=parameters,
|
|
381
363
|
conversation_template="",
|
|
382
364
|
instantiation_parameters={},
|
|
383
365
|
)
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
target=ErrorTarget.ADVERSARIAL_SIMULATOR,
|
|
390
|
-
category=ErrorCategory.INVALID_VALUE,
|
|
391
|
-
blame=ErrorBlame.SYSTEM_ERROR,
|
|
366
|
+
return ConversationBot(
|
|
367
|
+
role=role,
|
|
368
|
+
model=model,
|
|
369
|
+
conversation_template=template,
|
|
370
|
+
instantiation_parameters=parameters,
|
|
392
371
|
)
|
|
393
372
|
|
|
394
|
-
def _join_conversation_starter(self, parameters
|
|
395
|
-
key
|
|
373
|
+
def _join_conversation_starter(self, parameters, to_join):
|
|
374
|
+
key = "conversation_starter"
|
|
396
375
|
if key in parameters.keys():
|
|
397
376
|
parameters[key] = f"{to_join} {parameters[key]}"
|
|
398
377
|
else:
|
|
@@ -5,17 +5,7 @@ from enum import Enum
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class SupportedLanguages(Enum):
|
|
8
|
-
"""Supported languages for evaluation, using ISO standard language codes.
|
|
9
|
-
|
|
10
|
-
.. admonition:: Example:
|
|
11
|
-
|
|
12
|
-
.. literalinclude:: ../samples/evaluation_samples_simulate.py
|
|
13
|
-
:start-after: [START supported_languages]
|
|
14
|
-
:end-before: [END supported_languages]
|
|
15
|
-
:language: python
|
|
16
|
-
:dedent: 8
|
|
17
|
-
:caption: Run the AdversarialSimulator with Simplified Chinese language support for evaluation.
|
|
18
|
-
"""
|
|
8
|
+
"""Supported languages for evaluation, using ISO standard language codes."""
|
|
19
9
|
|
|
20
10
|
Spanish = "es"
|
|
21
11
|
Italian = "it"
|
|
@@ -7,15 +7,14 @@ import copy
|
|
|
7
7
|
import logging
|
|
8
8
|
import time
|
|
9
9
|
from dataclasses import dataclass
|
|
10
|
-
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
10
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
11
11
|
|
|
12
12
|
import jinja2
|
|
13
13
|
|
|
14
|
-
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
15
14
|
from azure.ai.evaluation._http_utils import AsyncHttpPipeline
|
|
15
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
16
16
|
|
|
17
17
|
from .._model_tools import LLMBase, OpenAIChatCompletionsModel
|
|
18
|
-
from .._model_tools._template_handler import TemplateParameters
|
|
19
18
|
from .constants import ConversationRole
|
|
20
19
|
|
|
21
20
|
|
|
@@ -41,7 +40,7 @@ class ConversationTurn:
|
|
|
41
40
|
role: "ConversationRole"
|
|
42
41
|
name: Optional[str] = None
|
|
43
42
|
message: str = ""
|
|
44
|
-
full_response: Optional[
|
|
43
|
+
full_response: Optional[Any] = None
|
|
45
44
|
request: Optional[Any] = None
|
|
46
45
|
|
|
47
46
|
def to_openai_chat_format(self, reverse: bool = False) -> Dict[str, str]:
|
|
@@ -110,7 +109,7 @@ class ConversationBot:
|
|
|
110
109
|
role: ConversationRole,
|
|
111
110
|
model: Union[LLMBase, OpenAIChatCompletionsModel],
|
|
112
111
|
conversation_template: str,
|
|
113
|
-
instantiation_parameters:
|
|
112
|
+
instantiation_parameters: Dict[str, str],
|
|
114
113
|
) -> None:
|
|
115
114
|
self.role = role
|
|
116
115
|
self.conversation_template_orig = conversation_template
|
|
@@ -119,13 +118,13 @@ class ConversationBot:
|
|
|
119
118
|
)
|
|
120
119
|
self.persona_template_args = instantiation_parameters
|
|
121
120
|
if self.role == ConversationRole.USER:
|
|
122
|
-
self.name
|
|
121
|
+
self.name = self.persona_template_args.get("name", role.value)
|
|
123
122
|
else:
|
|
124
|
-
self.name =
|
|
123
|
+
self.name = self.persona_template_args.get("chatbot_name", role.value) or model.name
|
|
125
124
|
self.model = model
|
|
126
125
|
|
|
127
126
|
self.logger = logging.getLogger(repr(self))
|
|
128
|
-
self.conversation_starter
|
|
127
|
+
self.conversation_starter = None # can either be a dictionary or jinja template
|
|
129
128
|
if role == ConversationRole.USER:
|
|
130
129
|
if "conversation_starter" in self.persona_template_args:
|
|
131
130
|
conversation_starter_content = self.persona_template_args["conversation_starter"]
|
|
@@ -149,7 +148,7 @@ class ConversationBot:
|
|
|
149
148
|
conversation_history: List[ConversationTurn],
|
|
150
149
|
max_history: int,
|
|
151
150
|
turn_number: int = 0,
|
|
152
|
-
) -> Tuple[dict, dict,
|
|
151
|
+
) -> Tuple[dict, dict, int, dict]:
|
|
153
152
|
"""
|
|
154
153
|
Prompt the ConversationBot for a response.
|
|
155
154
|
|
|
@@ -162,7 +161,7 @@ class ConversationBot:
|
|
|
162
161
|
:param turn_number: The index of the current turn in the conversation (0 for the first turn).
|
|
163
162
|
:type turn_number: int
|
|
164
163
|
:return: The response from the ConversationBot.
|
|
165
|
-
:rtype: Tuple[dict, dict,
|
|
164
|
+
:rtype: Tuple[dict, dict, int, dict]
|
|
166
165
|
"""
|
|
167
166
|
|
|
168
167
|
# check if this is the first turn and the conversation_starter is not None,
|
|
@@ -170,11 +169,11 @@ class ConversationBot:
|
|
|
170
169
|
if turn_number == 0 and self.conversation_starter is not None:
|
|
171
170
|
# if conversation_starter is a dictionary, pass it into samples as is
|
|
172
171
|
if isinstance(self.conversation_starter, dict):
|
|
173
|
-
samples
|
|
172
|
+
samples = [self.conversation_starter]
|
|
174
173
|
if isinstance(self.conversation_starter, jinja2.Template):
|
|
175
174
|
samples = [self.conversation_starter.render(**self.persona_template_args)]
|
|
176
175
|
else:
|
|
177
|
-
samples = [self.conversation_starter]
|
|
176
|
+
samples = [self.conversation_starter] # type: ignore[attr-defined]
|
|
178
177
|
time_taken = 0
|
|
179
178
|
|
|
180
179
|
finish_reason = ["stop"]
|
|
@@ -239,7 +238,7 @@ class CallbackConversationBot(ConversationBot):
|
|
|
239
238
|
self,
|
|
240
239
|
callback: Callable,
|
|
241
240
|
user_template: str,
|
|
242
|
-
user_template_parameters:
|
|
241
|
+
user_template_parameters: Dict,
|
|
243
242
|
*args,
|
|
244
243
|
**kwargs,
|
|
245
244
|
) -> None:
|
|
@@ -255,7 +254,7 @@ class CallbackConversationBot(ConversationBot):
|
|
|
255
254
|
conversation_history: List[Any],
|
|
256
255
|
max_history: int,
|
|
257
256
|
turn_number: int = 0,
|
|
258
|
-
) -> Tuple[dict, dict,
|
|
257
|
+
) -> Tuple[dict, dict, int, dict]:
|
|
259
258
|
chat_protocol_message = self._to_chat_protocol(
|
|
260
259
|
self.user_template, conversation_history, self.user_template_parameters
|
|
261
260
|
)
|
|
@@ -4,14 +4,14 @@
|
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
6
|
import logging
|
|
7
|
-
from typing import Callable, Dict, List,
|
|
7
|
+
from typing import Callable, Dict, List, Tuple, Union
|
|
8
8
|
|
|
9
|
-
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
10
|
-
from azure.ai.evaluation.simulator._constants import SupportedLanguages
|
|
11
9
|
from azure.ai.evaluation.simulator._helpers._language_suffix_mapping import SUPPORTED_LANGUAGES_MAPPING
|
|
10
|
+
from azure.ai.evaluation.simulator._constants import SupportedLanguages
|
|
12
11
|
|
|
13
12
|
from ..._http_utils import AsyncHttpPipeline
|
|
14
13
|
from . import ConversationBot, ConversationTurn
|
|
14
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def is_closing_message(response: Union[Dict, str], recursion_depth: int = 0) -> bool:
|
|
@@ -80,26 +80,26 @@ async def simulate_conversation(
|
|
|
80
80
|
history_limit: int = 5,
|
|
81
81
|
api_call_delay_sec: float = 0,
|
|
82
82
|
logger: logging.Logger = logging.getLogger(__name__),
|
|
83
|
-
) -> Tuple
|
|
83
|
+
) -> Tuple:
|
|
84
84
|
"""
|
|
85
85
|
Simulate a conversation between the given bots.
|
|
86
86
|
|
|
87
|
-
:
|
|
88
|
-
:
|
|
89
|
-
:
|
|
90
|
-
:
|
|
91
|
-
:
|
|
92
|
-
:
|
|
93
|
-
:
|
|
94
|
-
:
|
|
95
|
-
:
|
|
96
|
-
:
|
|
97
|
-
:
|
|
98
|
-
:
|
|
99
|
-
:
|
|
100
|
-
:
|
|
87
|
+
:param bots: List of ConversationBot instances participating in the conversation.
|
|
88
|
+
:type bots: List[ConversationBot]
|
|
89
|
+
:param session: The session to use for making API calls.
|
|
90
|
+
:type session: AsyncHttpPipeline
|
|
91
|
+
:param stopping_criteria: A callable that determines when the conversation should stop.
|
|
92
|
+
:type stopping_criteria: Callable[[str], bool]
|
|
93
|
+
:param turn_limit: The maximum number of turns in the conversation. Defaults to 10.
|
|
94
|
+
:type turn_limit: int
|
|
95
|
+
:param history_limit: The maximum number of turns to keep in the conversation history. Defaults to 5.
|
|
96
|
+
:type history_limit: int
|
|
97
|
+
:param api_call_delay_sec: Delay between API calls in seconds. Defaults to 0.
|
|
98
|
+
:type api_call_delay_sec: float
|
|
99
|
+
:param logger: The logger to use for logging. Defaults to the logger named after the current module.
|
|
100
|
+
:type logger: logging.Logger
|
|
101
101
|
:return: Simulation of a conversation between the given bots.
|
|
102
|
-
:rtype: Tuple
|
|
102
|
+
:rtype: Tuple
|
|
103
103
|
"""
|
|
104
104
|
|
|
105
105
|
# Read the first prompt.
|
|
@@ -110,7 +110,7 @@ async def simulate_conversation(
|
|
|
110
110
|
turn_number=0,
|
|
111
111
|
)
|
|
112
112
|
if "id" in first_response:
|
|
113
|
-
conversation_id
|
|
113
|
+
conversation_id = first_response["id"]
|
|
114
114
|
else:
|
|
115
115
|
conversation_id = None
|
|
116
116
|
first_prompt = first_response["samples"][0]
|