azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +60 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/constants.py +65 -0
- azure/ai/evaluation/_common/rai_service.py +452 -0
- azure/ai/evaluation/_common/utils.py +87 -0
- azure/ai/evaluation/_constants.py +50 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
- azure/ai/evaluation/_evaluate/_utils.py +237 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
- azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
- azure/ai/evaluation/_exceptions.py +107 -0
- azure/ai/evaluation/_http_utils.py +395 -0
- azure/ai/evaluation/_model_configurations.py +27 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +15 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
- azure/ai/evaluation/simulator/_constants.py +17 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
- azure/ai/evaluation/simulator/_tracing.py +92 -0
- azure/ai/evaluation/simulator/_utils.py +111 -0
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0b1.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from ._adversarial_scenario import AdversarialScenario
|
|
2
|
+
from ._adversarial_simulator import AdversarialSimulator
|
|
3
|
+
from ._constants import SupportedLanguages
|
|
4
|
+
from ._direct_attack_simulator import DirectAttackSimulator
|
|
5
|
+
from ._indirect_attack_simulator import IndirectAttackSimulator
|
|
6
|
+
from .simulator import Simulator
|
|
7
|
+
|
|
8
|
+
# Names exported by the simulator package; each one is imported at the top of this module.
__all__ = [
    "AdversarialSimulator",
    "AdversarialScenario",
    "DirectAttackSimulator",
    "IndirectAttackSimulator",
    "SupportedLanguages",
    "Simulator",
]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AdversarialScenario(Enum):
    """Adversarial scenario types.

    Each member's value is the string identifier of the scenario.
    """

    ADVERSARIAL_QA = "adv_qa"
    ADVERSARIAL_CONVERSATION = "adv_conversation"
    ADVERSARIAL_SUMMARIZATION = "adv_summarization"
    ADVERSARIAL_SEARCH = "adv_search"
    ADVERSARIAL_REWRITE = "adv_rewrite"
    ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded"
    ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded"
    ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material"
    # NOTE(review): the member name says "indirect jailbreak" while the value says "xpia";
    # presumably both refer to the same attack type -- confirm before renaming either.
    ADVERSARIAL_INDIRECT_JAILBREAK = "adv_xpia"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class _UnstableAdversarialScenario(Enum):
    """Adversarial scenario types that we haven't published, but still want available for internal use.

    Values listed here are subject to potential change, and/or migration to the main enum over time.
    """

    # NOTE(review): the member name (ECI) and its value ("adv_politics") do not obviously
    # match -- confirm the intended meaning before relying on either spelling.
    ECI = "adv_politics"
|
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
# noqa: E501
|
|
5
|
+
# pylint: disable=E0401,E0611
|
|
6
|
+
import asyncio
|
|
7
|
+
import logging
|
|
8
|
+
import random
|
|
9
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
|
|
12
|
+
from azure.identity import DefaultAzureCredential
|
|
13
|
+
from tqdm import tqdm
|
|
14
|
+
|
|
15
|
+
from azure.ai.evaluation._http_utils import get_async_http_client
|
|
16
|
+
from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
|
|
17
|
+
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
18
|
+
from azure.ai.evaluation.simulator import AdversarialScenario
|
|
19
|
+
from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario
|
|
20
|
+
|
|
21
|
+
from ._conversation import CallbackConversationBot, ConversationBot, ConversationRole
|
|
22
|
+
from ._conversation._conversation import simulate_conversation
|
|
23
|
+
from ._model_tools import (
|
|
24
|
+
AdversarialTemplateHandler,
|
|
25
|
+
ManagedIdentityAPITokenManager,
|
|
26
|
+
ProxyChatCompletionsModel,
|
|
27
|
+
RAIClient,
|
|
28
|
+
TokenScope,
|
|
29
|
+
)
|
|
30
|
+
from ._tracing import monitor_adversarial_scenario
|
|
31
|
+
from ._utils import JsonLineList
|
|
32
|
+
from ._constants import SupportedLanguages
|
|
33
|
+
|
|
34
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class AdversarialSimulator:
|
|
38
|
+
"""
|
|
39
|
+
Initializes the adversarial simulator with a project scope.
|
|
40
|
+
|
|
41
|
+
:param azure_ai_project: The scope of the Azure AI project. It contains subscription id, resource group, and project
|
|
42
|
+
name.
|
|
43
|
+
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
|
|
44
|
+
:param credential: The credential for connecting to Azure AI project.
|
|
45
|
+
:type credential: ~azure.core.credentials.TokenCredential
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
def __init__(self, *, azure_ai_project: AzureAIProject, credential=None):
    """Validate the project scope and build the service clients used for simulation.

    :keyword azure_ai_project: Mapping that must contain non-empty ``subscription_id``,
        ``resource_group_name`` and ``project_name`` entries.
    :keyword credential: Credential used for the token manager. Ignored when
        ``azure_ai_project`` carries its own ``"credential"`` entry; when neither is
        given a ``DefaultAzureCredential`` is created.
    :raises EvaluationException: If a required key is absent or its value is falsy.
    """
    required_keys = ("subscription_id", "resource_group_name", "project_name")

    # Every required key has to be present ...
    if any(name not in azure_ai_project for name in required_keys):
        msg = "azure_ai_project must contain keys: subscription_id, resource_group_name, project_name"
        raise EvaluationException(
            message=msg,
            internal_message=msg,
            target=ErrorTarget.ADVERSARIAL_SIMULATOR,
            category=ErrorCategory.MISSING_FIELD,
            blame=ErrorBlame.USER_ERROR,
        )

    # ... and must carry a truthy value.
    if any(not azure_ai_project[name] for name in required_keys):
        msg = "subscription_id, resource_group_name and project_name cannot be None"
        raise EvaluationException(
            message=msg,
            internal_message=msg,
            target=ErrorTarget.ADVERSARIAL_SIMULATOR,
            category=ErrorCategory.MISSING_FIELD,
            blame=ErrorBlame.USER_ERROR,
        )

    # NOTE(review): a credential stored in the project mapping wins over the explicit
    # ``credential`` argument -- confirm that this precedence is intended.
    if "credential" in azure_ai_project:
        credential = azure_ai_project["credential"]
    elif not credential:
        credential = DefaultAzureCredential()

    self.azure_ai_project = azure_ai_project
    self.token_manager = ManagedIdentityAPITokenManager(
        token_scope=TokenScope.DEFAULT_AZURE_MANAGEMENT,
        logger=logging.getLogger("AdversarialSimulator"),
        credential=credential,
    )
    self.rai_client = RAIClient(azure_ai_project=azure_ai_project, token_manager=self.token_manager)
    self.adversarial_template_handler = AdversarialTemplateHandler(
        azure_ai_project=azure_ai_project, rai_client=self.rai_client
    )
|
|
84
|
+
|
|
85
|
+
def _ensure_service_dependencies(self):
|
|
86
|
+
if self.rai_client is None:
|
|
87
|
+
msg = "RAI service is required for simulation, but an RAI client was not provided."
|
|
88
|
+
raise EvaluationException(
|
|
89
|
+
message=msg,
|
|
90
|
+
internal_message=msg,
|
|
91
|
+
target=ErrorTarget.ADVERSARIAL_SIMULATOR,
|
|
92
|
+
category=ErrorCategory.MISSING_FIELD,
|
|
93
|
+
blame=ErrorBlame.USER_ERROR,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
# @monitor_adversarial_scenario
|
|
97
|
+
async def __call__(
    self,
    *,
    # Note: the scenario input also accepts inputs from _UnstableAdversarialScenario, but that's
    # not stated since those values are nominally for internal use only.
    scenario: AdversarialScenario,
    target: Callable,
    max_conversation_turns: int = 1,
    max_simulation_results: int = 3,
    api_call_retry_limit: int = 3,
    api_call_retry_sleep_sec: int = 1,
    api_call_delay_sec: int = 0,
    concurrent_async_task: int = 3,
    _jailbreak_type: Optional[str] = None,
    language: SupportedLanguages = SupportedLanguages.English,
    randomize_order: bool = True,
    randomization_seed: Optional[int] = None,
):
    """
    Executes the adversarial simulation against a specified target function asynchronously.

    :keyword scenario: Enum value specifying the adversarial scenario used for generating inputs.
     example:

     - :py:const:`azure.ai.evaluation.simulator.AdversarialScenario.ADVERSARIAL_QA`
     - :py:const:`azure.ai.evaluation.simulator.AdversarialScenario.ADVERSARIAL_CONVERSATION`
    :paramtype scenario: azure.ai.evaluation.simulator.AdversarialScenario
    :keyword target: The target function to simulate adversarial inputs against.
        This function should be asynchronous and accept a dictionary representing the adversarial input.
    :paramtype target: Callable
    :keyword max_conversation_turns: The maximum number of conversation turns to simulate.
        Only honored for ``ADVERSARIAL_CONVERSATION``, where it is doubled to obtain the turn limit;
        every other scenario uses a fixed turn limit of 2. Defaults to 1.
    :paramtype max_conversation_turns: int
    :keyword max_simulation_results: The maximum number of simulation results to return.
        Defaults to 3.
    :paramtype max_simulation_results: int
    :keyword api_call_retry_limit: The maximum number of retries for each API call within the simulation.
        Defaults to 3.
    :paramtype api_call_retry_limit: int
    :keyword api_call_retry_sleep_sec: The sleep duration (in seconds) between retries for API calls.
        Defaults to 1 second.
    :paramtype api_call_retry_sleep_sec: int
    :keyword api_call_delay_sec: The delay (in seconds) before making an API call.
        This can be used to avoid hitting rate limits. Defaults to 0 seconds.
    :paramtype api_call_delay_sec: int
    :keyword concurrent_async_task: The number of asynchronous tasks to run concurrently during the simulation.
        Values above 1000 are capped at 1000. Defaults to 3.
    :paramtype concurrent_async_task: int
    :keyword language: The language in which the conversation should be generated. Defaults to English.
    :paramtype language: azure.ai.evaluation.simulator.SupportedLanguages
    :keyword randomize_order: Whether or not the order of the prompts should be randomized. Defaults to True.
    :paramtype randomize_order: bool
    :keyword randomization_seed: The seed used to randomize prompt selection. If unset, the system's
        default seed is used. Defaults to None.
    :paramtype randomization_seed: Optional[int]
    :return: A list of dictionaries, each representing a simulated conversation. Each dictionary contains:

     - 'template_parameters': A dictionary with parameters used in the conversation template,
       including 'conversation_starter'.
     - 'messages': A list of dictionaries, each representing a turn in the conversation.
       Each message dictionary includes 'content' (the message text) and
       'role' (indicating whether the message is from the 'user' or the 'assistant').
     - '**$schema**': A string indicating the schema URL for the conversation format.

     The 'content' for 'assistant' role messages may include the messages that your callback returned.
    :rtype: List[Dict[str, Any]]
    :raises EvaluationException: If ``scenario`` is not a member of a supported scenario enum, or if
        no RAI client is available.

    **Output format**

    .. code-block:: python

        return_value = [
            {
                'template_parameters': {},
                'messages': [
                    {
                        'content': '<jailbreak prompt> <adversarial query>',
                        'role': 'user'
                    },
                    {
                        'content': "<response from endpoint>",
                        'role': 'assistant',
                        'context': None
                    }
                ],
                '$schema': 'http://azureml/sdk-2-0/ChatConversation.json'
            }
        ]
    """

    # validate the inputs
    if scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION:
        max_conversation_turns = 2
    else:
        max_conversation_turns = max_conversation_turns * 2
    if not (
        scenario in AdversarialScenario.__members__.values()
        or scenario in _UnstableAdversarialScenario.__members__.values()
    ):
        msg = f"Invalid scenario: {scenario}. Supported scenarios are: {AdversarialScenario.__members__.values()}"
        raise EvaluationException(
            message=msg,
            internal_message=msg,
            target=ErrorTarget.ADVERSARIAL_SIMULATOR,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.USER_ERROR,
        )
    self._ensure_service_dependencies()
    templates = await self.adversarial_template_handler._get_content_harm_template_collections(scenario.value)
    concurrent_async_task = min(concurrent_async_task, 1000)
    semaphore = asyncio.Semaphore(concurrent_async_task)
    sim_results = []
    tasks = []
    total_tasks = sum(len(t.template_parameters) for t in templates)
    if max_simulation_results > total_tasks:
        logger.warning(
            "Cannot provide %s results due to maximum number of adversarial simulations that can be generated: %s."
            "\n %s simulations will be generated.",
            max_simulation_results,
            total_tasks,
            total_tasks,
        )
    # Clamp at zero so a non-positive request yields an empty progress bar, not a negative total.
    total_tasks = max(0, min(total_tasks, max_simulation_results))
    if _jailbreak_type:
        jailbreak_dataset = await self.rai_client.get_jailbreaks_dataset(type=_jailbreak_type)
    progress_bar = tqdm(
        total=total_tasks,
        desc="generating jailbreak simulations" if _jailbreak_type else "generating simulations",
        ncols=100,
        unit="simulations",
    )
    try:
        for template in templates:
            # Check the limit before scheduling so that a limit of zero schedules nothing.
            if len(tasks) >= max_simulation_results:
                break
            parameter_order = list(range(len(template.template_parameters)))
            if randomize_order:
                # The template parameter lists are persistent across sim runs within a session,
                # so randomize the selection order instead of shuffling the parameter list
                # directly or making a potentially large deep copy.
                if randomization_seed is not None:
                    # NOTE: this reseeds the process-wide ``random`` generator.
                    random.seed(randomization_seed)
                random.shuffle(parameter_order)
            for index in parameter_order:
                if len(tasks) >= max_simulation_results:
                    break
                parameter = template.template_parameters[index].copy()
                if _jailbreak_type == "upia":
                    parameter = self._join_conversation_starter(parameter, random.choice(jailbreak_dataset))
                tasks.append(
                    asyncio.create_task(
                        self._simulate_async(
                            target=target,
                            template=template,
                            parameters=parameter,
                            max_conversation_turns=max_conversation_turns,
                            api_call_retry_limit=api_call_retry_limit,
                            api_call_retry_sleep_sec=api_call_retry_sleep_sec,
                            api_call_delay_sec=api_call_delay_sec,
                            language=language,
                            semaphore=semaphore,
                        )
                    )
                )
        for task in asyncio.as_completed(tasks):
            sim_results.append(await task)
            progress_bar.update(1)
    finally:
        # On failure, do not leave already-scheduled simulations running unobserved.
        # On success every task is done and this loop is a no-op.
        for pending in tasks:
            if not pending.done():
                pending.cancel()
        progress_bar.close()

    return JsonLineList(sim_results)
|
|
266
|
+
|
|
267
|
+
def _to_chat_protocol(self, *, conversation_history, template_parameters: Dict = None):
|
|
268
|
+
if template_parameters is None:
|
|
269
|
+
template_parameters = {}
|
|
270
|
+
messages = []
|
|
271
|
+
for _, m in enumerate(conversation_history):
|
|
272
|
+
message = {"content": m.message, "role": m.role.value}
|
|
273
|
+
if "context" in m.full_response:
|
|
274
|
+
message["context"] = m.full_response["context"]
|
|
275
|
+
messages.append(message)
|
|
276
|
+
conversation_category = template_parameters.pop("metadata", {}).get("Category")
|
|
277
|
+
template_parameters["metadata"] = {}
|
|
278
|
+
for key in (
|
|
279
|
+
"conversation_starter",
|
|
280
|
+
"group_of_people",
|
|
281
|
+
"target_population",
|
|
282
|
+
"topic",
|
|
283
|
+
"ch_template_placeholder",
|
|
284
|
+
):
|
|
285
|
+
template_parameters.pop(key, None)
|
|
286
|
+
if conversation_category:
|
|
287
|
+
template_parameters["category"] = conversation_category
|
|
288
|
+
return {
|
|
289
|
+
"template_parameters": template_parameters,
|
|
290
|
+
"messages": messages,
|
|
291
|
+
"$schema": "http://azureml/sdk-2-0/ChatConversation.json",
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
async def _simulate_async(
    self,
    *,
    target: Callable,
    template,
    parameters,
    max_conversation_turns,
    api_call_retry_limit,
    api_call_retry_sleep_sec,
    api_call_delay_sec,
    language,
    semaphore,
) -> Dict[str, Any]:
    """Run one simulated conversation and return it in chat-protocol form.

    :keyword target: Callback that plays the assistant side of the conversation.
    :keyword template: Template used to set up both bots.
    :keyword parameters: Template parameters for this conversation.
    :keyword max_conversation_turns: Passed to ``simulate_conversation`` as ``turn_limit``.
    :keyword api_call_retry_limit: Total number of retries configured on the HTTP retry policy.
    :keyword api_call_retry_sleep_sec: Backoff factor of the (fixed-mode) retry policy.
    :keyword api_call_delay_sec: Delay forwarded to ``simulate_conversation``.
    :keyword language: Language forwarded to ``simulate_conversation``.
    :keyword semaphore: Async context manager held for the duration of the conversation;
        the caller uses it to bound concurrency.
    :return: The single conversation dictionary produced by ``_to_chat_protocol``.
    :rtype: Dict[str, Any]
    """
    user_bot = self._setup_bot(role=ConversationRole.USER, template=template, parameters=parameters)
    system_bot = self._setup_bot(
        target=target, role=ConversationRole.ASSISTANT, template=template, parameters=parameters
    )
    bots = [user_bot, system_bot]
    session = get_async_http_client().with_policies(
        retry_policy=AsyncRetryPolicy(
            retry_total=api_call_retry_limit,
            retry_backoff_factor=api_call_retry_sleep_sec,
            retry_mode=RetryMode.Fixed,
        )
    )

    # The semaphore is entered first, so the session is only opened once a slot is free.
    async with semaphore, session:
        _, conversation_history = await simulate_conversation(
            bots=bots,
            session=session,
            turn_limit=max_conversation_turns,
            api_call_delay_sec=api_call_delay_sec,
            language=language,
        )
    return self._to_chat_protocol(conversation_history=conversation_history, template_parameters=parameters)
|
|
329
|
+
|
|
330
|
+
def _get_user_proxy_completion_model(self, template_key, template_parameters):
    """Create the proxy chat-completions model used by the simulated user.

    :param template_key: Template key forwarded to the model.
    :param template_parameters: Template parameters forwarded to the model.
    :return: A ``ProxyChatCompletionsModel`` pointed at the RAI client's
        ``simulation_submit_endpoint`` and authenticated through ``self.token_manager``.
    """
    model_settings = {
        "name": "raisvc_proxy_model",
        "template_key": template_key,
        "template_parameters": template_parameters,
        "endpoint_url": self.rai_client.simulation_submit_endpoint,
        "token_manager": self.token_manager,
        "api_version": "2023-07-01-preview",
        "max_tokens": 1200,
        "temperature": 0.0,
    }
    return ProxyChatCompletionsModel(**model_settings)
|
|
341
|
+
|
|
342
|
+
def _setup_bot(self, *, role, template, parameters, target: Optional[Callable] = None):
    """Create the conversation bot for one side of the simulated conversation.

    :keyword role: ``ConversationRole.USER`` or ``ConversationRole.ASSISTANT``.
    :keyword template: Conversation template; it is stringified before being handed to the bot.
    :keyword parameters: Template parameters for this conversation.
    :keyword target: Callback wrapped by the assistant bot. Unused for the user role.
    :return: A ``ConversationBot`` for the user role, a ``CallbackConversationBot`` for the
        assistant role.
    :raises EvaluationException: If ``role`` is neither USER nor ASSISTANT.
    """
    if role == ConversationRole.USER:
        model = self._get_user_proxy_completion_model(
            template_key=template.template_name, template_parameters=parameters
        )
        return ConversationBot(
            role=role,
            model=model,
            conversation_template=str(template),
            instantiation_parameters=parameters,
        )

    if role == ConversationRole.ASSISTANT:
        # Placeholder passed as ``model``: a bare callable that only carries a ``name`` attribute.
        dummy_model = lambda: None  # noqa: E731
        dummy_model.name = "dummy_model"
        return CallbackConversationBot(
            callback=target,
            role=role,
            model=dummy_model,
            user_template=str(template),
            user_template_parameters=parameters,
            conversation_template="",
            instantiation_parameters={},
        )

    # No model exists for any other role. Previously this path fell through to code that
    # referenced an unassigned ``model`` variable and failed with UnboundLocalError.
    msg = f"Unsupported conversation role: {role}"
    raise EvaluationException(
        message=msg,
        internal_message=msg,
        target=ErrorTarget.ADVERSARIAL_SIMULATOR,
        category=ErrorCategory.INVALID_VALUE,
        blame=ErrorBlame.USER_ERROR,
    )
|
|
372
|
+
|
|
373
|
+
def _join_conversation_starter(self, parameters, to_join):
|
|
374
|
+
key = "conversation_starter"
|
|
375
|
+
if key in parameters.keys():
|
|
376
|
+
parameters[key] = f"{to_join} {parameters[key]}"
|
|
377
|
+
else:
|
|
378
|
+
parameters[key] = to_join
|
|
379
|
+
|
|
380
|
+
return parameters
|
|
381
|
+
|
|
382
|
+
def call_sync(
    self,
    *,
    scenario: AdversarialScenario,
    max_conversation_turns: int,
    max_simulation_results: int,
    target: Callable,
    api_call_retry_limit: int,
    api_call_retry_sleep_sec: int,
    api_call_delay_sec: int,
    concurrent_async_task: int,
) -> List[Dict[str, Any]]:
    """Call the adversarial simulator synchronously.

    When no event loop is running in the calling thread the simulation is executed with
    ``asyncio.run``. When a loop is already running (for example inside a notebook), the
    simulation runs on its own event loop in a worker thread and the calling thread blocks
    until it finishes; calling ``run_until_complete`` on the running loop would raise
    ``RuntimeError``.

    :keyword scenario: Enum value specifying the adversarial scenario used for generating inputs.
     example:

     - :py:const:`azure.ai.evaluation.simulator.AdversarialScenario.ADVERSARIAL_QA`
     - :py:const:`azure.ai.evaluation.simulator.AdversarialScenario.ADVERSARIAL_CONVERSATION`
    :paramtype scenario: azure.ai.evaluation.simulator.AdversarialScenario

    :keyword max_conversation_turns: The maximum number of conversation turns to simulate.
    :paramtype max_conversation_turns: int
    :keyword max_simulation_results: The maximum number of simulation results to return.
    :paramtype max_simulation_results: int
    :keyword target: The target function to simulate adversarial inputs against.
    :paramtype target: Callable
    :keyword api_call_retry_limit: The maximum number of retries for each API call within the simulation.
    :paramtype api_call_retry_limit: int
    :keyword api_call_retry_sleep_sec: The sleep duration (in seconds) between retries for API calls.
    :paramtype api_call_retry_sleep_sec: int
    :keyword api_call_delay_sec: The delay (in seconds) before making an API call.
    :paramtype api_call_delay_sec: int
    :keyword concurrent_async_task: The number of asynchronous tasks to run concurrently during the simulation.
    :paramtype concurrent_async_task: int
    :return: A list of dictionaries, each representing a simulated conversation.
    :rtype: List[Dict[str, Any]]
    """
    # Local import: only needed for the "loop already running" fallback below.
    import concurrent.futures

    call_kwargs = {
        "scenario": scenario,
        "max_conversation_turns": max_conversation_turns,
        "max_simulation_results": max_simulation_results,
        "target": target,
        "api_call_retry_limit": api_call_retry_limit,
        "api_call_retry_sleep_sec": api_call_retry_sleep_sec,
        "api_call_delay_sec": api_call_delay_sec,
        "concurrent_async_task": concurrent_async_task,
    }

    def _run_simulation():
        # The coroutine is created inside the thread whose event loop will drive it.
        return asyncio.run(self(**call_kwargs))

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run can be used directly.
        return _run_simulation()

    # A loop is already running here and cannot be re-entered from synchronous code.
    # NOTE(review): ``target`` is then awaited on the worker thread's loop -- confirm that
    # callbacks do not depend on objects bound to the caller's loop.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(_run_simulation).result()
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class SupportedLanguages(Enum):
    """Supported languages for evaluation, using ISO standard language codes.

    Each member's value is the language code string.
    """

    Spanish = "es"
    Italian = "it"
    French = "fr"
    German = "de"
    # The only value here that combines a language and a region subtag.
    SimplifiedChinese = "zh-cn"
    Portuguese = "pt"
    Japanese = "ja"
    English = "en"
|