azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +51 -6
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +88 -52
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +188 -10
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +25 -17
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
CHANGED

@@ -49,6 +49,7 @@ JAILBREAK_EXT = "_Jailbreak"
 DATA_EXT = "_Data.jsonl"
 RESULTS_EXT = "_Results.jsonl"
 
+
 def _setup_logger():
     """Configure and return a logger instance for the CustomAdversarialSimulator.
 
@@ -115,7 +116,6 @@ class _SafetyEvaluation:
         self.credential = credential
         self.logger = _setup_logger()
 
-
    @staticmethod
    def _validate_model_config(model_config: Any):
        """
@@ -158,7 +158,9 @@ class _SafetyEvaluation:
         max_simulation_results: int = 3,
         conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
         tasks: List[str] = [],
-        adversarial_scenario: Optional[…
+        adversarial_scenario: Optional[
+            Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]
+        ] = None,
         source_text: Optional[str] = None,
         direct_attack: bool = False,
         randomization_seed: Optional[int] = None,
@@ -185,47 +187,53 @@ class _SafetyEvaluation:
         :type direct_attack: bool
         """
 
-        ## … (22 removed lines, old content not shown in this view)
+        ## Check if target is already a callback-style function
+        if self._check_target_is_callback(target):
+            # Use the target directly as it's already a callback
+            callback = target
+        else:
+            # Define callback wrapper for simple targets
+            async def callback(
+                messages: List[Dict],
+                stream: bool = False,
+                session_state: Optional[str] = None,
+                context: Optional[Dict] = None,
+            ) -> dict:
+                messages_list = messages["messages"]  # type: ignore
+                latest_message = messages_list[-1]
+                application_input = latest_message["content"]
+                context = latest_message.get("context", None)
+                latest_context = None
+                try:
+                    is_async = self._is_async_function(target)
+                    if self._check_target_returns_context(target):
+                        if is_async:
+                            response, latest_context = await target(query=application_input)
+                        else:
+                            response, latest_context = target(query=application_input)
                     else:
-                        … (18 removed lines, old content not shown in this view)
+                        if is_async:
+                            response = await target(query=application_input)
+                        else:
+                            response = target(query=application_input)
+                except Exception as e:
+                    response = f"Something went wrong {e!s}"
+
+                ## We format the response to follow the openAI chat protocol
+                formatted_response = {
+                    "content": response,
+                    "role": "assistant",
+                    "context": latest_context if latest_context else context,
+                }
+                ## NOTE: In the future, instead of appending to messages we
+                ## should just return `formatted_response`
+                messages["messages"].append(formatted_response)  # type: ignore
+                return {
+                    "messages": messages_list,
+                    "stream": stream,
+                    "session_state": session_state,
+                    "context": latest_context if latest_context else context,
+                }
 
         ## Run simulator
         simulator = None
@@ -248,7 +256,7 @@ class _SafetyEvaluation:
                 text=source_text,
                 target=callback,
                 randomization_seed=randomization_seed,
-                concurrent_async_task=concurrent_async_tasks
+                concurrent_async_task=concurrent_async_tasks,
             )
 
         # if DirectAttack, run DirectAttackSimulator
@@ -282,6 +290,7 @@ class _SafetyEvaluation:
                 target=callback,
                 text=source_text if source_text else "",
                 concurrent_async_tasks=concurrent_async_tasks,
+                randomization_seed=randomization_seed,
             )
 
         ## Run AdversarialSimulator
@@ -291,14 +300,14 @@ class _SafetyEvaluation:
             )
             simulator = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
             simulator_outputs = await simulator(
-                scenario=adversarial_scenario,
+                scenario=adversarial_scenario,  # type: ignore
                 max_conversation_turns=max_conversation_turns,
                 max_simulation_results=max_simulation_results,
                 conversation_turns=conversation_turns,
                 target=callback,
                 text=source_text,
                 randomization_seed=randomization_seed,
-                concurrent_async_task=concurrent_async_tasks
+                concurrent_async_task=concurrent_async_tasks,
             )
 
         ## If no outputs are generated, raise an exception
@@ -312,9 +321,9 @@ class _SafetyEvaluation:
                 category=ErrorCategory.UNKNOWN,
                 blame=ErrorBlame.USER_ERROR,
             )
-
+
         data_path_base = simulator.__class__.__name__
-
+
         ## Write outputs to file according to scenario
         if direct_attack and jailbreak_outputs:
             jailbreak_data_path = data_path_base + JAILBREAK_EXT
@@ -360,7 +369,7 @@ class _SafetyEvaluation:
                 ]
             )
             simulator_data_paths[data_path_base] = data_path_base + DATA_EXT
-
+
         return simulator_data_paths
 
     def _get_scenario(
@@ -497,7 +506,7 @@ class _SafetyEvaluation:
                 blame=ErrorBlame.USER_ERROR,
             )
         return evaluators_dict
-
+
     @staticmethod
     def _check_target_returns_context(target: Callable) -> bool:
         """
@@ -510,7 +519,7 @@ class _SafetyEvaluation:
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
-
+
         # Check for Coroutine/Awaitable return types for async functions
         origin = getattr(ret_type, "__origin__", None)
         if origin is not None and (origin is Coroutine or origin is Awaitable):
@@ -518,24 +527,24 @@ class _SafetyEvaluation:
             if args and len(args) > 0:
                 # For async functions, check the actual return type inside the Coroutine
                 ret_type = args[-1]
-
+
         if ret_type is tuple:
             return True
         return False
-
+
     @staticmethod
     def _check_target_returns_str(target: Callable) -> bool:
-        … (removed line, content not shown)
+        """
         Checks if the target function returns a string.
 
         :param target: The target function to check.
         :type target: Callable
-        … (removed line, content not shown)
+        """
         sig = inspect.signature(target)
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
-
+
         # Check for Coroutine/Awaitable return types for async functions
         origin = getattr(ret_type, "__origin__", None)
         if origin is not None and (origin is Coroutine or origin is Awaitable):
@@ -543,36 +552,36 @@ class _SafetyEvaluation:
             if args and len(args) > 0:
                 # For async functions, check the actual return type inside the Coroutine
                 ret_type = args[-1]
-
+
         if ret_type is str:
             return True
         return False
-
+
     @staticmethod
     def _is_async_function(target: Callable) -> bool:
         """
         Checks if the target function is an async function.
-
+
         :param target: The target function to check.
         :type target: Callable
         :return: True if the target function is async, False otherwise.
         :rtype: bool
         """
         return asyncio.iscoroutinefunction(target)
-
+
     @staticmethod
     def _check_target_is_callback(target: Callable) -> bool:
         sig = inspect.signature(target)
         param_names = list(sig.parameters.keys())
-        return …
+        return "messages" in param_names and "session_state" in param_names and "context" in param_names
 
     def _validate_inputs(
-        … (6 removed lines, the old signature is not shown in this view)
+        self,
+        evaluators: List[_SafetyEvaluator],
+        target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        num_turns: int = 1,
+        scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+        source_text: Optional[str] = None,
     ):
         """
         Validates the inputs provided to the __call__ function of the SafetyEvaluation object.
@@ -586,12 +595,28 @@ class _SafetyEvaluation:
         :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
         :param source_text: The source text to use as grounding document in the evaluation.
         :type source_text: Optional[str]
-        """
+        """
         if not callable(target):
             self._validate_model_config(target)
-        elif not self._check_target_returns_str(target):
-        … (2 removed lines, content not shown)
+        elif not self._check_target_is_callback(target) and not self._check_target_returns_str(target):
+            msg = (
+                f"Invalid target function signature. The target function must be either:\n\n"
+                f"1. A simple function that takes a 'query' parameter and returns a string:\n"
+                f" def my_target(query: str) -> str:\n"
+                f" return f'Response to: {{query}}'\n\n"
+                f"2. A callback-style function with these exact parameters:\n"
+                f" async def my_callback(\n"
+                f" messages: List[Dict],\n"
+                f" stream: bool = False,\n"
+                f" session_state: Any = None,\n"
+                f" context: Any = None\n"
+                f" ) -> dict:\n"
+                f" # Process messages and return dict with 'messages', 'stream', 'session_state', 'context'\n"
+                f" return {{'messages': messages['messages'], 'stream': stream, 'session_state': session_state, 'context': context}}\n\n"
+                f"Your function '{target.__name__}' does not match either pattern. "
+                f"Please check the function signature and return type."
+            )
+            self.logger.error(msg)
             raise EvaluationException(
                 message=msg,
                 internal_message=msg,
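The enlarged error message above spells out the two target shapes `_SafetyEvaluation` accepts. For orientation, here is a minimal sketch of both patterns, built only from the signatures quoted in that message and the callback wrapper earlier in this file; `my_target` and `my_callback` are placeholder names from the error text, not SDK APIs, and the dict-shaped `messages` payload mirrors how the wrapper indexes `messages["messages"]`:

from typing import Any, Dict, List

# Pattern 1: a simple target, takes a query string and returns a string.
def my_target(query: str) -> str:
    return f"Response to: {query}"

# Pattern 2: a callback-style target, receives the chat payload and returns a dict
# carrying 'messages', 'stream', 'session_state', and 'context', as the wrapper above does.
async def my_callback(
    messages: Dict[str, List[Dict]],
    stream: bool = False,
    session_state: Any = None,
    context: Any = None,
) -> dict:
    messages_list = messages["messages"]
    query = messages_list[-1]["content"]
    messages_list.append({"content": f"Response to: {query}", "role": "assistant", "context": context})
    return {"messages": messages_list, "stream": stream, "session_state": session_state, "context": context}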
@@ -610,8 +635,8 @@ class _SafetyEvaluation:
                 category=ErrorCategory.MISSING_FIELD,
                 blame=ErrorBlame.USER_ERROR,
             )
-
-        if scenario and len(evaluators)>0 and not _SafetyEvaluator.CONTENT_SAFETY in evaluators:
+
+        if scenario and len(evaluators) > 0 and not _SafetyEvaluator.CONTENT_SAFETY in evaluators:
             self.logger.error(f"Adversarial scenario {scenario} is not supported without content safety evaluation.")
             msg = f"Adversarial scenario {scenario} is not supported without content safety evaluation."
             raise EvaluationException(
@@ -621,7 +646,7 @@ class _SafetyEvaluation:
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
-
+
         if _SafetyEvaluator.CODE_VULNERABILITY in evaluators and num_turns > 1:
             self.logger.error("Code vulnerability evaluation only supports single-turn conversations.")
             msg = "Code vulnerability evaluation only supports single-turn conversations."
@@ -643,9 +668,18 @@ class _SafetyEvaluation:
                 blame=ErrorBlame.USER_ERROR,
             )
 
-        if … (plus 2 more removed lines, content not shown)
+        if (
+            _SafetyEvaluator.CONTENT_SAFETY in evaluators
+            and scenario
+            and num_turns > 1
+            and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION
+        ):
+            self.logger.error(
+                f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+            )
+            msg = (
+                f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+            )
             raise EvaluationException(
                 message=msg,
                 internal_message=msg,
@@ -681,18 +715,57 @@ class _SafetyEvaluation:
         jailbreak_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT in key]
         regular_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT not in key]
         evaluation_result: EvaluationResult = {"metrics": {}, "rows": [], "studio_url": ""}
-        if len(jailbreak_cols)>0 and len(regular_cols)>0:
+        if len(jailbreak_cols) > 0 and len(regular_cols) > 0:
             hate_unfairness_defects = []
             violence_defects = []
             sexual_defects = []
             self_harm_defects = []
-            jailbreak_rows = sorted( … (plus 1 more removed line, content not shown)
+            jailbreak_rows = sorted(
+                evaluation_result_dict[jailbreak_cols[0]]["rows"],
+                key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
+            )
+            regular_rows = sorted(
+                evaluation_result_dict[regular_cols[0]]["rows"],
+                key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
+            )
             for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
-                hate_unfairness_defects.append( … (plus 3 more removed lines, content not shown)
+                hate_unfairness_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.hate_unfairness_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.hate_unfairness_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
+                violence_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.violence_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.violence_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
+                sexual_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.sexual_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.sexual_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
+                self_harm_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.self_harm_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.self_harm_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
             hate_unfairness_defect_rate = list_mean_nan_safe(hate_unfairness_defects)
             violence_defect_rate = list_mean_nan_safe(violence_defects)
             sexual_defect_rate = list_mean_nan_safe(sexual_defects)
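In words, the block above marks a row as a defect only when the jailbreak run's score exceeds the threshold while the paired regular run's score does not, then averages the 0/1 flags per harm category. A stand-alone sketch of that rule (the threshold value and the score lists here are illustrative stand-ins for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and the outputs.content_safety.*_score columns read by the real code):

from statistics import mean

THRESHOLD = 4  # stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT

def defect_rate(jailbreak_scores, regular_scores, threshold=THRESHOLD):
    # A defect is a pair where only the jailbreak-augmented run crosses the threshold.
    defects = [
        1 if (jb > threshold and not reg > threshold) else 0
        for jb, reg in zip(jailbreak_scores, regular_scores)
    ]
    return mean(defects) if defects else 0.0

# defect_rate([6, 2, 5], [1, 3, 7]) -> 1/3: only the first pair crosses the threshold
# on the jailbreak side without also crossing it on the regular side.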
@@ -708,30 +781,32 @@ class _SafetyEvaluation:
                 "content_safety.self_harm_defect_rate": self_harm_defect_rate,
             }
             evaluation_result["studio_url"] = (
-                evaluation_result_dict[jailbreak_cols[0]]["studio_url"] …
+                evaluation_result_dict[jailbreak_cols[0]]["studio_url"]
+                + "\t"
+                + evaluation_result_dict[regular_cols[0]]["studio_url"]
             )
         return evaluation_result
-
+
     async def __call__(
-        … (19 removed lines, the old signature is not shown in this view)
+        self,
+        target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        evaluators: List[_SafetyEvaluator] = [],
+        evaluation_name: Optional[str] = None,
+        num_turns: int = 1,
+        num_rows: int = 5,
+        scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+        conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
+        tasks: List[str] = [],
+        data_only: bool = False,
+        source_text: Optional[str] = None,
+        data_path: Optional[Union[str, os.PathLike]] = None,
+        jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
+        output_path: Optional[Union[str, os.PathLike]] = None,
+        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]] = None,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
+    ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str, os.PathLike]]]:
+        """
         Evaluates the target function based on the provided parameters.
 
         :param target: The target function to call during the evaluation. This can be a synchronous or asynchronous function.
@@ -765,9 +840,11 @@ class _SafetyEvaluation:
         :type randomization_seed: Optional[int]
         :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
         :type concurrent_async_tasks: Optional[int]
-        … (removed line, content not shown)
+        """
         ## Log inputs
-        self.logger.info( …
+        self.logger.info(
+            f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}"
+        )
 
         ## Validate arguments
         self._validate_inputs(
@@ -798,28 +875,35 @@ class _SafetyEvaluation:
                 source_text=source_text,
                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
                 randomization_seed=randomization_seed,
+                concurrent_async_tasks=concurrent_async_tasks,
             )
         elif data_path:
             data_paths = {Path(data_path).stem: data_path}
             if jailbreak_data_path:
                 data_paths[Path(jailbreak_data_path).stem + JAILBREAK_EXT] = jailbreak_data_path
 
-        if data_only and data_paths: …
+        if data_only and data_paths:
+            return data_paths
 
         ## Run evaluation
         evaluation_results = {}
         if data_paths:
             for strategy, data_path in data_paths.items():
-                self.logger.info( … (plus 2 more removed lines, content not shown)
+                self.logger.info(
+                    f"Running evaluation for data with inputs data_path={data_path}, evaluators={evaluators_dict}, azure_ai_project={self.azure_ai_project}, output_path={output_path}"
+                )
+                if evaluation_name:
+                    output_prefix = evaluation_name + "_"
+                else:
+                    output_prefix = ""
                 evaluate_outputs = _evaluate.evaluate(
                     data=data_path,
                     evaluators=evaluators_dict,
                     azure_ai_project=self.azure_ai_project,
                     evaluation_name=evaluation_name,
                     output_path=output_path if output_path else f"{output_prefix}{strategy}{RESULTS_EXT}",
-                    _use_pf_client=False,
+                    _use_pf_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
+                    _use_run_submitter_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
                 )
                 evaluation_results[strategy] = evaluate_outputs
         return evaluation_results
azure/ai/evaluation/_user_agent.py
CHANGED

@@ -1,6 +1,37 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from contextlib import contextmanager
+from typing import Iterator
+
 from azure.ai.evaluation._version import VERSION
 
-… (removed line, content not shown)
+
+class UserAgentSingleton:
+    __BASE_USER_AGENT: str = "{}/{}".format("azure-ai-evaluation", VERSION)
+
+    @property
+    def value(self):
+        """Get the user-agent"""
+        return self.__BASE_USER_AGENT
+
+    def __str__(self) -> str:
+        return self.value
+
+    @classmethod
+    @contextmanager
+    def add_useragent_product(cls, *product: str) -> Iterator[None]:
+        """Appends a "product" (e.g. `name/version`) to the base user agent
+
+        :param product: User Agent products to append to the base user agent
+
+        ..see-also::
+
+            `User-Agent section of RFC 9110, <https://www.rfc-editor.org/rfc/rfc9110#name-user-agent>`
+        """
+        old_useragent = cls.__BASE_USER_AGENT
+        cls.__BASE_USER_AGENT = f"{old_useragent} {' '.join(product)}"
+
+        yield
+
+        cls.__BASE_USER_AGENT = old_useragent
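The new UserAgentSingleton replaces the old module-level constant with a class whose add_useragent_product context manager temporarily appends extra name/version products to the base user agent and restores it on exit. A minimal usage sketch, assuming only what the diff above shows (the "my-app/0.1.0" product string is a made-up example):

from azure.ai.evaluation._user_agent import UserAgentSingleton

print(UserAgentSingleton().value)  # e.g. "azure-ai-evaluation/1.10.0"

with UserAgentSingleton.add_useragent_product("my-app/0.1.0"):
    print(UserAgentSingleton().value)  # base user agent plus " my-app/0.1.0"

print(UserAgentSingleton().value)  # restored once the context exits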
azure/ai/evaluation/_version.py
CHANGED (+1 -1; presumably the VERSION bump from 1.8.0 to 1.10.0, the changed lines are collapsed in this view)

azure/ai/evaluation/red_team/__init__.py
CHANGED (file header inferred from the imports below and the +3 -1 entry in the file list)

@@ -8,7 +8,9 @@ try:
     from ._attack_objective_generator import RiskCategory
     from ._red_team_result import RedTeamResult
 except ImportError:
-    … (removed line, content not shown)
+    raise ImportError(
+        "Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`."
+    )
 
 
 __all__ = [