azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
- azure/ai/evaluation/__init__.py +13 -2
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +86 -50
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +124 -3
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +60 -54
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +24 -15
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +21 -21
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1286 -739
- azure/ai/evaluation/red_team/_red_team_result.py +43 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +32 -32
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +2 -12
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +26 -15
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +10 -8
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +9 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +15 -1
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -131
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py CHANGED

@@ -49,6 +49,7 @@ JAILBREAK_EXT = "_Jailbreak"
 DATA_EXT = "_Data.jsonl"
 RESULTS_EXT = "_Results.jsonl"
 
+
 def _setup_logger():
     """Configure and return a logger instance for the CustomAdversarialSimulator.
 
@@ -115,7 +116,6 @@ class _SafetyEvaluation:
         self.credential = credential
         self.logger = _setup_logger()
 
-
     @staticmethod
     def _validate_model_config(model_config: Any):
         """
@@ -158,7 +158,9 @@ class _SafetyEvaluation:
         max_simulation_results: int = 3,
         conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
         tasks: List[str] = [],
-        adversarial_scenario: Optional[
+        adversarial_scenario: Optional[
+            Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]
+        ] = None,
         source_text: Optional[str] = None,
         direct_attack: bool = False,
         randomization_seed: Optional[int] = None,
@@ -185,47 +187,53 @@ class _SafetyEvaluation:
        :type direct_attack: bool
        """
 
-        ##
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        ## Check if target is already a callback-style function
+        if self._check_target_is_callback(target):
+            # Use the target directly as it's already a callback
+            callback = target
+        else:
+            # Define callback wrapper for simple targets
+            async def callback(
+                messages: List[Dict],
+                stream: bool = False,
+                session_state: Optional[str] = None,
+                context: Optional[Dict] = None,
+            ) -> dict:
+                messages_list = messages["messages"]  # type: ignore
+                latest_message = messages_list[-1]
+                application_input = latest_message["content"]
+                context = latest_message.get("context", None)
+                latest_context = None
+                try:
+                    is_async = self._is_async_function(target)
+                    if self._check_target_returns_context(target):
+                        if is_async:
+                            response, latest_context = await target(query=application_input)
+                        else:
+                            response, latest_context = target(query=application_input)
                     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        "
-
-
+                        if is_async:
+                            response = await target(query=application_input)
+                        else:
+                            response = target(query=application_input)
+                except Exception as e:
+                    response = f"Something went wrong {e!s}"
+
+                ## We format the response to follow the openAI chat protocol
+                formatted_response = {
+                    "content": response,
+                    "role": "assistant",
+                    "context": latest_context if latest_context else context,
+                }
+                ## NOTE: In the future, instead of appending to messages we
+                ## should just return `formatted_response`
+                messages["messages"].append(formatted_response)  # type: ignore
+                return {
+                    "messages": messages_list,
+                    "stream": stream,
+                    "session_state": session_state,
+                    "context": latest_context if latest_context else context,
+                }
 
         ## Run simulator
         simulator = None
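For context, the new callback wrapper above normalizes a plain `query -> response` target into the OpenAI-style chat protocol the simulators consume. A minimal, self-contained sketch of that message shape (the `my_target` and `wrap_response` names are illustrative, not part of the package):

```python
from typing import Any, Dict, List


def my_target(query: str) -> str:
    # Hypothetical user-supplied target; not part of azure-ai-evaluation.
    return f"Echo: {query}"


def wrap_response(messages: Dict[str, List[Dict[str, Any]]]) -> Dict[str, Any]:
    # Mirrors the shape produced by the callback wrapper in the hunk above.
    latest = messages["messages"][-1]
    reply = my_target(query=latest["content"])
    formatted = {"content": reply, "role": "assistant", "context": latest.get("context")}
    messages["messages"].append(formatted)
    return {
        "messages": messages["messages"],
        "stream": False,
        "session_state": None,
        "context": latest.get("context"),
    }


print(wrap_response({"messages": [{"role": "user", "content": "hello", "context": None}]}))
```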
@@ -248,7 +256,7 @@ class _SafetyEvaluation:
                 text=source_text,
                 target=callback,
                 randomization_seed=randomization_seed,
-                concurrent_async_task=concurrent_async_tasks
+                concurrent_async_task=concurrent_async_tasks,
             )
 
         # if DirectAttack, run DirectAttackSimulator
@@ -291,14 +299,14 @@ class _SafetyEvaluation:
             )
             simulator = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
             simulator_outputs = await simulator(
-                scenario=adversarial_scenario,
+                scenario=adversarial_scenario,  # type: ignore
                 max_conversation_turns=max_conversation_turns,
                 max_simulation_results=max_simulation_results,
                 conversation_turns=conversation_turns,
                 target=callback,
                 text=source_text,
                 randomization_seed=randomization_seed,
-                concurrent_async_task=concurrent_async_tasks
+                concurrent_async_task=concurrent_async_tasks,
             )
 
         ## If no outputs are generated, raise an exception
@@ -312,9 +320,9 @@ class _SafetyEvaluation:
                 category=ErrorCategory.UNKNOWN,
                 blame=ErrorBlame.USER_ERROR,
             )
-
+
         data_path_base = simulator.__class__.__name__
-
+
         ## Write outputs to file according to scenario
         if direct_attack and jailbreak_outputs:
             jailbreak_data_path = data_path_base + JAILBREAK_EXT
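As a side note, the suffix constants at the top of the module (`_Data.jsonl`, `_Jailbreak`, `_Results.jsonl`) combine with the simulator class name to form the output file names used here and in the evaluation step further below. A short illustrative sketch, using the constant values shown in the hunks above:

```python
# Illustrative only: how output file names are assembled from the suffix constants above.
JAILBREAK_EXT = "_Jailbreak"
DATA_EXT = "_Data.jsonl"
RESULTS_EXT = "_Results.jsonl"

data_path_base = "AdversarialSimulator"  # simulator.__class__.__name__
print(data_path_base + DATA_EXT)       # AdversarialSimulator_Data.jsonl
print(data_path_base + JAILBREAK_EXT)  # key/path used for the jailbreak variant

strategy = data_path_base + JAILBREAK_EXT
output_prefix = "my-eval_"  # evaluation_name + "_" when an evaluation name is given
print(f"{output_prefix}{strategy}{RESULTS_EXT}")  # my-eval_AdversarialSimulator_Jailbreak_Results.jsonl
```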
@@ -360,7 +368,7 @@ class _SafetyEvaluation:
                 ]
             )
             simulator_data_paths[data_path_base] = data_path_base + DATA_EXT
-
+
         return simulator_data_paths
 
     def _get_scenario(
@@ -497,7 +505,7 @@ class _SafetyEvaluation:
                 blame=ErrorBlame.USER_ERROR,
             )
         return evaluators_dict
-
+
     @staticmethod
     def _check_target_returns_context(target: Callable) -> bool:
         """
@@ -510,7 +518,7 @@ class _SafetyEvaluation:
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
-
+
         # Check for Coroutine/Awaitable return types for async functions
         origin = getattr(ret_type, "__origin__", None)
         if origin is not None and (origin is Coroutine or origin is Awaitable):
@@ -518,24 +526,24 @@ class _SafetyEvaluation:
             if args and len(args) > 0:
                 # For async functions, check the actual return type inside the Coroutine
                 ret_type = args[-1]
-
+
         if ret_type is tuple:
             return True
         return False
-
+
     @staticmethod
     def _check_target_returns_str(target: Callable) -> bool:
-
+        """
         Checks if the target function returns a string.
 
         :param target: The target function to check.
         :type target: Callable
-
+        """
         sig = inspect.signature(target)
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
-
+
         # Check for Coroutine/Awaitable return types for async functions
         origin = getattr(ret_type, "__origin__", None)
         if origin is not None and (origin is Coroutine or origin is Awaitable):
@@ -543,36 +551,36 @@ class _SafetyEvaluation:
             if args and len(args) > 0:
                 # For async functions, check the actual return type inside the Coroutine
                 ret_type = args[-1]
-
+
         if ret_type is str:
             return True
         return False
-
+
     @staticmethod
     def _is_async_function(target: Callable) -> bool:
         """
         Checks if the target function is an async function.
-
+
         :param target: The target function to check.
         :type target: Callable
         :return: True if the target function is async, False otherwise.
         :rtype: bool
         """
         return asyncio.iscoroutinefunction(target)
-
+
     @staticmethod
     def _check_target_is_callback(target: Callable) -> bool:
         sig = inspect.signature(target)
         param_names = list(sig.parameters.keys())
-        return
+        return "messages" in param_names and "session_state" in param_names and "context" in param_names
 
     def _validate_inputs(
-
-
-
-
-
-
+        self,
+        evaluators: List[_SafetyEvaluator],
+        target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        num_turns: int = 1,
+        scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+        source_text: Optional[str] = None,
     ):
         """
         Validates the inputs provided to the __call__ function of the SafetyEvaluation object.
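The `_check_target_is_callback` and `_check_target_returns_str` helpers above classify a user-supplied target via `inspect.signature`. The snippet below is a simplified, standalone sketch of the same idea (the real helpers also unwrap `Coroutine`/`Awaitable` return annotations for async targets); the names are illustrative, not package APIs:

```python
import inspect
from typing import Callable


def looks_like_callback(target: Callable) -> bool:
    # Callback-style targets expose messages/session_state/context parameters.
    params = inspect.signature(target).parameters
    return all(name in params for name in ("messages", "session_state", "context"))


def returns_str(target: Callable) -> bool:
    # Simple targets are expected to annotate a `str` return type.
    return inspect.signature(target).return_annotation is str


def my_target(query: str) -> str:  # hypothetical example target
    return f"Response to: {query}"


print(looks_like_callback(my_target), returns_str(my_target))  # False True
```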
@@ -586,12 +594,28 @@ class _SafetyEvaluation:
        :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
        :param source_text: The source text to use as grounding document in the evaluation.
        :type source_text: Optional[str]
-        """
+        """
        if not callable(target):
            self._validate_model_config(target)
-        elif not self._check_target_returns_str(target):
-
-
+        elif not self._check_target_is_callback(target) and not self._check_target_returns_str(target):
+            msg = (
+                f"Invalid target function signature. The target function must be either:\n\n"
+                f"1. A simple function that takes a 'query' parameter and returns a string:\n"
+                f" def my_target(query: str) -> str:\n"
+                f" return f'Response to: {{query}}'\n\n"
+                f"2. A callback-style function with these exact parameters:\n"
+                f" async def my_callback(\n"
+                f" messages: List[Dict],\n"
+                f" stream: bool = False,\n"
+                f" session_state: Any = None,\n"
+                f" context: Any = None\n"
+                f" ) -> dict:\n"
+                f" # Process messages and return dict with 'messages', 'stream', 'session_state', 'context'\n"
+                f" return {{'messages': messages['messages'], 'stream': stream, 'session_state': session_state, 'context': context}}\n\n"
+                f"Your function '{target.__name__}' does not match either pattern. "
+                f"Please check the function signature and return type."
+            )
+            self.logger.error(msg)
            raise EvaluationException(
                message=msg,
                internal_message=msg,
@@ -610,8 +634,8 @@ class _SafetyEvaluation:
                 category=ErrorCategory.MISSING_FIELD,
                 blame=ErrorBlame.USER_ERROR,
             )
-
-        if scenario and len(evaluators)>0 and not _SafetyEvaluator.CONTENT_SAFETY in evaluators:
+
+        if scenario and len(evaluators) > 0 and not _SafetyEvaluator.CONTENT_SAFETY in evaluators:
             self.logger.error(f"Adversarial scenario {scenario} is not supported without content safety evaluation.")
             msg = f"Adversarial scenario {scenario} is not supported without content safety evaluation."
             raise EvaluationException(
@@ -621,7 +645,7 @@ class _SafetyEvaluation:
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
-
+
         if _SafetyEvaluator.CODE_VULNERABILITY in evaluators and num_turns > 1:
             self.logger.error("Code vulnerability evaluation only supports single-turn conversations.")
             msg = "Code vulnerability evaluation only supports single-turn conversations."
@@ -643,9 +667,18 @@ class _SafetyEvaluation:
                 blame=ErrorBlame.USER_ERROR,
             )
 
-        if
-
-
+        if (
+            _SafetyEvaluator.CONTENT_SAFETY in evaluators
+            and scenario
+            and num_turns > 1
+            and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION
+        ):
+            self.logger.error(
+                f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+            )
+            msg = (
+                f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+            )
             raise EvaluationException(
                 message=msg,
                 internal_message=msg,
@@ -681,18 +714,57 @@ class _SafetyEvaluation:
         jailbreak_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT in key]
         regular_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT not in key]
         evaluation_result: EvaluationResult = {"metrics": {}, "rows": [], "studio_url": ""}
-        if len(jailbreak_cols)>0 and len(regular_cols)>0:
+        if len(jailbreak_cols) > 0 and len(regular_cols) > 0:
             hate_unfairness_defects = []
             violence_defects = []
             sexual_defects = []
             self_harm_defects = []
-            jailbreak_rows = sorted(
-
+            jailbreak_rows = sorted(
+                evaluation_result_dict[jailbreak_cols[0]]["rows"],
+                key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
+            )
+            regular_rows = sorted(
+                evaluation_result_dict[regular_cols[0]]["rows"],
+                key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
+            )
             for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
-                hate_unfairness_defects.append(
-
-
-
+                hate_unfairness_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.hate_unfairness_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.hate_unfairness_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
+                violence_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.violence_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.violence_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
+                sexual_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.sexual_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.sexual_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
+                self_harm_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.self_harm_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.self_harm_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
             hate_unfairness_defect_rate = list_mean_nan_safe(hate_unfairness_defects)
             violence_defect_rate = list_mean_nan_safe(violence_defects)
             sexual_defect_rate = list_mean_nan_safe(sexual_defects)
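For readers unfamiliar with the defect-rate aggregation above: each paired row contributes 1 when the jailbreak run's score exceeds the threshold while the matching regular run's score does not, and the defect rate is the mean of those indicator values (the package uses its internal `list_mean_nan_safe` helper for the mean). A small self-contained illustration with made-up scores and a stand-in threshold:

```python
# Self-contained illustration of the jailbreak defect-rate aggregation shown above.
# Scores and the threshold value are made up for the example.
THRESHOLD = 3  # stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT

jailbreak_scores = [5, 2, 6, 1]  # scores from the jailbreak-augmented runs
regular_scores = [2, 2, 5, 1]    # scores from the matching regular runs

defects = [
    1 if (j > THRESHOLD and not r > THRESHOLD) else 0
    for j, r in zip(jailbreak_scores, regular_scores)
]
defect_rate = sum(defects) / len(defects)
print(defects, defect_rate)  # [1, 0, 0, 0] 0.25
```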
@@ -708,30 +780,32 @@ class _SafetyEvaluation:
                 "content_safety.self_harm_defect_rate": self_harm_defect_rate,
             }
             evaluation_result["studio_url"] = (
-                evaluation_result_dict[jailbreak_cols[0]]["studio_url"]
+                evaluation_result_dict[jailbreak_cols[0]]["studio_url"]
+                + "\t"
+                + evaluation_result_dict[regular_cols[0]]["studio_url"]
             )
         return evaluation_result
-
+
     async def __call__(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self,
+        target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        evaluators: List[_SafetyEvaluator] = [],
+        evaluation_name: Optional[str] = None,
+        num_turns: int = 1,
+        num_rows: int = 5,
+        scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+        conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
+        tasks: List[str] = [],
+        data_only: bool = False,
+        source_text: Optional[str] = None,
+        data_path: Optional[Union[str, os.PathLike]] = None,
+        jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
+        output_path: Optional[Union[str, os.PathLike]] = None,
+        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]] = None,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
+    ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str, os.PathLike]]]:
+        """
         Evaluates the target function based on the provided parameters.
 
         :param target: The target function to call during the evaluation. This can be a synchronous or asynchronous function.
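Taken together with the widened `__call__` signature above, invoking this helper might look roughly like the sketch below. This is an assumption-laden example: `_SafetyEvaluation` and `_SafetyEvaluator` are private, their import locations are inferred from the file list, and the constructor arguments (`azure_ai_project`, `credential`) and the project-endpoint format are inferred from the `__init__` fields and simulator calls shown earlier in this diff.

```python
# Hedged sketch only: private API; import paths, constructor arguments, and the
# azure_ai_project value format are assumptions inferred from this diff.
import asyncio

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation._safety_evaluation._safety_evaluation import _SafetyEvaluation, _SafetyEvaluator


def my_target(query: str) -> str:
    # Simple target shape accepted by the new validation.
    return f"Response to: {query}"


async def main():
    safety_eval = _SafetyEvaluation(
        azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",
        credential=DefaultAzureCredential(),
    )
    results = await safety_eval(
        target=my_target,
        evaluators=[_SafetyEvaluator.CONTENT_SAFETY],
        num_turns=1,
        num_rows=3,
        concurrent_async_tasks=5,
    )
    print(results)


asyncio.run(main())
```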
@@ -765,9 +839,11 @@ class _SafetyEvaluation:
        :type randomization_seed: Optional[int]
        :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
        :type concurrent_async_tasks: Optional[int]
-
+        """
        ## Log inputs
-        self.logger.info(
+        self.logger.info(
+            f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}"
+        )
 
        ## Validate arguments
        self._validate_inputs(
@@ -798,28 +874,34 @@ class _SafetyEvaluation:
                 source_text=source_text,
                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
                 randomization_seed=randomization_seed,
+                concurrent_async_tasks=concurrent_async_tasks,
             )
         elif data_path:
             data_paths = {Path(data_path).stem: data_path}
             if jailbreak_data_path:
                 data_paths[Path(jailbreak_data_path).stem + JAILBREAK_EXT] = jailbreak_data_path
 
-        if data_only and data_paths:
+        if data_only and data_paths:
+            return data_paths
 
         ## Run evaluation
         evaluation_results = {}
         if data_paths:
             for strategy, data_path in data_paths.items():
-                self.logger.info(
-
-
+                self.logger.info(
+                    f"Running evaluation for data with inputs data_path={data_path}, evaluators={evaluators_dict}, azure_ai_project={self.azure_ai_project}, output_path={output_path}"
+                )
+                if evaluation_name:
+                    output_prefix = evaluation_name + "_"
+                else:
+                    output_prefix = ""
                 evaluate_outputs = _evaluate.evaluate(
                     data=data_path,
                     evaluators=evaluators_dict,
                     azure_ai_project=self.azure_ai_project,
                     evaluation_name=evaluation_name,
                     output_path=output_path if output_path else f"{output_prefix}{strategy}{RESULTS_EXT}",
-                    _use_pf_client=False,
+                    _use_pf_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
                 )
                 evaluation_results[strategy] = evaluate_outputs
         return evaluation_results
azure/ai/evaluation/_user_agent.py CHANGED

@@ -1,6 +1,37 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from contextlib import contextmanager
+from typing import Iterator
+
 from azure.ai.evaluation._version import VERSION
 
-
+
+
+class UserAgentSingleton:
+    __BASE_USER_AGENT: str = "{}/{}".format("azure-ai-evaluation", VERSION)
+
+    @property
+    def value(self):
+        """Get the user-agent"""
+        return self.__BASE_USER_AGENT
+
+    def __str__(self) -> str:
+        return self.value
+
+    @classmethod
+    @contextmanager
+    def add_useragent_product(cls, *product: str) -> Iterator[None]:
+        """Appends a "product" (e.g. `name/version`) to the base user agent
+
+        :param product: User Agent products to append to the base user agent
+
+        ..see-also::
+
+            `User-Agent section of RFC 9110, <https://www.rfc-editor.org/rfc/rfc9110#name-user-agent>`
+        """
+        old_useragent = cls.__BASE_USER_AGENT
+        cls.__BASE_USER_AGENT = f"{old_useragent} {' '.join(product)}"
+
+        yield
+
+        cls.__BASE_USER_AGENT = old_useragent
azure/ai/evaluation/_version.py CHANGED

azure/ai/evaluation/red_team/__init__.py CHANGED
@@ -8,7 +8,9 @@ try:
     from ._attack_objective_generator import RiskCategory
     from ._red_team_result import RedTeamResult
 except ImportError:
-    print(
+    print(
+        "[INFO] Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`."
+    )
 
 
 __all__ = [