azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
|
@@ -7,6 +7,7 @@ import itertools
|
|
|
7
7
|
import logging
|
|
8
8
|
import math
|
|
9
9
|
import os
|
|
10
|
+
from pathlib import Path
|
|
10
11
|
import random
|
|
11
12
|
import time
|
|
12
13
|
import uuid
|
|
@@ -17,6 +18,10 @@ from tqdm import tqdm
|
|
|
17
18
|
# Azure AI Evaluation imports
|
|
18
19
|
from azure.ai.evaluation._constants import TokenScope
|
|
19
20
|
from azure.ai.evaluation._common._experimental import experimental
|
|
21
|
+
|
|
22
|
+
from azure.ai.evaluation._evaluate._evaluate import (
|
|
23
|
+
emit_eval_result_events_to_app_insights,
|
|
24
|
+
) # TODO: uncomment when app insights checked in
|
|
20
25
|
from azure.ai.evaluation._model_configurations import EvaluationResult
|
|
21
26
|
from azure.ai.evaluation.simulator._model_tools import ManagedIdentityAPITokenManager
|
|
22
27
|
from azure.ai.evaluation.simulator._model_tools._generated_rai_client import GeneratedRAIClient
|
|
@@ -65,6 +70,7 @@ from ._utils.formatting_utils import (
|
|
|
65
70
|
get_flattened_attack_strategies,
|
|
66
71
|
write_pyrit_outputs_to_file,
|
|
67
72
|
format_scorecard,
|
|
73
|
+
format_content_by_modality,
|
|
68
74
|
)
|
|
69
75
|
from ._utils.strategy_utils import get_chat_target, get_converter_for_strategy
|
|
70
76
|
from ._utils.retry_utils import create_standard_retry_manager
|
|
@@ -208,6 +214,9 @@ class RedTeam:
|
|
|
208
214
|
# keep track of prompt content to context mapping for evaluation
|
|
209
215
|
self.prompt_to_context = {}
|
|
210
216
|
|
|
217
|
+
# keep track of prompt content to risk_sub_type mapping for evaluation
|
|
218
|
+
self.prompt_to_risk_subtype = {}
|
|
219
|
+
|
|
211
220
|
# Initialize PyRIT
|
|
212
221
|
initialize_pyrit(memory_db_type=DUCK_DB)
|
|
213
222
|
|
|
@@ -276,6 +285,7 @@ class RedTeam:
|
|
|
276
285
|
one_dp_project=self._one_dp_project,
|
|
277
286
|
retry_config=retry_config,
|
|
278
287
|
scan_output_dir=self.scan_output_dir,
|
|
288
|
+
red_team=self,
|
|
279
289
|
)
|
|
280
290
|
|
|
281
291
|
# Initialize evaluation processor
|
|
@@ -287,6 +297,7 @@ class RedTeam:
|
|
|
287
297
|
retry_config=retry_config,
|
|
288
298
|
scan_session_id=self.scan_session_id,
|
|
289
299
|
scan_output_dir=self.scan_output_dir,
|
|
300
|
+
taxonomy_risk_categories=getattr(self, "taxonomy_risk_categories", None),
|
|
290
301
|
)
|
|
291
302
|
|
|
292
303
|
# Initialize MLflow integration
|
|
@@ -305,6 +316,7 @@ class RedTeam:
|
|
|
305
316
|
application_scenario=getattr(self, "application_scenario", ""),
|
|
306
317
|
risk_categories=getattr(self, "risk_categories", []),
|
|
307
318
|
ai_studio_url=getattr(self.mlflow_integration, "ai_studio_url", None),
|
|
319
|
+
mlflow_integration=self.mlflow_integration,
|
|
308
320
|
)
|
|
309
321
|
|
|
310
322
|
async def _get_attack_objectives(
|
|
@@ -312,6 +324,8 @@ class RedTeam:
|
|
|
312
324
|
risk_category: Optional[RiskCategory] = None,
|
|
313
325
|
application_scenario: Optional[str] = None,
|
|
314
326
|
strategy: Optional[str] = None,
|
|
327
|
+
is_agent_target: Optional[bool] = None,
|
|
328
|
+
client_id: Optional[str] = None,
|
|
315
329
|
) -> List[str]:
|
|
316
330
|
"""Get attack objectives from the RAI client for a specific risk category or from a custom dataset.
|
|
317
331
|
|
|
@@ -327,6 +341,8 @@ class RedTeam:
|
|
|
327
341
|
:type application_scenario: Optional[str]
|
|
328
342
|
:param strategy: Optional attack strategy to get specific objectives for
|
|
329
343
|
:type strategy: Optional[str]
|
|
344
|
+
:param is_agent_target: Optional boolean indicating if target is an agent (True) or model (False)
|
|
345
|
+
:type is_agent_target: Optional[bool]
|
|
330
346
|
:return: A list of attack objective prompts
|
|
331
347
|
:rtype: List[str]
|
|
332
348
|
"""
|
|
@@ -348,7 +364,39 @@ class RedTeam:
|
|
|
348
364
|
|
|
349
365
|
# Check if custom attack seed prompts are provided in the generator
|
|
350
366
|
if attack_objective_generator.custom_attack_seed_prompts and attack_objective_generator.validated_prompts:
|
|
351
|
-
|
|
367
|
+
# Check if this specific risk category has custom objectives
|
|
368
|
+
custom_objectives = attack_objective_generator.valid_prompts_by_category.get(risk_cat_value, [])
|
|
369
|
+
|
|
370
|
+
if custom_objectives:
|
|
371
|
+
# Use custom objectives for this risk category
|
|
372
|
+
return await self._get_custom_attack_objectives(risk_cat_value, num_objectives, strategy, current_key)
|
|
373
|
+
else:
|
|
374
|
+
# No custom objectives for this risk category, but risk_categories was specified
|
|
375
|
+
# Fetch from service if this risk category is in the requested list
|
|
376
|
+
if (
|
|
377
|
+
self.attack_objective_generator.risk_categories
|
|
378
|
+
and risk_category in self.attack_objective_generator.risk_categories
|
|
379
|
+
):
|
|
380
|
+
self.logger.info(
|
|
381
|
+
f"No custom objectives found for risk category {risk_cat_value}, fetching from service"
|
|
382
|
+
)
|
|
383
|
+
return await self._get_rai_attack_objectives(
|
|
384
|
+
risk_category,
|
|
385
|
+
risk_cat_value,
|
|
386
|
+
application_scenario,
|
|
387
|
+
strategy,
|
|
388
|
+
baseline_objectives_exist,
|
|
389
|
+
baseline_key,
|
|
390
|
+
current_key,
|
|
391
|
+
num_objectives,
|
|
392
|
+
is_agent_target,
|
|
393
|
+
)
|
|
394
|
+
else:
|
|
395
|
+
# Risk category not in requested list, return empty
|
|
396
|
+
self.logger.warning(
|
|
397
|
+
f"No custom objectives found for risk category {risk_cat_value} and it's not in the requested risk categories"
|
|
398
|
+
)
|
|
399
|
+
return []
|
|
352
400
|
else:
|
|
353
401
|
return await self._get_rai_attack_objectives(
|
|
354
402
|
risk_category,
|
|
@@ -359,6 +407,8 @@ class RedTeam:
|
|
|
359
407
|
baseline_key,
|
|
360
408
|
current_key,
|
|
361
409
|
num_objectives,
|
|
410
|
+
is_agent_target,
|
|
411
|
+
client_id,
|
|
362
412
|
)
|
|
363
413
|
|
|
364
414
|
async def _get_custom_attack_objectives(
|
|
@@ -420,6 +470,8 @@ class RedTeam:
|
|
|
420
470
|
baseline_key: tuple,
|
|
421
471
|
current_key: tuple,
|
|
422
472
|
num_objectives: int,
|
|
473
|
+
is_agent_target: Optional[bool] = None,
|
|
474
|
+
client_id: Optional[str] = None,
|
|
423
475
|
) -> List[str]:
|
|
424
476
|
"""Get attack objectives from the RAI service."""
|
|
425
477
|
content_harm_risk = None
|
|
@@ -435,6 +487,8 @@ class RedTeam:
|
|
|
435
487
|
)
|
|
436
488
|
|
|
437
489
|
# Get objectives from RAI service
|
|
490
|
+
target_type_str = "agent" if is_agent_target else "model" if is_agent_target is not None else None
|
|
491
|
+
|
|
438
492
|
if "tense" in strategy:
|
|
439
493
|
objectives_response = await self.generated_rai_client.get_attack_objectives(
|
|
440
494
|
risk_type=content_harm_risk,
|
|
@@ -443,6 +497,8 @@ class RedTeam:
|
|
|
443
497
|
strategy="tense",
|
|
444
498
|
language=self.language.value,
|
|
445
499
|
scan_session_id=self.scan_session_id,
|
|
500
|
+
target=target_type_str,
|
|
501
|
+
client_id=client_id,
|
|
446
502
|
)
|
|
447
503
|
else:
|
|
448
504
|
objectives_response = await self.generated_rai_client.get_attack_objectives(
|
|
@@ -452,11 +508,12 @@ class RedTeam:
|
|
|
452
508
|
strategy=None,
|
|
453
509
|
language=self.language.value,
|
|
454
510
|
scan_session_id=self.scan_session_id,
|
|
511
|
+
target=target_type_str,
|
|
512
|
+
client_id=client_id,
|
|
455
513
|
)
|
|
456
514
|
|
|
457
515
|
if isinstance(objectives_response, list):
|
|
458
516
|
self.logger.debug(f"API returned {len(objectives_response)} objectives")
|
|
459
|
-
|
|
460
517
|
# Handle jailbreak strategy
|
|
461
518
|
if strategy == "jailbreak":
|
|
462
519
|
objectives_response = await self._apply_jailbreak_prefixes(objectives_response)
|
|
@@ -470,8 +527,62 @@ class RedTeam:
|
|
|
470
527
|
if not objectives_response or (
|
|
471
528
|
isinstance(objectives_response, dict) and not objectives_response.get("objectives")
|
|
472
529
|
):
|
|
473
|
-
|
|
474
|
-
|
|
530
|
+
# If we got no agent objectives, fallback to model objectives
|
|
531
|
+
if is_agent_target:
|
|
532
|
+
self.logger.warning(
|
|
533
|
+
f"No agent-type attack objectives found for {risk_cat_value}. "
|
|
534
|
+
"Falling back to model-type objectives."
|
|
535
|
+
)
|
|
536
|
+
try:
|
|
537
|
+
# Retry with model target type
|
|
538
|
+
if "tense" in strategy:
|
|
539
|
+
objectives_response = await self.generated_rai_client.get_attack_objectives(
|
|
540
|
+
risk_type=content_harm_risk,
|
|
541
|
+
risk_category=other_risk,
|
|
542
|
+
application_scenario=application_scenario or "",
|
|
543
|
+
strategy="tense",
|
|
544
|
+
language=self.language.value,
|
|
545
|
+
scan_session_id=self.scan_session_id,
|
|
546
|
+
target="model",
|
|
547
|
+
client_id=client_id,
|
|
548
|
+
)
|
|
549
|
+
else:
|
|
550
|
+
objectives_response = await self.generated_rai_client.get_attack_objectives(
|
|
551
|
+
risk_type=content_harm_risk,
|
|
552
|
+
risk_category=other_risk,
|
|
553
|
+
application_scenario=application_scenario or "",
|
|
554
|
+
strategy=None,
|
|
555
|
+
language=self.language.value,
|
|
556
|
+
scan_session_id=self.scan_session_id,
|
|
557
|
+
target="model",
|
|
558
|
+
client_id=client_id,
|
|
559
|
+
)
|
|
560
|
+
|
|
561
|
+
if isinstance(objectives_response, list):
|
|
562
|
+
self.logger.debug(f"Fallback API returned {len(objectives_response)} model-type objectives")
|
|
563
|
+
|
|
564
|
+
# Apply strategy-specific transformations to fallback objectives
|
|
565
|
+
# Still try agent-type attack techniques (jailbreak/XPIA) even with model-type baseline objectives
|
|
566
|
+
if strategy == "jailbreak":
|
|
567
|
+
objectives_response = await self._apply_jailbreak_prefixes(objectives_response)
|
|
568
|
+
elif strategy == "indirect_jailbreak":
|
|
569
|
+
# Try agent-type XPIA first, will fallback to model-type XPIA within the method
|
|
570
|
+
objectives_response = await self._apply_xpia_prompts(objectives_response, "agent")
|
|
571
|
+
|
|
572
|
+
# Check if fallback response is also empty
|
|
573
|
+
if not objectives_response or (
|
|
574
|
+
isinstance(objectives_response, dict) and not objectives_response.get("objectives")
|
|
575
|
+
):
|
|
576
|
+
self.logger.warning("Fallback to model-type objectives also returned empty list")
|
|
577
|
+
return []
|
|
578
|
+
|
|
579
|
+
except Exception as fallback_error:
|
|
580
|
+
self.logger.error(f"Error calling get_attack_objectives with model fallback: {str(fallback_error)}")
|
|
581
|
+
self.logger.warning("Fallback API call failed, returning empty objectives list")
|
|
582
|
+
return []
|
|
583
|
+
else:
|
|
584
|
+
self.logger.warning("Empty or invalid response, returning empty list")
|
|
585
|
+
return []
|
|
475
586
|
|
|
476
587
|
# Filter and select objectives
|
|
477
588
|
selected_cat_objectives = self._filter_and_select_objectives(
|
|
@@ -484,6 +595,163 @@ class RedTeam:
|
|
|
484
595
|
|
|
485
596
|
return selected_prompts
|
|
486
597
|
|
|
598
|
+
async def _apply_xpia_prompts(self, objectives_list: List, target_type_str: str) -> List:
|
|
599
|
+
"""Apply XPIA prompt formatting to objectives for indirect jailbreak strategy.
|
|
600
|
+
|
|
601
|
+
XPIA prompts are wrapper structures that contain:
|
|
602
|
+
- content: benign user query to trigger tool use
|
|
603
|
+
- context: attack vehicle with {attack_text} placeholder
|
|
604
|
+
- context_type: modality for formatting (email, document, html, code)
|
|
605
|
+
- tool_name: name for mock tool
|
|
606
|
+
|
|
607
|
+
We inject the baseline attack objectives into these XPIA wrapper prompts.
|
|
608
|
+
"""
|
|
609
|
+
self.logger.debug(f"Applying XPIA prompts to objectives for indirect jailbreak (target_type={target_type_str})")
|
|
610
|
+
|
|
611
|
+
try:
|
|
612
|
+
# Fetch XPIA wrapper prompts from RAI service
|
|
613
|
+
@self.retry_manager.create_retry_decorator(context="xpia_prompts")
|
|
614
|
+
async def get_xpia_prompts_with_retry():
|
|
615
|
+
return await self.generated_rai_client.get_attack_objectives(
|
|
616
|
+
risk_type=None,
|
|
617
|
+
risk_category="xpia",
|
|
618
|
+
application_scenario="",
|
|
619
|
+
strategy=None,
|
|
620
|
+
language=self.language.value,
|
|
621
|
+
scan_session_id=self.scan_session_id,
|
|
622
|
+
target=target_type_str,
|
|
623
|
+
)
|
|
624
|
+
|
|
625
|
+
xpia_prompts = await get_xpia_prompts_with_retry()
|
|
626
|
+
|
|
627
|
+
# If no agent XPIA prompts and we're trying agent, fallback to model
|
|
628
|
+
if (not xpia_prompts or len(xpia_prompts) == 0) and target_type_str == "agent":
|
|
629
|
+
self.logger.debug("No agent-type XPIA prompts available, falling back to model-type XPIA prompts")
|
|
630
|
+
try:
|
|
631
|
+
xpia_prompts = await self.generated_rai_client.get_attack_objectives(
|
|
632
|
+
risk_type=None,
|
|
633
|
+
risk_category="xpia",
|
|
634
|
+
application_scenario="",
|
|
635
|
+
strategy=None,
|
|
636
|
+
language=self.language.value,
|
|
637
|
+
scan_session_id=self.scan_session_id,
|
|
638
|
+
target="model",
|
|
639
|
+
)
|
|
640
|
+
if xpia_prompts and len(xpia_prompts) > 0:
|
|
641
|
+
self.logger.debug(f"Fetched {len(xpia_prompts)} model-type XPIA wrapper prompts as fallback")
|
|
642
|
+
except Exception as fallback_error:
|
|
643
|
+
self.logger.error(f"Error fetching model-type XPIA prompts as fallback: {str(fallback_error)}")
|
|
644
|
+
|
|
645
|
+
if not xpia_prompts or len(xpia_prompts) == 0:
|
|
646
|
+
self.logger.warning("No XPIA prompts available (even after fallback), returning objectives unchanged")
|
|
647
|
+
return objectives_list
|
|
648
|
+
|
|
649
|
+
self.logger.debug(f"Fetched {len(xpia_prompts)} XPIA wrapper prompts")
|
|
650
|
+
|
|
651
|
+
# Apply XPIA wrapping to each baseline objective
|
|
652
|
+
for objective in objectives_list:
|
|
653
|
+
if "messages" in objective and len(objective["messages"]) > 0:
|
|
654
|
+
message = objective["messages"][0]
|
|
655
|
+
if isinstance(message, dict) and "content" in message:
|
|
656
|
+
# Get the baseline attack content to inject
|
|
657
|
+
baseline_attack_content = message["content"]
|
|
658
|
+
# Preserve the original baseline context if it exists
|
|
659
|
+
baseline_context = message.get("context", "")
|
|
660
|
+
|
|
661
|
+
# Normalize baseline_context to a list of context dicts
|
|
662
|
+
baseline_contexts = []
|
|
663
|
+
if baseline_context:
|
|
664
|
+
# Extract baseline context from RAI service format
|
|
665
|
+
context_dict = {"content": baseline_context}
|
|
666
|
+
if message.get("tool_name"):
|
|
667
|
+
context_dict["tool_name"] = message["tool_name"]
|
|
668
|
+
if message.get("context_type"):
|
|
669
|
+
context_dict["context_type"] = message["context_type"]
|
|
670
|
+
baseline_contexts = [context_dict]
|
|
671
|
+
|
|
672
|
+
# Check if baseline contexts have agent fields (context_type, tool_name)
|
|
673
|
+
baseline_contexts_with_agent_fields = []
|
|
674
|
+
baseline_contexts_without_agent_fields = []
|
|
675
|
+
|
|
676
|
+
for ctx in baseline_contexts:
|
|
677
|
+
if isinstance(ctx, dict):
|
|
678
|
+
if "context_type" in ctx or "tool_name" in ctx:
|
|
679
|
+
# This baseline context has agent fields - preserve it separately
|
|
680
|
+
baseline_contexts_with_agent_fields.append(ctx)
|
|
681
|
+
self.logger.debug(
|
|
682
|
+
f"Found baseline context with agent fields: tool_name={ctx.get('tool_name')}, context_type={ctx.get('context_type')}"
|
|
683
|
+
)
|
|
684
|
+
else:
|
|
685
|
+
# This baseline context has no agent fields - can be embedded
|
|
686
|
+
baseline_contexts_without_agent_fields.append(ctx)
|
|
687
|
+
else:
|
|
688
|
+
baseline_contexts_without_agent_fields.append({"content": str(ctx)})
|
|
689
|
+
|
|
690
|
+
# For baseline contexts without agent fields, embed them in the attack content
|
|
691
|
+
if baseline_contexts_without_agent_fields:
|
|
692
|
+
context_texts = [
|
|
693
|
+
ctx.get("content", "")
|
|
694
|
+
for ctx in baseline_contexts_without_agent_fields
|
|
695
|
+
if ctx.get("content")
|
|
696
|
+
]
|
|
697
|
+
if context_texts:
|
|
698
|
+
combined_context = "\n\n".join(context_texts)
|
|
699
|
+
baseline_attack_content = f"{baseline_attack_content}\n\nContext:\n{combined_context}"
|
|
700
|
+
self.logger.debug(
|
|
701
|
+
f"Embedded {len(context_texts)} baseline context(s) without agent fields into attack content"
|
|
702
|
+
)
|
|
703
|
+
|
|
704
|
+
# Randomly select an XPIA wrapper prompt
|
|
705
|
+
xpia_prompt = random.choice(xpia_prompts)
|
|
706
|
+
xpia_message = xpia_prompt.get("messages", [{}])[0]
|
|
707
|
+
|
|
708
|
+
# Extract XPIA wrapper structure
|
|
709
|
+
user_query = xpia_message.get("content", "")
|
|
710
|
+
attack_vehicle_context = xpia_message.get("context", "")
|
|
711
|
+
context_type = xpia_message.get("context_type") or "text"
|
|
712
|
+
tool_name = xpia_message.get("tool_name", "")
|
|
713
|
+
|
|
714
|
+
# Inject baseline attack (now with appended context) into the {attack_text} placeholder
|
|
715
|
+
if "{attack_text}" in attack_vehicle_context:
|
|
716
|
+
injected_context = attack_vehicle_context.replace("{attack_text}", baseline_attack_content)
|
|
717
|
+
else:
|
|
718
|
+
# No placeholder found, append to end
|
|
719
|
+
injected_context = f"{attack_vehicle_context}\n\n{baseline_attack_content}"
|
|
720
|
+
|
|
721
|
+
# Apply modality-based formatting
|
|
722
|
+
formatted_context = format_content_by_modality(injected_context, context_type)
|
|
723
|
+
|
|
724
|
+
# Update the message with benign user query
|
|
725
|
+
message["content"] = user_query
|
|
726
|
+
|
|
727
|
+
# Build the contexts list: XPIA context + any baseline contexts with agent fields
|
|
728
|
+
contexts = [
|
|
729
|
+
{"content": formatted_context, "context_type": context_type, "tool_name": tool_name}
|
|
730
|
+
]
|
|
731
|
+
|
|
732
|
+
# Add baseline contexts with agent fields as separate context entries
|
|
733
|
+
if baseline_contexts_with_agent_fields:
|
|
734
|
+
contexts.extend(baseline_contexts_with_agent_fields)
|
|
735
|
+
self.logger.debug(
|
|
736
|
+
f"Preserved {len(baseline_contexts_with_agent_fields)} baseline context(s) with agent fields"
|
|
737
|
+
)
|
|
738
|
+
|
|
739
|
+
message["context"] = contexts
|
|
740
|
+
message["context_type"] = (
|
|
741
|
+
context_type # Keep at message level for backward compat (XPIA primary)
|
|
742
|
+
)
|
|
743
|
+
message["tool_name"] = tool_name
|
|
744
|
+
|
|
745
|
+
self.logger.debug(
|
|
746
|
+
f"Wrapped baseline attack in XPIA: total contexts={len(contexts)}, xpia_tool={tool_name}, xpia_type={context_type}"
|
|
747
|
+
)
|
|
748
|
+
|
|
749
|
+
except Exception as e:
|
|
750
|
+
self.logger.error(f"Error applying XPIA prompts: {str(e)}")
|
|
751
|
+
self.logger.warning("XPIA prompt application failed, returning original objectives")
|
|
752
|
+
|
|
753
|
+
return objectives_list
|
|
754
|
+
|
|
487
755
|
async def _apply_jailbreak_prefixes(self, objectives_list: List) -> List:
|
|
488
756
|
"""Apply jailbreak prefixes to objectives."""
|
|
489
757
|
self.logger.debug("Applying jailbreak prefixes to objectives")
|
|
@@ -521,10 +789,52 @@ class RedTeam:
|
|
|
521
789
|
|
|
522
790
|
if baseline_objective_ids:
|
|
523
791
|
self.logger.debug(f"Filtering by {len(baseline_objective_ids)} baseline objective IDs for {strategy}")
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
792
|
+
# Filter by baseline IDs
|
|
793
|
+
filtered_objectives = [obj for obj in objectives_response if obj.get("id") in baseline_objective_ids]
|
|
794
|
+
self.logger.debug(f"Found {len(filtered_objectives)} matching objectives with baseline IDs")
|
|
795
|
+
|
|
796
|
+
# For strategies like indirect_jailbreak, the RAI service may return multiple
|
|
797
|
+
# objectives per baseline ID (e.g., multiple XPIA variations for one baseline objective).
|
|
798
|
+
# We should select num_objectives total, ensuring each baseline objective gets an XPIA attack.
|
|
799
|
+
# Group by baseline ID and select one objective per baseline ID up to num_objectives.
|
|
800
|
+
selected_by_id = {}
|
|
801
|
+
for obj in filtered_objectives:
|
|
802
|
+
obj_id = obj.get("id")
|
|
803
|
+
if obj_id not in selected_by_id:
|
|
804
|
+
selected_by_id[obj_id] = []
|
|
805
|
+
selected_by_id[obj_id].append(obj)
|
|
806
|
+
|
|
807
|
+
# Select objectives to match num_objectives
|
|
808
|
+
selected_cat_objectives = []
|
|
809
|
+
baseline_ids = list(selected_by_id.keys())
|
|
810
|
+
|
|
811
|
+
# If we have enough baseline IDs to cover num_objectives, select one per baseline ID
|
|
812
|
+
if len(baseline_ids) >= num_objectives:
|
|
813
|
+
# Select from the first num_objectives baseline IDs
|
|
814
|
+
for i in range(num_objectives):
|
|
815
|
+
obj_id = baseline_ids[i]
|
|
816
|
+
selected_cat_objectives.append(random.choice(selected_by_id[obj_id]))
|
|
817
|
+
else:
|
|
818
|
+
# If we have fewer baseline IDs than num_objectives, select all and cycle through
|
|
819
|
+
for i in range(num_objectives):
|
|
820
|
+
obj_id = baseline_ids[i % len(baseline_ids)]
|
|
821
|
+
# For repeated IDs, try to select different variations if available
|
|
822
|
+
available_variations = selected_by_id[obj_id].copy()
|
|
823
|
+
# Remove already selected variations for this baseline ID
|
|
824
|
+
already_selected = [obj for obj in selected_cat_objectives if obj.get("id") == obj_id]
|
|
825
|
+
for selected_obj in already_selected:
|
|
826
|
+
if selected_obj in available_variations:
|
|
827
|
+
available_variations.remove(selected_obj)
|
|
828
|
+
|
|
829
|
+
if available_variations:
|
|
830
|
+
selected_cat_objectives.append(random.choice(available_variations))
|
|
831
|
+
else:
|
|
832
|
+
# If no more variations, reuse one (shouldn't happen with proper XPIA generation)
|
|
833
|
+
selected_cat_objectives.append(random.choice(selected_by_id[obj_id]))
|
|
834
|
+
|
|
835
|
+
self.logger.debug(
|
|
836
|
+
f"Selected {len(selected_cat_objectives)} objectives from {len(baseline_ids)} baseline IDs and {len(filtered_objectives)} total variations for {strategy} strategy"
|
|
837
|
+
)
|
|
528
838
|
else:
|
|
529
839
|
self.logger.warning("No baseline objective IDs found, using random selection")
|
|
530
840
|
selected_cat_objectives = random.sample(
|
|
@@ -543,17 +853,87 @@ class RedTeam:
|
|
|
543
853
|
return selected_cat_objectives
|
|
544
854
|
|
|
545
855
|
def _extract_objective_content(self, selected_objectives: List) -> List[str]:
|
|
546
|
-
"""Extract content from selected objectives."""
|
|
856
|
+
"""Extract content from selected objectives and build prompt-to-context mapping."""
|
|
547
857
|
selected_prompts = []
|
|
548
858
|
for obj in selected_objectives:
|
|
859
|
+
risk_subtype = None
|
|
860
|
+
# Extract risk-subtype from target_harms if present
|
|
861
|
+
target_harms = obj.get("metadata", {}).get("target_harms", [])
|
|
862
|
+
if target_harms and isinstance(target_harms, list):
|
|
863
|
+
for harm in target_harms:
|
|
864
|
+
if isinstance(harm, dict) and "risk-subtype" in harm:
|
|
865
|
+
subtype_value = harm.get("risk-subtype")
|
|
866
|
+
if subtype_value:
|
|
867
|
+
risk_subtype = subtype_value
|
|
868
|
+
break
|
|
549
869
|
if "messages" in obj and len(obj["messages"]) > 0:
|
|
550
870
|
message = obj["messages"][0]
|
|
551
871
|
if isinstance(message, dict) and "content" in message:
|
|
552
872
|
content = message["content"]
|
|
553
|
-
|
|
873
|
+
context_raw = message.get("context", "")
|
|
874
|
+
# TODO is first if necessary?
|
|
875
|
+
# Normalize context to always be a list of dicts with 'content' key
|
|
876
|
+
if isinstance(context_raw, list):
|
|
877
|
+
# Already a list - ensure each item is a dict with 'content' key
|
|
878
|
+
contexts = []
|
|
879
|
+
for ctx in context_raw:
|
|
880
|
+
if isinstance(ctx, dict) and "content" in ctx:
|
|
881
|
+
# Preserve all keys including context_type, tool_name if present
|
|
882
|
+
contexts.append(ctx)
|
|
883
|
+
elif isinstance(ctx, str):
|
|
884
|
+
contexts.append({"content": ctx})
|
|
885
|
+
elif context_raw:
|
|
886
|
+
# Single string value - wrap in dict
|
|
887
|
+
contexts = [{"content": context_raw}]
|
|
888
|
+
if message.get("tool_name"):
|
|
889
|
+
contexts[0]["tool_name"] = message["tool_name"]
|
|
890
|
+
if message.get("context_type"):
|
|
891
|
+
contexts[0]["context_type"] = message["context_type"]
|
|
892
|
+
else:
|
|
893
|
+
contexts = []
|
|
894
|
+
|
|
895
|
+
# Check if any context has agent-specific fields
|
|
896
|
+
has_agent_fields = any(
|
|
897
|
+
isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
|
|
898
|
+
)
|
|
899
|
+
|
|
900
|
+
# For contexts without agent fields, append them to the content
|
|
901
|
+
# This applies to baseline and any other attack objectives with plain context
|
|
902
|
+
if contexts and not has_agent_fields:
|
|
903
|
+
# Extract all context content and append to the attack content
|
|
904
|
+
context_texts = []
|
|
905
|
+
for ctx in contexts:
|
|
906
|
+
if isinstance(ctx, dict):
|
|
907
|
+
ctx_content = ctx.get("content", "")
|
|
908
|
+
if ctx_content:
|
|
909
|
+
context_texts.append(ctx_content)
|
|
910
|
+
|
|
911
|
+
if context_texts:
|
|
912
|
+
# Append context to content
|
|
913
|
+
combined_context = "\n\n".join(context_texts)
|
|
914
|
+
content = f"{content}\n\nContext:\n{combined_context}"
|
|
915
|
+
self.logger.debug(
|
|
916
|
+
f"Appended {len(context_texts)} context source(s) to attack content (total context length={len(combined_context)})"
|
|
917
|
+
)
|
|
918
|
+
|
|
554
919
|
selected_prompts.append(content)
|
|
555
|
-
|
|
556
|
-
|
|
920
|
+
|
|
921
|
+
# Store risk_subtype mapping if it exists
|
|
922
|
+
if risk_subtype:
|
|
923
|
+
self.prompt_to_risk_subtype[content] = risk_subtype
|
|
924
|
+
|
|
925
|
+
# Always store contexts if they exist (whether or not they have agent fields)
|
|
926
|
+
if contexts:
|
|
927
|
+
context_dict = {"contexts": contexts}
|
|
928
|
+
if has_agent_fields:
|
|
929
|
+
self.logger.debug(f"Stored context with agent fields: {len(contexts)} context source(s)")
|
|
930
|
+
else:
|
|
931
|
+
self.logger.debug(
|
|
932
|
+
f"Stored context without agent fields: {len(contexts)} context source(s) (also embedded in content)"
|
|
933
|
+
)
|
|
934
|
+
self.prompt_to_context[content] = context_dict
|
|
935
|
+
else:
|
|
936
|
+
self.logger.debug(f"No context to store")
|
|
557
937
|
return selected_prompts
|
|
558
938
|
|
|
559
939
|
def _cache_attack_objectives(
|
|
@@ -573,6 +953,18 @@ class RedTeam:
|
|
|
573
953
|
target_harms = obj.get("metadata", {}).get("target_harms", [])
|
|
574
954
|
content = ""
|
|
575
955
|
context = ""
|
|
956
|
+
risk_subtype = None
|
|
957
|
+
|
|
958
|
+
# Extract risk-subtype from target_harms if present
|
|
959
|
+
if target_harms and isinstance(target_harms, list):
|
|
960
|
+
for harm in target_harms:
|
|
961
|
+
if isinstance(harm, dict) and "risk-subtype" in harm:
|
|
962
|
+
subtype_value = harm.get("risk-subtype")
|
|
963
|
+
# Only store non-empty risk-subtype values
|
|
964
|
+
if subtype_value:
|
|
965
|
+
risk_subtype = subtype_value
|
|
966
|
+
break # Use the first non-empty risk-subtype found
|
|
967
|
+
|
|
576
968
|
if "messages" in obj and len(obj["messages"]) > 0:
|
|
577
969
|
|
|
578
970
|
message = obj["messages"][0]
|
|
@@ -580,6 +972,9 @@ class RedTeam:
|
|
|
580
972
|
context = message.get("context", "")
|
|
581
973
|
if content:
|
|
582
974
|
obj_data = {"id": obj_id, "content": content, "context": context}
|
|
975
|
+
# Add risk_subtype to obj_data if it exists
|
|
976
|
+
if risk_subtype:
|
|
977
|
+
obj_data["risk_subtype"] = risk_subtype
|
|
583
978
|
objectives_by_category[risk_cat_value].append(obj_data)
|
|
584
979
|
|
|
585
980
|
self.attack_objectives[current_key] = {
|
|
@@ -783,6 +1178,16 @@ class RedTeam:
|
|
|
783
1178
|
:rtype: RedTeamResult
|
|
784
1179
|
"""
|
|
785
1180
|
user_agent: Optional[str] = kwargs.get("user_agent", "(type=redteam; subtype=RedTeam)")
|
|
1181
|
+
run_id_override = kwargs.get("run_id") or kwargs.get("runId")
|
|
1182
|
+
eval_id_override = kwargs.get("eval_id") or kwargs.get("evalId")
|
|
1183
|
+
created_at_override = kwargs.get("created_at") or kwargs.get("createdAt")
|
|
1184
|
+
taxonomy_risk_categories = kwargs.get("taxonomy_risk_categories") # key is risk category value is taxonomy
|
|
1185
|
+
_app_insights_configuration = kwargs.get("_app_insights_configuration")
|
|
1186
|
+
self._app_insights_configuration = _app_insights_configuration
|
|
1187
|
+
self.taxonomy_risk_categories = taxonomy_risk_categories or {}
|
|
1188
|
+
is_agent_target: Optional[bool] = kwargs.get("is_agent_target", False)
|
|
1189
|
+
client_id: Optional[str] = kwargs.get("client_id")
|
|
1190
|
+
|
|
786
1191
|
with UserAgentSingleton().add_useragent_product(user_agent):
|
|
787
1192
|
# Initialize scan
|
|
788
1193
|
self._initialize_scan(scan_name, application_scenario)
|
|
@@ -802,6 +1207,12 @@ class RedTeam:
|
|
|
802
1207
|
self.mlflow_integration.logger = self.logger
|
|
803
1208
|
self.result_processor.logger = self.logger
|
|
804
1209
|
|
|
1210
|
+
self.mlflow_integration.set_run_identity_overrides(
|
|
1211
|
+
run_id=run_id_override,
|
|
1212
|
+
eval_id=eval_id_override,
|
|
1213
|
+
created_at=created_at_override,
|
|
1214
|
+
)
|
|
1215
|
+
|
|
805
1216
|
# Validate attack objective generator
|
|
806
1217
|
if not self.attack_objective_generator:
|
|
807
1218
|
raise EvaluationException(
|
|
@@ -825,6 +1236,19 @@ class RedTeam:
|
|
|
825
1236
|
self.risk_categories = self.attack_objective_generator.risk_categories
|
|
826
1237
|
self.result_processor.risk_categories = self.risk_categories
|
|
827
1238
|
|
|
1239
|
+
# Validate risk categories for target type
|
|
1240
|
+
if not is_agent_target:
|
|
1241
|
+
# Check if any agent-only risk categories are used with model targets
|
|
1242
|
+
for risk_cat in self.risk_categories:
|
|
1243
|
+
if risk_cat == RiskCategory.SensitiveDataLeakage:
|
|
1244
|
+
raise EvaluationException(
|
|
1245
|
+
message=f"Risk category '{risk_cat.value}' is only available for agent targets",
|
|
1246
|
+
internal_message=f"Risk category {risk_cat.value} requires agent target",
|
|
1247
|
+
target=ErrorTarget.RED_TEAM,
|
|
1248
|
+
category=ErrorCategory.INVALID_VALUE,
|
|
1249
|
+
blame=ErrorBlame.USER_ERROR,
|
|
1250
|
+
)
|
|
1251
|
+
|
|
828
1252
|
# Show risk categories to user
|
|
829
1253
|
tqdm.write(f"📊 Risk categories: {[rc.value for rc in self.risk_categories]}")
|
|
830
1254
|
self.logger.info(f"Risk categories to process: {[rc.value for rc in self.risk_categories]}")
|
|
@@ -853,9 +1277,11 @@ class RedTeam:
|
|
|
853
1277
|
self._initialize_tracking_dict(flattened_attack_strategies)
|
|
854
1278
|
|
|
855
1279
|
# Fetch attack objectives
|
|
856
|
-
all_objectives = await self._fetch_all_objectives(
|
|
1280
|
+
all_objectives = await self._fetch_all_objectives(
|
|
1281
|
+
flattened_attack_strategies, application_scenario, is_agent_target, client_id
|
|
1282
|
+
)
|
|
857
1283
|
|
|
858
|
-
chat_target = get_chat_target(target
|
|
1284
|
+
chat_target = get_chat_target(target)
|
|
859
1285
|
self.chat_target = chat_target
|
|
860
1286
|
|
|
861
1287
|
# Execute attacks
|
|
@@ -872,7 +1298,7 @@ class RedTeam:
|
|
|
872
1298
|
)
|
|
873
1299
|
|
|
874
1300
|
# Process and return results
|
|
875
|
-
return await self._finalize_results(skip_upload, skip_evals, eval_run, output_path)
|
|
1301
|
+
return await self._finalize_results(skip_upload, skip_evals, eval_run, output_path, scan_name)
|
|
876
1302
|
|
|
877
1303
|
def _initialize_scan(self, scan_name: Optional[str], application_scenario: Optional[str]):
|
|
878
1304
|
"""Initialize scan-specific variables."""
|
|
@@ -944,11 +1370,10 @@ class RedTeam:
|
|
|
944
1370
|
)
|
|
945
1371
|
raise ValueError("MultiTurn and Crescendo strategies are not compatible with multiple attack strategies.")
|
|
946
1372
|
if AttackStrategy.Tense in flattened_attack_strategies and (
|
|
947
|
-
RiskCategory.
|
|
948
|
-
or RiskCategory.UngroundedAttributes in self.risk_categories
|
|
1373
|
+
RiskCategory.UngroundedAttributes in self.risk_categories
|
|
949
1374
|
):
|
|
950
1375
|
self.logger.warning(
|
|
951
|
-
"Tense strategy is not compatible with
|
|
1376
|
+
"Tense strategy is not compatible with UngroundedAttributes risk categories. Skipping Tense strategy."
|
|
952
1377
|
)
|
|
953
1378
|
raise ValueError(
|
|
954
1379
|
"Tense strategy is not compatible with IndirectAttack or UngroundedAttributes risk categories."
|
|
@@ -968,7 +1393,13 @@ class RedTeam:
|
|
|
968
1393
|
"status": TASK_STATUS["PENDING"],
|
|
969
1394
|
}
|
|
970
1395
|
|
|
971
|
-
async def _fetch_all_objectives(
|
|
1396
|
+
async def _fetch_all_objectives(
|
|
1397
|
+
self,
|
|
1398
|
+
flattened_attack_strategies: List,
|
|
1399
|
+
application_scenario: str,
|
|
1400
|
+
is_agent_target: bool,
|
|
1401
|
+
client_id: Optional[str] = None,
|
|
1402
|
+
) -> Dict:
|
|
972
1403
|
"""Fetch all attack objectives for all strategies and risk categories."""
|
|
973
1404
|
log_section_header(self.logger, "Fetching attack objectives")
|
|
974
1405
|
all_objectives = {}
|
|
@@ -980,6 +1411,8 @@ class RedTeam:
|
|
|
980
1411
|
risk_category=risk_category,
|
|
981
1412
|
application_scenario=application_scenario,
|
|
982
1413
|
strategy="baseline",
|
|
1414
|
+
is_agent_target=is_agent_target,
|
|
1415
|
+
client_id=client_id,
|
|
983
1416
|
)
|
|
984
1417
|
if "baseline" not in all_objectives:
|
|
985
1418
|
all_objectives["baseline"] = {}
|
|
@@ -1003,6 +1436,8 @@ class RedTeam:
|
|
|
1003
1436
|
risk_category=risk_category,
|
|
1004
1437
|
application_scenario=application_scenario,
|
|
1005
1438
|
strategy=strategy_name,
|
|
1439
|
+
is_agent_target=is_agent_target,
|
|
1440
|
+
client_id=client_id,
|
|
1006
1441
|
)
|
|
1007
1442
|
all_objectives[strategy_name][risk_category.value] = objectives
|
|
1008
1443
|
|
|
@@ -1104,47 +1539,78 @@ class RedTeam:
|
|
|
1104
1539
|
self.logger.error(f"Error processing task {i+1}: {str(e)}")
|
|
1105
1540
|
continue
|
|
1106
1541
|
|
|
1107
|
-
async def _finalize_results(
|
|
1542
|
+
async def _finalize_results(
|
|
1543
|
+
self, skip_upload: bool, skip_evals: bool, eval_run, output_path: str, scan_name: str
|
|
1544
|
+
) -> RedTeamResult:
|
|
1108
1545
|
"""Process and finalize scan results."""
|
|
1109
1546
|
log_section_header(self.logger, "Processing results")
|
|
1110
1547
|
|
|
1111
|
-
# Convert results to RedTeamResult
|
|
1112
|
-
red_team_result = self.result_processor.to_red_team_result(
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
attack_details=red_team_result["attack_details"],
|
|
1548
|
+
# Convert results to RedTeamResult (now builds AOAI summary internally)
|
|
1549
|
+
red_team_result = self.result_processor.to_red_team_result(
|
|
1550
|
+
red_team_info=self.red_team_info,
|
|
1551
|
+
eval_run=eval_run,
|
|
1552
|
+
scan_name=scan_name,
|
|
1117
1553
|
)
|
|
1118
1554
|
|
|
1555
|
+
# Extract AOAI summary for passing to MLflow logging
|
|
1556
|
+
aoai_summary = red_team_result.scan_result.get("AOAI_Compatible_Summary")
|
|
1557
|
+
if self._app_insights_configuration:
|
|
1558
|
+
emit_eval_result_events_to_app_insights(
|
|
1559
|
+
self._app_insights_configuration, aoai_summary["output_items"]["data"]
|
|
1560
|
+
)
|
|
1119
1561
|
# Log results to MLFlow if not skipping upload
|
|
1120
1562
|
if not skip_upload:
|
|
1121
1563
|
self.logger.info("Logging results to AI Foundry")
|
|
1122
1564
|
await self.mlflow_integration.log_redteam_results_to_mlflow(
|
|
1123
|
-
redteam_result=
|
|
1565
|
+
redteam_result=red_team_result,
|
|
1566
|
+
eval_run=eval_run,
|
|
1567
|
+
red_team_info=self.red_team_info,
|
|
1568
|
+
_skip_evals=skip_evals,
|
|
1569
|
+
aoai_summary=aoai_summary,
|
|
1124
1570
|
)
|
|
1125
|
-
|
|
1126
1571
|
# Write output to specified path
|
|
1127
|
-
if output_path and
|
|
1572
|
+
if output_path and red_team_result.scan_result:
|
|
1128
1573
|
abs_output_path = output_path if os.path.isabs(output_path) else os.path.abspath(output_path)
|
|
1129
1574
|
self.logger.info(f"Writing output to {abs_output_path}")
|
|
1130
|
-
|
|
1575
|
+
|
|
1576
|
+
# Ensure output_path is treated as a directory
|
|
1577
|
+
# If it exists as a file, remove it first
|
|
1578
|
+
if os.path.exists(abs_output_path) and not os.path.isdir(abs_output_path):
|
|
1579
|
+
os.remove(abs_output_path)
|
|
1580
|
+
os.makedirs(abs_output_path, exist_ok=True)
|
|
1581
|
+
|
|
1582
|
+
# Create a copy of scan_result without AOAI properties for eval_result.json
|
|
1583
|
+
scan_result_without_aoai = {
|
|
1584
|
+
key: value
|
|
1585
|
+
for key, value in red_team_result.scan_result.items()
|
|
1586
|
+
if not key.startswith("AOAI_Compatible")
|
|
1587
|
+
}
|
|
1588
|
+
|
|
1589
|
+
# Write scan result without AOAI properties to eval_result.json
|
|
1590
|
+
_write_output(abs_output_path, scan_result_without_aoai)
|
|
1591
|
+
|
|
1592
|
+
# Write the AOAI summary to results.json
|
|
1593
|
+
if aoai_summary:
|
|
1594
|
+
_write_output(os.path.join(abs_output_path, "results.json"), aoai_summary)
|
|
1595
|
+
else:
|
|
1596
|
+
self.logger.warning("AOAI summary not available for output_path write")
|
|
1131
1597
|
|
|
1132
1598
|
# Also save a copy to the scan output directory if available
|
|
1133
1599
|
if self.scan_output_dir:
|
|
1134
1600
|
final_output = os.path.join(self.scan_output_dir, "final_results.json")
|
|
1135
|
-
_write_output(final_output,
|
|
1136
|
-
elif
|
|
1601
|
+
_write_output(final_output, red_team_result.scan_result)
|
|
1602
|
+
elif red_team_result.scan_result and self.scan_output_dir:
|
|
1137
1603
|
# If no output_path was specified but we have scan_output_dir, save there
|
|
1138
1604
|
final_output = os.path.join(self.scan_output_dir, "final_results.json")
|
|
1139
|
-
_write_output(final_output,
|
|
1605
|
+
_write_output(final_output, red_team_result.scan_result)
|
|
1140
1606
|
|
|
1141
1607
|
# Display final scorecard and results
|
|
1142
|
-
if
|
|
1143
|
-
scorecard = format_scorecard(
|
|
1608
|
+
if red_team_result.scan_result:
|
|
1609
|
+
scorecard = format_scorecard(red_team_result.scan_result)
|
|
1144
1610
|
tqdm.write(scorecard)
|
|
1145
1611
|
|
|
1146
1612
|
# Print URL for detailed results
|
|
1147
|
-
studio_url =
|
|
1613
|
+
studio_url = red_team_result.scan_result.get("studio_url", "")
|
|
1148
1614
|
if studio_url:
|
|
1149
1615
|
tqdm.write(f"\nDetailed results available at:\n{studio_url}")
|
|
1150
1616
|
|
|
@@ -1161,4 +1627,4 @@ class RedTeam:
|
|
|
1161
1627
|
handler.close()
|
|
1162
1628
|
self.logger.removeHandler(handler)
|
|
1163
1629
|
|
|
1164
|
-
return
|
|
1630
|
+
return red_team_result
|