azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
  3. azure/ai/evaluation/_aoai/label_grader.py +6 -10
  4. azure/ai/evaluation/_aoai/python_grader.py +7 -10
  5. azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
  6. azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +241 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -2
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
  33. azure/ai/evaluation/_evaluate/_utils.py +10 -3
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
  38. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  39. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
  40. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  41. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  42. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  43. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
  44. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  45. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  46. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  47. azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
  48. azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
  49. azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
  50. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
  52. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  53. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  55. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  56. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  57. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  58. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  59. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  60. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  61. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  62. azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
  63. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  64. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  65. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  66. azure/ai/evaluation/_exceptions.py +6 -1
  67. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  68. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  69. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  70. azure/ai/evaluation/_model_configurations.py +26 -0
  71. azure/ai/evaluation/_version.py +1 -1
  72. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  73. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  74. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  75. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  76. azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
  77. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  78. azure/ai/evaluation/red_team/_red_team.py +494 -37
  79. azure/ai/evaluation/red_team/_red_team_result.py +48 -28
  80. azure/ai/evaluation/red_team/_result_processor.py +558 -29
  81. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  82. azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
  83. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  84. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  85. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  86. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  87. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  88. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  90. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  91. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  92. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  94. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  95. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
  96. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
  97. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  98. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  99. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,7 @@ import itertools
7
7
  import logging
8
8
  import math
9
9
  import os
10
+ from pathlib import Path
10
11
  import random
11
12
  import time
12
13
  import uuid
@@ -17,6 +18,10 @@ from tqdm import tqdm
17
18
  # Azure AI Evaluation imports
18
19
  from azure.ai.evaluation._constants import TokenScope
19
20
  from azure.ai.evaluation._common._experimental import experimental
21
+
22
+ from azure.ai.evaluation._evaluate._evaluate import (
23
+ emit_eval_result_events_to_app_insights,
24
+ ) # TODO: uncomment when app insights checked in
20
25
  from azure.ai.evaluation._model_configurations import EvaluationResult
21
26
  from azure.ai.evaluation.simulator._model_tools import ManagedIdentityAPITokenManager
22
27
  from azure.ai.evaluation.simulator._model_tools._generated_rai_client import GeneratedRAIClient
@@ -65,6 +70,7 @@ from ._utils.formatting_utils import (
65
70
  get_flattened_attack_strategies,
66
71
  write_pyrit_outputs_to_file,
67
72
  format_scorecard,
73
+ format_content_by_modality,
68
74
  )
69
75
  from ._utils.strategy_utils import get_chat_target, get_converter_for_strategy
70
76
  from ._utils.retry_utils import create_standard_retry_manager
@@ -208,6 +214,9 @@ class RedTeam:
208
214
  # keep track of prompt content to context mapping for evaluation
209
215
  self.prompt_to_context = {}
210
216
 
217
+ # keep track of prompt content to risk_sub_type mapping for evaluation
218
+ self.prompt_to_risk_subtype = {}
219
+
211
220
  # Initialize PyRIT
212
221
  initialize_pyrit(memory_db_type=DUCK_DB)
213
222
 
@@ -276,6 +285,7 @@ class RedTeam:
276
285
  one_dp_project=self._one_dp_project,
277
286
  retry_config=retry_config,
278
287
  scan_output_dir=self.scan_output_dir,
288
+ red_team=self,
279
289
  )
280
290
 
281
291
  # Initialize evaluation processor
@@ -287,6 +297,7 @@ class RedTeam:
287
297
  retry_config=retry_config,
288
298
  scan_session_id=self.scan_session_id,
289
299
  scan_output_dir=self.scan_output_dir,
300
+ taxonomy_risk_categories=getattr(self, "taxonomy_risk_categories", None),
290
301
  )
291
302
 
292
303
  # Initialize MLflow integration
@@ -305,6 +316,7 @@ class RedTeam:
305
316
  application_scenario=getattr(self, "application_scenario", ""),
306
317
  risk_categories=getattr(self, "risk_categories", []),
307
318
  ai_studio_url=getattr(self.mlflow_integration, "ai_studio_url", None),
319
+ mlflow_integration=self.mlflow_integration,
308
320
  )
309
321
 
310
322
  async def _get_attack_objectives(
@@ -312,6 +324,8 @@ class RedTeam:
312
324
  risk_category: Optional[RiskCategory] = None,
313
325
  application_scenario: Optional[str] = None,
314
326
  strategy: Optional[str] = None,
327
+ is_agent_target: Optional[bool] = None,
328
+ client_id: Optional[str] = None,
315
329
  ) -> List[str]:
316
330
  """Get attack objectives from the RAI client for a specific risk category or from a custom dataset.
317
331
 
@@ -327,6 +341,8 @@ class RedTeam:
327
341
  :type application_scenario: Optional[str]
328
342
  :param strategy: Optional attack strategy to get specific objectives for
329
343
  :type strategy: Optional[str]
344
+ :param is_agent_target: Optional boolean indicating if target is an agent (True) or model (False)
345
+ :type is_agent_target: Optional[bool]
330
346
  :return: A list of attack objective prompts
331
347
  :rtype: List[str]
332
348
  """
@@ -348,7 +364,39 @@ class RedTeam:
348
364
 
349
365
  # Check if custom attack seed prompts are provided in the generator
350
366
  if attack_objective_generator.custom_attack_seed_prompts and attack_objective_generator.validated_prompts:
351
- return await self._get_custom_attack_objectives(risk_cat_value, num_objectives, strategy, current_key)
367
+ # Check if this specific risk category has custom objectives
368
+ custom_objectives = attack_objective_generator.valid_prompts_by_category.get(risk_cat_value, [])
369
+
370
+ if custom_objectives:
371
+ # Use custom objectives for this risk category
372
+ return await self._get_custom_attack_objectives(risk_cat_value, num_objectives, strategy, current_key)
373
+ else:
374
+ # No custom objectives for this risk category, but risk_categories was specified
375
+ # Fetch from service if this risk category is in the requested list
376
+ if (
377
+ self.attack_objective_generator.risk_categories
378
+ and risk_category in self.attack_objective_generator.risk_categories
379
+ ):
380
+ self.logger.info(
381
+ f"No custom objectives found for risk category {risk_cat_value}, fetching from service"
382
+ )
383
+ return await self._get_rai_attack_objectives(
384
+ risk_category,
385
+ risk_cat_value,
386
+ application_scenario,
387
+ strategy,
388
+ baseline_objectives_exist,
389
+ baseline_key,
390
+ current_key,
391
+ num_objectives,
392
+ is_agent_target,
393
+ )
394
+ else:
395
+ # Risk category not in requested list, return empty
396
+ self.logger.warning(
397
+ f"No custom objectives found for risk category {risk_cat_value} and it's not in the requested risk categories"
398
+ )
399
+ return []
352
400
  else:
353
401
  return await self._get_rai_attack_objectives(
354
402
  risk_category,
@@ -359,6 +407,8 @@ class RedTeam:
359
407
  baseline_key,
360
408
  current_key,
361
409
  num_objectives,
410
+ is_agent_target,
411
+ client_id,
362
412
  )
363
413
 
364
414
  async def _get_custom_attack_objectives(
@@ -420,6 +470,8 @@ class RedTeam:
420
470
  baseline_key: tuple,
421
471
  current_key: tuple,
422
472
  num_objectives: int,
473
+ is_agent_target: Optional[bool] = None,
474
+ client_id: Optional[str] = None,
423
475
  ) -> List[str]:
424
476
  """Get attack objectives from the RAI service."""
425
477
  content_harm_risk = None
@@ -435,6 +487,8 @@ class RedTeam:
435
487
  )
436
488
 
437
489
  # Get objectives from RAI service
490
+ target_type_str = "agent" if is_agent_target else "model" if is_agent_target is not None else None
491
+
438
492
  if "tense" in strategy:
439
493
  objectives_response = await self.generated_rai_client.get_attack_objectives(
440
494
  risk_type=content_harm_risk,
@@ -443,6 +497,8 @@ class RedTeam:
443
497
  strategy="tense",
444
498
  language=self.language.value,
445
499
  scan_session_id=self.scan_session_id,
500
+ target=target_type_str,
501
+ client_id=client_id,
446
502
  )
447
503
  else:
448
504
  objectives_response = await self.generated_rai_client.get_attack_objectives(
@@ -452,11 +508,12 @@ class RedTeam:
452
508
  strategy=None,
453
509
  language=self.language.value,
454
510
  scan_session_id=self.scan_session_id,
511
+ target=target_type_str,
512
+ client_id=client_id,
455
513
  )
456
514
 
457
515
  if isinstance(objectives_response, list):
458
516
  self.logger.debug(f"API returned {len(objectives_response)} objectives")
459
-
460
517
  # Handle jailbreak strategy
461
518
  if strategy == "jailbreak":
462
519
  objectives_response = await self._apply_jailbreak_prefixes(objectives_response)
@@ -470,8 +527,62 @@ class RedTeam:
470
527
  if not objectives_response or (
471
528
  isinstance(objectives_response, dict) and not objectives_response.get("objectives")
472
529
  ):
473
- self.logger.warning("Empty or invalid response, returning empty list")
474
- return []
530
+ # If we got no agent objectives, fallback to model objectives
531
+ if is_agent_target:
532
+ self.logger.warning(
533
+ f"No agent-type attack objectives found for {risk_cat_value}. "
534
+ "Falling back to model-type objectives."
535
+ )
536
+ try:
537
+ # Retry with model target type
538
+ if "tense" in strategy:
539
+ objectives_response = await self.generated_rai_client.get_attack_objectives(
540
+ risk_type=content_harm_risk,
541
+ risk_category=other_risk,
542
+ application_scenario=application_scenario or "",
543
+ strategy="tense",
544
+ language=self.language.value,
545
+ scan_session_id=self.scan_session_id,
546
+ target="model",
547
+ client_id=client_id,
548
+ )
549
+ else:
550
+ objectives_response = await self.generated_rai_client.get_attack_objectives(
551
+ risk_type=content_harm_risk,
552
+ risk_category=other_risk,
553
+ application_scenario=application_scenario or "",
554
+ strategy=None,
555
+ language=self.language.value,
556
+ scan_session_id=self.scan_session_id,
557
+ target="model",
558
+ client_id=client_id,
559
+ )
560
+
561
+ if isinstance(objectives_response, list):
562
+ self.logger.debug(f"Fallback API returned {len(objectives_response)} model-type objectives")
563
+
564
+ # Apply strategy-specific transformations to fallback objectives
565
+ # Still try agent-type attack techniques (jailbreak/XPIA) even with model-type baseline objectives
566
+ if strategy == "jailbreak":
567
+ objectives_response = await self._apply_jailbreak_prefixes(objectives_response)
568
+ elif strategy == "indirect_jailbreak":
569
+ # Try agent-type XPIA first, will fallback to model-type XPIA within the method
570
+ objectives_response = await self._apply_xpia_prompts(objectives_response, "agent")
571
+
572
+ # Check if fallback response is also empty
573
+ if not objectives_response or (
574
+ isinstance(objectives_response, dict) and not objectives_response.get("objectives")
575
+ ):
576
+ self.logger.warning("Fallback to model-type objectives also returned empty list")
577
+ return []
578
+
579
+ except Exception as fallback_error:
580
+ self.logger.error(f"Error calling get_attack_objectives with model fallback: {str(fallback_error)}")
581
+ self.logger.warning("Fallback API call failed, returning empty objectives list")
582
+ return []
583
+ else:
584
+ self.logger.warning("Empty or invalid response, returning empty list")
585
+ return []
475
586
 
476
587
  # Filter and select objectives
477
588
  selected_cat_objectives = self._filter_and_select_objectives(
@@ -484,6 +595,163 @@ class RedTeam:
484
595
 
485
596
  return selected_prompts
486
597
 
598
+ async def _apply_xpia_prompts(self, objectives_list: List, target_type_str: str) -> List:
599
+ """Apply XPIA prompt formatting to objectives for indirect jailbreak strategy.
600
+
601
+ XPIA prompts are wrapper structures that contain:
602
+ - content: benign user query to trigger tool use
603
+ - context: attack vehicle with {attack_text} placeholder
604
+ - context_type: modality for formatting (email, document, html, code)
605
+ - tool_name: name for mock tool
606
+
607
+ We inject the baseline attack objectives into these XPIA wrapper prompts.
608
+ """
609
+ self.logger.debug(f"Applying XPIA prompts to objectives for indirect jailbreak (target_type={target_type_str})")
610
+
611
+ try:
612
+ # Fetch XPIA wrapper prompts from RAI service
613
+ @self.retry_manager.create_retry_decorator(context="xpia_prompts")
614
+ async def get_xpia_prompts_with_retry():
615
+ return await self.generated_rai_client.get_attack_objectives(
616
+ risk_type=None,
617
+ risk_category="xpia",
618
+ application_scenario="",
619
+ strategy=None,
620
+ language=self.language.value,
621
+ scan_session_id=self.scan_session_id,
622
+ target=target_type_str,
623
+ )
624
+
625
+ xpia_prompts = await get_xpia_prompts_with_retry()
626
+
627
+ # If no agent XPIA prompts and we're trying agent, fallback to model
628
+ if (not xpia_prompts or len(xpia_prompts) == 0) and target_type_str == "agent":
629
+ self.logger.debug("No agent-type XPIA prompts available, falling back to model-type XPIA prompts")
630
+ try:
631
+ xpia_prompts = await self.generated_rai_client.get_attack_objectives(
632
+ risk_type=None,
633
+ risk_category="xpia",
634
+ application_scenario="",
635
+ strategy=None,
636
+ language=self.language.value,
637
+ scan_session_id=self.scan_session_id,
638
+ target="model",
639
+ )
640
+ if xpia_prompts and len(xpia_prompts) > 0:
641
+ self.logger.debug(f"Fetched {len(xpia_prompts)} model-type XPIA wrapper prompts as fallback")
642
+ except Exception as fallback_error:
643
+ self.logger.error(f"Error fetching model-type XPIA prompts as fallback: {str(fallback_error)}")
644
+
645
+ if not xpia_prompts or len(xpia_prompts) == 0:
646
+ self.logger.warning("No XPIA prompts available (even after fallback), returning objectives unchanged")
647
+ return objectives_list
648
+
649
+ self.logger.debug(f"Fetched {len(xpia_prompts)} XPIA wrapper prompts")
650
+
651
+ # Apply XPIA wrapping to each baseline objective
652
+ for objective in objectives_list:
653
+ if "messages" in objective and len(objective["messages"]) > 0:
654
+ message = objective["messages"][0]
655
+ if isinstance(message, dict) and "content" in message:
656
+ # Get the baseline attack content to inject
657
+ baseline_attack_content = message["content"]
658
+ # Preserve the original baseline context if it exists
659
+ baseline_context = message.get("context", "")
660
+
661
+ # Normalize baseline_context to a list of context dicts
662
+ baseline_contexts = []
663
+ if baseline_context:
664
+ # Extract baseline context from RAI service format
665
+ context_dict = {"content": baseline_context}
666
+ if message.get("tool_name"):
667
+ context_dict["tool_name"] = message["tool_name"]
668
+ if message.get("context_type"):
669
+ context_dict["context_type"] = message["context_type"]
670
+ baseline_contexts = [context_dict]
671
+
672
+ # Check if baseline contexts have agent fields (context_type, tool_name)
673
+ baseline_contexts_with_agent_fields = []
674
+ baseline_contexts_without_agent_fields = []
675
+
676
+ for ctx in baseline_contexts:
677
+ if isinstance(ctx, dict):
678
+ if "context_type" in ctx or "tool_name" in ctx:
679
+ # This baseline context has agent fields - preserve it separately
680
+ baseline_contexts_with_agent_fields.append(ctx)
681
+ self.logger.debug(
682
+ f"Found baseline context with agent fields: tool_name={ctx.get('tool_name')}, context_type={ctx.get('context_type')}"
683
+ )
684
+ else:
685
+ # This baseline context has no agent fields - can be embedded
686
+ baseline_contexts_without_agent_fields.append(ctx)
687
+ else:
688
+ baseline_contexts_without_agent_fields.append({"content": str(ctx)})
689
+
690
+ # For baseline contexts without agent fields, embed them in the attack content
691
+ if baseline_contexts_without_agent_fields:
692
+ context_texts = [
693
+ ctx.get("content", "")
694
+ for ctx in baseline_contexts_without_agent_fields
695
+ if ctx.get("content")
696
+ ]
697
+ if context_texts:
698
+ combined_context = "\n\n".join(context_texts)
699
+ baseline_attack_content = f"{baseline_attack_content}\n\nContext:\n{combined_context}"
700
+ self.logger.debug(
701
+ f"Embedded {len(context_texts)} baseline context(s) without agent fields into attack content"
702
+ )
703
+
704
+ # Randomly select an XPIA wrapper prompt
705
+ xpia_prompt = random.choice(xpia_prompts)
706
+ xpia_message = xpia_prompt.get("messages", [{}])[0]
707
+
708
+ # Extract XPIA wrapper structure
709
+ user_query = xpia_message.get("content", "")
710
+ attack_vehicle_context = xpia_message.get("context", "")
711
+ context_type = xpia_message.get("context_type") or "text"
712
+ tool_name = xpia_message.get("tool_name", "")
713
+
714
+ # Inject baseline attack (now with appended context) into the {attack_text} placeholder
715
+ if "{attack_text}" in attack_vehicle_context:
716
+ injected_context = attack_vehicle_context.replace("{attack_text}", baseline_attack_content)
717
+ else:
718
+ # No placeholder found, append to end
719
+ injected_context = f"{attack_vehicle_context}\n\n{baseline_attack_content}"
720
+
721
+ # Apply modality-based formatting
722
+ formatted_context = format_content_by_modality(injected_context, context_type)
723
+
724
+ # Update the message with benign user query
725
+ message["content"] = user_query
726
+
727
+ # Build the contexts list: XPIA context + any baseline contexts with agent fields
728
+ contexts = [
729
+ {"content": formatted_context, "context_type": context_type, "tool_name": tool_name}
730
+ ]
731
+
732
+ # Add baseline contexts with agent fields as separate context entries
733
+ if baseline_contexts_with_agent_fields:
734
+ contexts.extend(baseline_contexts_with_agent_fields)
735
+ self.logger.debug(
736
+ f"Preserved {len(baseline_contexts_with_agent_fields)} baseline context(s) with agent fields"
737
+ )
738
+
739
+ message["context"] = contexts
740
+ message["context_type"] = (
741
+ context_type # Keep at message level for backward compat (XPIA primary)
742
+ )
743
+ message["tool_name"] = tool_name
744
+
745
+ self.logger.debug(
746
+ f"Wrapped baseline attack in XPIA: total contexts={len(contexts)}, xpia_tool={tool_name}, xpia_type={context_type}"
747
+ )
748
+
749
+ except Exception as e:
750
+ self.logger.error(f"Error applying XPIA prompts: {str(e)}")
751
+ self.logger.warning("XPIA prompt application failed, returning original objectives")
752
+
753
+ return objectives_list
754
+
487
755
  async def _apply_jailbreak_prefixes(self, objectives_list: List) -> List:
488
756
  """Apply jailbreak prefixes to objectives."""
489
757
  self.logger.debug("Applying jailbreak prefixes to objectives")
@@ -521,10 +789,52 @@ class RedTeam:
521
789
 
522
790
  if baseline_objective_ids:
523
791
  self.logger.debug(f"Filtering by {len(baseline_objective_ids)} baseline objective IDs for {strategy}")
524
- selected_cat_objectives = [
525
- obj for obj in objectives_response if obj.get("id") in baseline_objective_ids
526
- ]
527
- self.logger.debug(f"Found {len(selected_cat_objectives)} matching objectives with baseline IDs")
792
+ # Filter by baseline IDs
793
+ filtered_objectives = [obj for obj in objectives_response if obj.get("id") in baseline_objective_ids]
794
+ self.logger.debug(f"Found {len(filtered_objectives)} matching objectives with baseline IDs")
795
+
796
+ # For strategies like indirect_jailbreak, the RAI service may return multiple
797
+ # objectives per baseline ID (e.g., multiple XPIA variations for one baseline objective).
798
+ # We should select num_objectives total, ensuring each baseline objective gets an XPIA attack.
799
+ # Group by baseline ID and select one objective per baseline ID up to num_objectives.
800
+ selected_by_id = {}
801
+ for obj in filtered_objectives:
802
+ obj_id = obj.get("id")
803
+ if obj_id not in selected_by_id:
804
+ selected_by_id[obj_id] = []
805
+ selected_by_id[obj_id].append(obj)
806
+
807
+ # Select objectives to match num_objectives
808
+ selected_cat_objectives = []
809
+ baseline_ids = list(selected_by_id.keys())
810
+
811
+ # If we have enough baseline IDs to cover num_objectives, select one per baseline ID
812
+ if len(baseline_ids) >= num_objectives:
813
+ # Select from the first num_objectives baseline IDs
814
+ for i in range(num_objectives):
815
+ obj_id = baseline_ids[i]
816
+ selected_cat_objectives.append(random.choice(selected_by_id[obj_id]))
817
+ else:
818
+ # If we have fewer baseline IDs than num_objectives, select all and cycle through
819
+ for i in range(num_objectives):
820
+ obj_id = baseline_ids[i % len(baseline_ids)]
821
+ # For repeated IDs, try to select different variations if available
822
+ available_variations = selected_by_id[obj_id].copy()
823
+ # Remove already selected variations for this baseline ID
824
+ already_selected = [obj for obj in selected_cat_objectives if obj.get("id") == obj_id]
825
+ for selected_obj in already_selected:
826
+ if selected_obj in available_variations:
827
+ available_variations.remove(selected_obj)
828
+
829
+ if available_variations:
830
+ selected_cat_objectives.append(random.choice(available_variations))
831
+ else:
832
+ # If no more variations, reuse one (shouldn't happen with proper XPIA generation)
833
+ selected_cat_objectives.append(random.choice(selected_by_id[obj_id]))
834
+
835
+ self.logger.debug(
836
+ f"Selected {len(selected_cat_objectives)} objectives from {len(baseline_ids)} baseline IDs and {len(filtered_objectives)} total variations for {strategy} strategy"
837
+ )
528
838
  else:
529
839
  self.logger.warning("No baseline objective IDs found, using random selection")
530
840
  selected_cat_objectives = random.sample(
@@ -543,17 +853,87 @@ class RedTeam:
543
853
  return selected_cat_objectives
544
854
 
545
855
  def _extract_objective_content(self, selected_objectives: List) -> List[str]:
546
- """Extract content from selected objectives."""
856
+ """Extract content from selected objectives and build prompt-to-context mapping."""
547
857
  selected_prompts = []
548
858
  for obj in selected_objectives:
859
+ risk_subtype = None
860
+ # Extract risk-subtype from target_harms if present
861
+ target_harms = obj.get("metadata", {}).get("target_harms", [])
862
+ if target_harms and isinstance(target_harms, list):
863
+ for harm in target_harms:
864
+ if isinstance(harm, dict) and "risk-subtype" in harm:
865
+ subtype_value = harm.get("risk-subtype")
866
+ if subtype_value:
867
+ risk_subtype = subtype_value
868
+ break
549
869
  if "messages" in obj and len(obj["messages"]) > 0:
550
870
  message = obj["messages"][0]
551
871
  if isinstance(message, dict) and "content" in message:
552
872
  content = message["content"]
553
- context = message.get("context", "")
873
+ context_raw = message.get("context", "")
874
+ # TODO is first if necessary?
875
+ # Normalize context to always be a list of dicts with 'content' key
876
+ if isinstance(context_raw, list):
877
+ # Already a list - ensure each item is a dict with 'content' key
878
+ contexts = []
879
+ for ctx in context_raw:
880
+ if isinstance(ctx, dict) and "content" in ctx:
881
+ # Preserve all keys including context_type, tool_name if present
882
+ contexts.append(ctx)
883
+ elif isinstance(ctx, str):
884
+ contexts.append({"content": ctx})
885
+ elif context_raw:
886
+ # Single string value - wrap in dict
887
+ contexts = [{"content": context_raw}]
888
+ if message.get("tool_name"):
889
+ contexts[0]["tool_name"] = message["tool_name"]
890
+ if message.get("context_type"):
891
+ contexts[0]["context_type"] = message["context_type"]
892
+ else:
893
+ contexts = []
894
+
895
+ # Check if any context has agent-specific fields
896
+ has_agent_fields = any(
897
+ isinstance(ctx, dict) and ("context_type" in ctx or "tool_name" in ctx) for ctx in contexts
898
+ )
899
+
900
+ # For contexts without agent fields, append them to the content
901
+ # This applies to baseline and any other attack objectives with plain context
902
+ if contexts and not has_agent_fields:
903
+ # Extract all context content and append to the attack content
904
+ context_texts = []
905
+ for ctx in contexts:
906
+ if isinstance(ctx, dict):
907
+ ctx_content = ctx.get("content", "")
908
+ if ctx_content:
909
+ context_texts.append(ctx_content)
910
+
911
+ if context_texts:
912
+ # Append context to content
913
+ combined_context = "\n\n".join(context_texts)
914
+ content = f"{content}\n\nContext:\n{combined_context}"
915
+ self.logger.debug(
916
+ f"Appended {len(context_texts)} context source(s) to attack content (total context length={len(combined_context)})"
917
+ )
918
+
554
919
  selected_prompts.append(content)
555
- # Store mapping of content to context for later evaluation
556
- self.prompt_to_context[content] = context
920
+
921
+ # Store risk_subtype mapping if it exists
922
+ if risk_subtype:
923
+ self.prompt_to_risk_subtype[content] = risk_subtype
924
+
925
+ # Always store contexts if they exist (whether or not they have agent fields)
926
+ if contexts:
927
+ context_dict = {"contexts": contexts}
928
+ if has_agent_fields:
929
+ self.logger.debug(f"Stored context with agent fields: {len(contexts)} context source(s)")
930
+ else:
931
+ self.logger.debug(
932
+ f"Stored context without agent fields: {len(contexts)} context source(s) (also embedded in content)"
933
+ )
934
+ self.prompt_to_context[content] = context_dict
935
+ else:
936
+ self.logger.debug(f"No context to store")
557
937
  return selected_prompts
558
938
 
559
939
  def _cache_attack_objectives(
@@ -573,6 +953,18 @@ class RedTeam:
573
953
  target_harms = obj.get("metadata", {}).get("target_harms", [])
574
954
  content = ""
575
955
  context = ""
956
+ risk_subtype = None
957
+
958
+ # Extract risk-subtype from target_harms if present
959
+ if target_harms and isinstance(target_harms, list):
960
+ for harm in target_harms:
961
+ if isinstance(harm, dict) and "risk-subtype" in harm:
962
+ subtype_value = harm.get("risk-subtype")
963
+ # Only store non-empty risk-subtype values
964
+ if subtype_value:
965
+ risk_subtype = subtype_value
966
+ break # Use the first non-empty risk-subtype found
967
+
576
968
  if "messages" in obj and len(obj["messages"]) > 0:
577
969
 
578
970
  message = obj["messages"][0]
@@ -580,6 +972,9 @@ class RedTeam:
580
972
  context = message.get("context", "")
581
973
  if content:
582
974
  obj_data = {"id": obj_id, "content": content, "context": context}
975
+ # Add risk_subtype to obj_data if it exists
976
+ if risk_subtype:
977
+ obj_data["risk_subtype"] = risk_subtype
583
978
  objectives_by_category[risk_cat_value].append(obj_data)
584
979
 
585
980
  self.attack_objectives[current_key] = {
@@ -786,6 +1181,13 @@ class RedTeam:
786
1181
  run_id_override = kwargs.get("run_id") or kwargs.get("runId")
787
1182
  eval_id_override = kwargs.get("eval_id") or kwargs.get("evalId")
788
1183
  created_at_override = kwargs.get("created_at") or kwargs.get("createdAt")
1184
+ taxonomy_risk_categories = kwargs.get("taxonomy_risk_categories") # key is risk category value is taxonomy
1185
+ _app_insights_configuration = kwargs.get("_app_insights_configuration")
1186
+ self._app_insights_configuration = _app_insights_configuration
1187
+ self.taxonomy_risk_categories = taxonomy_risk_categories or {}
1188
+ is_agent_target: Optional[bool] = kwargs.get("is_agent_target", False)
1189
+ client_id: Optional[str] = kwargs.get("client_id")
1190
+
789
1191
  with UserAgentSingleton().add_useragent_product(user_agent):
790
1192
  # Initialize scan
791
1193
  self._initialize_scan(scan_name, application_scenario)
@@ -834,6 +1236,19 @@ class RedTeam:
834
1236
  self.risk_categories = self.attack_objective_generator.risk_categories
835
1237
  self.result_processor.risk_categories = self.risk_categories
836
1238
 
1239
+ # Validate risk categories for target type
1240
+ if not is_agent_target:
1241
+ # Check if any agent-only risk categories are used with model targets
1242
+ for risk_cat in self.risk_categories:
1243
+ if risk_cat == RiskCategory.SensitiveDataLeakage:
1244
+ raise EvaluationException(
1245
+ message=f"Risk category '{risk_cat.value}' is only available for agent targets",
1246
+ internal_message=f"Risk category {risk_cat.value} requires agent target",
1247
+ target=ErrorTarget.RED_TEAM,
1248
+ category=ErrorCategory.INVALID_VALUE,
1249
+ blame=ErrorBlame.USER_ERROR,
1250
+ )
1251
+
837
1252
  # Show risk categories to user
838
1253
  tqdm.write(f"📊 Risk categories: {[rc.value for rc in self.risk_categories]}")
839
1254
  self.logger.info(f"Risk categories to process: {[rc.value for rc in self.risk_categories]}")
@@ -862,9 +1277,11 @@ class RedTeam:
862
1277
  self._initialize_tracking_dict(flattened_attack_strategies)
863
1278
 
864
1279
  # Fetch attack objectives
865
- all_objectives = await self._fetch_all_objectives(flattened_attack_strategies, application_scenario)
1280
+ all_objectives = await self._fetch_all_objectives(
1281
+ flattened_attack_strategies, application_scenario, is_agent_target, client_id
1282
+ )
866
1283
 
867
- chat_target = get_chat_target(target, self.prompt_to_context)
1284
+ chat_target = get_chat_target(target)
868
1285
  self.chat_target = chat_target
869
1286
 
870
1287
  # Execute attacks
@@ -881,7 +1298,7 @@ class RedTeam:
881
1298
  )
882
1299
 
883
1300
  # Process and return results
884
- return await self._finalize_results(skip_upload, skip_evals, eval_run, output_path)
1301
+ return await self._finalize_results(skip_upload, skip_evals, eval_run, output_path, scan_name)
885
1302
 
886
1303
  def _initialize_scan(self, scan_name: Optional[str], application_scenario: Optional[str]):
887
1304
  """Initialize scan-specific variables."""
@@ -953,11 +1370,10 @@ class RedTeam:
953
1370
  )
954
1371
  raise ValueError("MultiTurn and Crescendo strategies are not compatible with multiple attack strategies.")
955
1372
  if AttackStrategy.Tense in flattened_attack_strategies and (
956
- RiskCategory.IndirectAttack in self.risk_categories
957
- or RiskCategory.UngroundedAttributes in self.risk_categories
1373
+ RiskCategory.UngroundedAttributes in self.risk_categories
958
1374
  ):
959
1375
  self.logger.warning(
960
- "Tense strategy is not compatible with IndirectAttack or UngroundedAttributes risk categories. Skipping Tense strategy."
1376
+ "Tense strategy is not compatible with UngroundedAttributes risk categories. Skipping Tense strategy."
961
1377
  )
962
1378
  raise ValueError(
963
1379
  "Tense strategy is not compatible with IndirectAttack or UngroundedAttributes risk categories."
@@ -977,7 +1393,13 @@ class RedTeam:
977
1393
  "status": TASK_STATUS["PENDING"],
978
1394
  }
979
1395
 
980
- async def _fetch_all_objectives(self, flattened_attack_strategies: List, application_scenario: str) -> Dict:
1396
+ async def _fetch_all_objectives(
1397
+ self,
1398
+ flattened_attack_strategies: List,
1399
+ application_scenario: str,
1400
+ is_agent_target: bool,
1401
+ client_id: Optional[str] = None,
1402
+ ) -> Dict:
981
1403
  """Fetch all attack objectives for all strategies and risk categories."""
982
1404
  log_section_header(self.logger, "Fetching attack objectives")
983
1405
  all_objectives = {}
@@ -989,6 +1411,8 @@ class RedTeam:
989
1411
  risk_category=risk_category,
990
1412
  application_scenario=application_scenario,
991
1413
  strategy="baseline",
1414
+ is_agent_target=is_agent_target,
1415
+ client_id=client_id,
992
1416
  )
993
1417
  if "baseline" not in all_objectives:
994
1418
  all_objectives["baseline"] = {}
@@ -1012,6 +1436,8 @@ class RedTeam:
1012
1436
  risk_category=risk_category,
1013
1437
  application_scenario=application_scenario,
1014
1438
  strategy=strategy_name,
1439
+ is_agent_target=is_agent_target,
1440
+ client_id=client_id,
1015
1441
  )
1016
1442
  all_objectives[strategy_name][risk_category.value] = objectives
1017
1443
 
@@ -1113,47 +1539,78 @@ class RedTeam:
1113
1539
  self.logger.error(f"Error processing task {i+1}: {str(e)}")
1114
1540
  continue
1115
1541
 
1116
- async def _finalize_results(self, skip_upload: bool, skip_evals: bool, eval_run, output_path: str) -> RedTeamResult:
1542
+ async def _finalize_results(
1543
+ self, skip_upload: bool, skip_evals: bool, eval_run, output_path: str, scan_name: str
1544
+ ) -> RedTeamResult:
1117
1545
  """Process and finalize scan results."""
1118
1546
  log_section_header(self.logger, "Processing results")
1119
1547
 
1120
- # Convert results to RedTeamResult
1121
- red_team_result = self.result_processor.to_red_team_result(self.red_team_info)
1122
-
1123
- output = RedTeamResult(
1124
- scan_result=red_team_result,
1125
- attack_details=red_team_result["attack_details"],
1548
+ # Convert results to RedTeamResult (now builds AOAI summary internally)
1549
+ red_team_result = self.result_processor.to_red_team_result(
1550
+ red_team_info=self.red_team_info,
1551
+ eval_run=eval_run,
1552
+ scan_name=scan_name,
1126
1553
  )
1127
1554
 
1555
+ # Extract AOAI summary for passing to MLflow logging
1556
+ aoai_summary = red_team_result.scan_result.get("AOAI_Compatible_Summary")
1557
+ if self._app_insights_configuration:
1558
+ emit_eval_result_events_to_app_insights(
1559
+ self._app_insights_configuration, aoai_summary["output_items"]["data"]
1560
+ )
1128
1561
  # Log results to MLFlow if not skipping upload
1129
1562
  if not skip_upload:
1130
1563
  self.logger.info("Logging results to AI Foundry")
1131
1564
  await self.mlflow_integration.log_redteam_results_to_mlflow(
1132
- redteam_result=output, eval_run=eval_run, red_team_info=self.red_team_info, _skip_evals=skip_evals
1565
+ redteam_result=red_team_result,
1566
+ eval_run=eval_run,
1567
+ red_team_info=self.red_team_info,
1568
+ _skip_evals=skip_evals,
1569
+ aoai_summary=aoai_summary,
1133
1570
  )
1134
-
1135
1571
  # Write output to specified path
1136
- if output_path and output.scan_result:
1572
+ if output_path and red_team_result.scan_result:
1137
1573
  abs_output_path = output_path if os.path.isabs(output_path) else os.path.abspath(output_path)
1138
1574
  self.logger.info(f"Writing output to {abs_output_path}")
1139
- _write_output(abs_output_path, output.scan_result)
1575
+
1576
+ # Ensure output_path is treated as a directory
1577
+ # If it exists as a file, remove it first
1578
+ if os.path.exists(abs_output_path) and not os.path.isdir(abs_output_path):
1579
+ os.remove(abs_output_path)
1580
+ os.makedirs(abs_output_path, exist_ok=True)
1581
+
1582
+ # Create a copy of scan_result without AOAI properties for eval_result.json
1583
+ scan_result_without_aoai = {
1584
+ key: value
1585
+ for key, value in red_team_result.scan_result.items()
1586
+ if not key.startswith("AOAI_Compatible")
1587
+ }
1588
+
1589
+ # Write scan result without AOAI properties to eval_result.json
1590
+ _write_output(abs_output_path, scan_result_without_aoai)
1591
+
1592
+ # Write the AOAI summary to results.json
1593
+ if aoai_summary:
1594
+ _write_output(os.path.join(abs_output_path, "results.json"), aoai_summary)
1595
+ else:
1596
+ self.logger.warning("AOAI summary not available for output_path write")
1140
1597
 
1141
1598
  # Also save a copy to the scan output directory if available
1142
1599
  if self.scan_output_dir:
1143
1600
  final_output = os.path.join(self.scan_output_dir, "final_results.json")
1144
- _write_output(final_output, output.scan_result)
1145
- elif output.scan_result and self.scan_output_dir:
1601
+ _write_output(final_output, red_team_result.scan_result)
1602
+ elif red_team_result.scan_result and self.scan_output_dir:
1146
1603
  # If no output_path was specified but we have scan_output_dir, save there
1147
1604
  final_output = os.path.join(self.scan_output_dir, "final_results.json")
1148
- _write_output(final_output, output.scan_result)
1605
+ _write_output(final_output, red_team_result.scan_result)
1149
1606
 
1150
1607
  # Display final scorecard and results
1151
- if output.scan_result:
1152
- scorecard = format_scorecard(output.scan_result)
1608
+ if red_team_result.scan_result:
1609
+ scorecard = format_scorecard(red_team_result.scan_result)
1153
1610
  tqdm.write(scorecard)
1154
1611
 
1155
1612
  # Print URL for detailed results
1156
- studio_url = output.scan_result.get("studio_url", "")
1613
+ studio_url = red_team_result.scan_result.get("studio_url", "")
1157
1614
  if studio_url:
1158
1615
  tqdm.write(f"\nDetailed results available at:\n{studio_url}")
1159
1616
 
@@ -1170,4 +1627,4 @@ class RedTeam:
1170
1627
  handler.close()
1171
1628
  self.logger.removeHandler(handler)
1172
1629
 
1173
- return output
1630
+ return red_team_result