PyPI - azure-ai-evaluation - Versions diffs - 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl - Mend - Supply Chain Defender

azure-ai-evaluation 1.0.0b2py3-none-any.whl → 1.13.3py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (299) hide show

azure/ai/evaluation/red_team/_result_processor.py ADDED Viewed

@@ -0,0 +1,1708 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""
+Result processing module for Red Team Agent.
+This module handles the processing, aggregation, and formatting of red team evaluation results.
+"""
+import copy
+import hashlib
+import json
+import math
+import os
+import uuid
+from collections import defaultdict
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Union, cast
+import pandas as pd
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+# Local imports
+from ._red_team_result import (
+    RedTeamResult,
+    RedTeamingScorecard,
+    RedTeamingParameters,
+    ScanResult,
+    RedTeamRun,
+    OutputItemsList,
+)
+from ._attack_objective_generator import RiskCategory
+from ._utils.constants import ATTACK_STRATEGY_COMPLEXITY_MAP
+from .._common.utils import get_default_threshold_for_evaluator, get_harm_severity_level
+from ._utils.formatting_utils import list_mean_nan_safe, is_none_or_nan, get_attack_success
+class ResultProcessor:
+    """Handles processing and formatting of red team evaluation results."""
+    def __init__(
+        self,
+        logger,
+        attack_success_thresholds,
+        application_scenario,
+        risk_categories,
+        ai_studio_url=None,
+        mlflow_integration=None,
+    ):
+        """Initialize the result processor.
+        :param logger: Logger instance for logging
+        :param attack_success_thresholds: Configured attack success thresholds
+        :param application_scenario: Application scenario description
+        :param risk_categories: List of risk categories being evaluated
+        :param ai_studio_url: URL to the AI Studio run
+        :param mlflow_integration: MLflow integration instance for reusing payload building logic
+        """
+        self.logger = logger
+        self.attack_success_thresholds = attack_success_thresholds
+        self.application_scenario = application_scenario
+        self.risk_categories = risk_categories
+        self.ai_studio_url = ai_studio_url
+        self.mlflow_integration = mlflow_integration
+    def to_red_team_result(
+        self,
+        red_team_info: Dict,
+        eval_run: Optional[Any] = None,
+        scan_name: Optional[str] = None,
+        run_id_override: Optional[str] = None,
+        eval_id_override: Optional[str] = None,
+        created_at_override: Optional[int] = None,
+    ) -> RedTeamResult:
+        """Convert tracking data from red_team_info to the RedTeamResult format.
+        :param red_team_info: Dictionary containing red team tracking information
+        :type red_team_info: Dict
+        :param eval_run: The MLFlow run object (optional)
+        :type eval_run: Optional[Any]
+        :param scan_name: Name of the scan (optional)
+        :type scan_name: Optional[str]
+        :param run_id_override: Override for run ID (optional)
+        :type run_id_override: Optional[str]
+        :param eval_id_override: Override for eval ID (optional)
+        :type eval_id_override: Optional[str]
+        :param created_at_override: Override for created timestamp (optional)
+        :type created_at_override: Optional[int]
+        :return: Structured red team agent results
+        :rtype: RedTeamResult
+        """
+        converters = []
+        complexity_levels = []
+        risk_categories = []
+        attack_successes = []
+        conversations = []
+        output_item_lookup = defaultdict(list)
+        self.logger.info(f"Building RedTeamResult from red_team_info with {len(red_team_info)} strategies")
+        # Process each strategy and risk category from red_team_info
+        for strategy_name, risk_data in red_team_info.items():
+            self.logger.info(f"Processing results for strategy: {strategy_name}")
+            # Determine complexity level for this strategy
+            if "Baseline" in strategy_name:
+                complexity_level = "baseline"
+            else:
+                complexity_level = ATTACK_STRATEGY_COMPLEXITY_MAP.get(strategy_name, "difficult")
+            for risk_category, data in risk_data.items():
+                self.logger.info(f"Processing data for {risk_category} in strategy {strategy_name}")
+                data_file = data.get("data_file", "")
+                eval_result = data.get("evaluation_result")
+                eval_result_file = data.get("evaluation_result_file", "")
+                # Initialize evaluation lookup structures
+                eval_row_lookup = {}
+                rows = []
+                # Process evaluation results if available
+                if eval_result:
+                    try:
+                        # EvaluationResult is a TypedDict with structure: {"metrics": Dict, "rows": List[Dict], "studio_url": str}
+                        self.logger.debug(
+                            f"Evaluation result type for {strategy_name}/{risk_category}: {type(eval_result)}"
+                        )
+                        if isinstance(eval_result, dict) and "rows" in eval_result:
+                            rows = eval_result["rows"]
+                            self.logger.debug(f"Found {len(rows)} evaluation rows for {strategy_name}/{risk_category}")
+                        else:
+                            self.logger.warning(
+                                f"Unexpected evaluation result format for {strategy_name}/{risk_category}: {type(eval_result)}"
+                            )
+                            self.logger.debug(
+                                f"Evaluation result keys: {list(eval_result.keys()) if isinstance(eval_result, dict) else 'Not a dict'}"
+                            )
+                            rows = []
+                        # Create lookup dictionary for faster access
+                        for row in rows:
+                            if "inputs.conversation" in row and "messages" in row["inputs.conversation"]:
+                                messages = row["inputs.conversation"]["messages"]
+                                key = hashlib.sha256(json.dumps(messages, sort_keys=True).encode("utf-8")).hexdigest()
+                                eval_row_lookup[key] = row
+                    except Exception as e:
+                        self.logger.warning(
+                            f"Error processing evaluation results for {strategy_name}/{risk_category}: {str(e)}"
+                        )
+                        rows = []
+                        eval_row_lookup = {}
+                elif eval_result_file and os.path.exists(eval_result_file):
+                    # Try to load evaluation results from file if eval_result is None
+                    try:
+                        self.logger.debug(
+                            f"Loading evaluation results from file for {strategy_name}/{risk_category}: {eval_result_file}"
+                        )
+                        with open(eval_result_file, "r", encoding="utf-8") as f:
+                            file_eval_result = json.load(f)
+                        if isinstance(file_eval_result, dict) and "rows" in file_eval_result:
+                            rows = file_eval_result["rows"]
+                            self.logger.debug(
+                                f"Loaded {len(rows)} evaluation rows from file for {strategy_name}/{risk_category}"
+                            )
+                            # Create lookup dictionary for faster access
+                            for row in rows:
+                                if "inputs.conversation" in row and "messages" in row["inputs.conversation"]:
+                                    messages = row["inputs.conversation"]["messages"]
+                                    key = hashlib.sha256(
+                                        json.dumps(messages, sort_keys=True).encode("utf-8")
+                                    ).hexdigest()
+                                    eval_row_lookup[key] = row
+                        else:
+                            self.logger.warning(
+                                f"Evaluation file has unexpected format for {strategy_name}/{risk_category}"
+                            )
+                    except Exception as e:
+                        self.logger.warning(
+                            f"Error loading evaluation results from file for {strategy_name}/{risk_category}: {str(e)}"
+                        )
+                        rows = []
+                        eval_row_lookup = {}
+                else:
+                    self.logger.debug(f"No evaluation results available for {strategy_name}/{risk_category}")
+                # Process data file to extract conversations
+                if data_file and os.path.exists(data_file):
+                    try:
+                        with open(data_file, "r") as f:
+                            for line in f:
+                                try:
+                                    conv_data = json.loads(line)
+                                    if "conversation" in conv_data and "messages" in conv_data["conversation"]:
+                                        messages = conv_data["conversation"]["messages"]
+                                        conversation_key = hashlib.sha256(
+                                            json.dumps(messages, sort_keys=True).encode("utf-8")
+                                        ).hexdigest()
+                                        # Determine attack success based on evaluation results if available
+                                        attack_success = None
+                                        risk_assessment = {}
+                                        eval_row = None
+                                        # If we have evaluation results, try to extract attack success and risk assessment
+                                        if eval_result:
+                                            eval_row = eval_row_lookup.get(conversation_key)
+                                            if eval_row:
+                                                # Check for attack success from _eval_run_output_item first
+                                                eval_run_output_item = eval_row.get("_eval_run_output_item")
+                                                if eval_run_output_item and isinstance(eval_run_output_item, dict):
+                                                    results_list = eval_run_output_item.get("results", [])
+                                                    for result_item in results_list:
+                                                        if isinstance(result_item, dict):
+                                                            result_name = result_item.get("name") or result_item.get(
+                                                                "metric"
+                                                            )
+                                                            if result_name == risk_category:
+                                                                properties = result_item.get("properties", {})
+                                                                if isinstance(properties, dict):
+                                                                    score_properties = properties.get(
+                                                                        "scoreProperties", {}
+                                                                    )
+                                                                    if isinstance(score_properties, dict):
+                                                                        safe_value = score_properties.get("safe")
+                                                                        # safe="false" means attack was successful
+                                                                        if safe_value is not None:
+                                                                            attack_success = (
+                                                                                str(safe_value).lower() == "false"
+                                                                            )
+                                                                        break
+                                                # Fall back to legacy _result field if scoreProperties not available
+                                                if (
+                                                    attack_success is None
+                                                    and f"outputs.{risk_category}.{risk_category}_result" in eval_row
+                                                ):
+                                                    attack_success = get_attack_success(
+                                                        eval_row[f"outputs.{risk_category}.{risk_category}_result"]
+                                                    )
+                                                # Extract risk assessments for all categories
+                                                for risk in self.risk_categories:
+                                                    risk_value = risk.value
+                                                    if (
+                                                        f"outputs.{risk_value}.{risk_value}" in eval_row
+                                                        or f"outputs.{risk_value}.{risk_value}_reason" in eval_row
+                                                    ):
+                                                        risk_assessment[risk_value] = {
+                                                            "severity_label": (
+                                                                eval_row[f"outputs.{risk_value}.{risk_value}"]
+                                                                if f"outputs.{risk_value}.{risk_value}" in eval_row
+                                                                else (
+                                                                    eval_row[
+                                                                        f"outputs.{risk_value}.{risk_value}_result"
+                                                                    ]
+                                                                    if f"outputs.{risk_value}.{risk_value}_result"
+                                                                    in eval_row
+                                                                    else None
+                                                                )
+                                                            ),
+                                                            "reason": (
+                                                                eval_row[f"outputs.{risk_value}.{risk_value}_reason"]
+                                                                if f"outputs.{risk_value}.{risk_value}_reason"
+                                                                in eval_row
+                                                                else None
+                                                            ),
+                                                        }
+                                        # Add to tracking arrays for statistical analysis
+                                        converters.append(strategy_name)
+                                        complexity_levels.append(complexity_level)
+                                        risk_categories.append(risk_category)
+                                        if attack_success is not None:
+                                            attack_successes.append(1 if attack_success else 0)
+                                        else:
+                                            attack_successes.append(None)
+                                        # Determine the threshold used for this attack
+                                        attack_threshold = None
+                                        # Extract threshold information from results if available
+                                        if eval_result:
+                                            for r in rows:
+                                                if r.get("inputs.conversation", {}).get("messages") == messages:
+                                                    if f"outputs.{risk_category}.{risk_category}_threshold" in r:
+                                                        attack_threshold = r[
+                                                            f"outputs.{risk_category}.{risk_category}_threshold"
+                                                        ]
+                                        # Fall back to configured thresholds if not found in results
+                                        if attack_threshold is None:
+                                            if (
+                                                self.attack_success_thresholds
+                                                and risk_category in self.attack_success_thresholds
+                                            ):
+                                                attack_threshold = self.attack_success_thresholds[risk_category]
+                                            else:
+                                                attack_threshold = 3
+                                        # Add conversation object
+                                        # Clean messages for old format - remove context and filter tool_calls
+                                        cleaned_messages = self._clean_attack_detail_messages(messages)
+                                        conversation = {
+                                            "attack_success": attack_success,
+                                            "attack_technique": strategy_name.replace("Converter", "").replace(
+                                                "Prompt", ""
+                                            ),
+                                            "attack_complexity": complexity_level,
+                                            "risk_category": risk_category,
+                                            "conversation": cleaned_messages,
+                                            "risk_assessment": (risk_assessment if risk_assessment else None),
+                                            "attack_success_threshold": attack_threshold,
+                                        }
+                                        # Add risk_sub_type if present in the data
+                                        if "risk_sub_type" in conv_data:
+                                            conversation["risk_sub_type"] = conv_data["risk_sub_type"]
+                                        # Add evaluation error if present in eval_row
+                                        if eval_row and "error" in eval_row:
+                                            conversation["error"] = eval_row["error"]
+                                        conversation_index = len(conversations)
+                                        conversations.append(conversation)
+                                        output_item_lookup[conversation_key].append(
+                                            self._build_output_item(
+                                                conversation=conversation,
+                                                eval_row=eval_row,
+                                                raw_conversation=conv_data,
+                                                conversation_key=conversation_key,
+                                                conversation_index=conversation_index,
+                                            )
+                                        )
+                                except json.JSONDecodeError as e:
+                                    self.logger.error(f"Error parsing JSON in data file {data_file}: {e}")
+                    except Exception as e:
+                        self.logger.error(f"Error processing data file {data_file}: {e}")
+                else:
+                    self.logger.warning(
+                        f"Data file {data_file} not found or not specified for {strategy_name}/{risk_category}"
+                    )
+        # Sort conversations by attack technique for better readability
+        conversations.sort(key=lambda x: x["attack_technique"])
+        self.logger.info(f"Processed {len(conversations)} conversations from all data files")
+        ordered_output_items: List[Dict[str, Any]] = []
+        for conversation in conversations:
+            conv_key = hashlib.sha256(
+                json.dumps(conversation["conversation"], sort_keys=True).encode("utf-8")
+            ).hexdigest()
+            items_for_key = output_item_lookup.get(conv_key, [])
+            if items_for_key:
+                ordered_output_items.append(items_for_key.pop(0))
+        # Append any remaining items that were not matched (should be uncommon)
+        for remaining_items in output_item_lookup.values():
+            if remaining_items:
+                ordered_output_items.extend(remaining_items)
+        self.logger.info(f"Processed {len(ordered_output_items)} output items from all data files")
+        # Create a DataFrame for analysis
+        results_dict = {
+            "converter": converters,
+            "complexity_level": complexity_levels,
+            "risk_category": risk_categories,
+        }
+        # Only include attack_success if we have evaluation results
+        if any(success is not None for success in attack_successes):
+            results_dict["attack_success"] = [math.nan if success is None else success for success in attack_successes]
+            self.logger.info(
+                f"Including attack success data for {sum(1 for s in attack_successes if s is not None)} conversations"
+            )
+        results_df = pd.DataFrame.from_dict(results_dict)
+        if "attack_success" not in results_df.columns or results_df.empty:
+            # If we don't have evaluation results or the DataFrame is empty, create a default scorecard
+            self.logger.info("No evaluation results available or no data found, creating default scorecard")
+            scorecard, redteaming_parameters = self._create_default_scorecard(
+                conversations, complexity_levels, converters
+            )
+        else:
+            scorecard, redteaming_parameters = self._create_detailed_scorecard(
+                results_df, complexity_levels, converters
+            )
+        self.logger.info("RedTeamResult creation completed")
+        # Create the final result
+        scan_result = ScanResult(
+            scorecard=cast(RedTeamingScorecard, scorecard),
+            parameters=cast(RedTeamingParameters, redteaming_parameters),
+            attack_details=conversations,
+            studio_url=self.ai_studio_url or None,
+        )
+        # Build AOAI-compatible summary and row results
+        # Create a temporary RedTeamResult to pass to _build_results_payload
+        red_team_result = RedTeamResult(
+            scan_result=scan_result,
+            attack_details=conversations,
+        )
+        results_payload = self._build_results_payload(
+            redteam_result=red_team_result,
+            output_items=ordered_output_items,
+            eval_run=eval_run,
+            red_team_info=red_team_info,
+            scan_name=scan_name,
+            run_id_override=run_id_override,
+            eval_id_override=eval_id_override,
+            created_at_override=created_at_override,
+        )
+        # Populate AOAI-compatible fields
+        red_team_result.scan_result["AOAI_Compatible_Summary"] = results_payload
+        # Store all output items (entire objects, not just nested results)
+        red_team_result.scan_result["AOAI_Compatible_Row_Results"] = (
+            ordered_output_items if ordered_output_items else None
+        )
+        return red_team_result
+    def _build_output_item(
+        self,
+        conversation: Dict[str, Any],
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+        conversation_key: str,
+        conversation_index: int,
+    ) -> Dict[str, Any]:
+        """Construct an output item entry for a single conversation."""
+        created_time = self._resolve_created_time(eval_row)
+        datasource_item_id = self._resolve_datasource_item_id(eval_row, raw_conversation, conversation_index)
+        datasource_item = self._build_datasource_item(eval_row, raw_conversation, datasource_item_id)
+        sample_payload = self._build_sample_payload(conversation, raw_conversation, eval_row)
+        results = self._build_output_result(
+            conversation,
+            eval_row,
+            sample_payload=None,
+        )
+        output_item_id = self._resolve_output_item_id(
+            eval_row, datasource_item_id, conversation_key, conversation_index
+        )
+        # Status reflects whether the row processed successfully (no errors)
+        # "completed" = row processed without errors
+        # "failed" = row had errors during processing
+        # This is independent of attack_success (whether the attack succeeded)
+        status = "completed"  # Default to completed (processed) unless we detect errors
+        # Check if sample_payload is a valid dict for error checking
+        is_valid_sample = sample_payload and isinstance(sample_payload, dict)
+        # Check if there were any errors in the sample
+        if is_valid_sample and sample_payload.get("error"):
+            status = "failed"
+        # Also check conversation-level errors
+        elif conversation.get("error") or conversation.get("exception"):
+            status = "failed"
+        elif not results:
+            status = "failed"  # No results means something went wrong
+            # Add error message to sample if not already present
+            if is_valid_sample and "error" not in sample_payload:
+                sample_payload["error"] = {"message": "No evaluation results available"}
+        # Check if all results have null passed values (indicating missing evaluation data)
+        elif results and all(r.get("passed") is None for r in results if isinstance(r, dict)):
+            # Don't fail the status, but add a note to help understand the errored count
+            if is_valid_sample and "error" not in sample_payload:
+                sample_payload["error"] = {
+                    "message": "Evaluation data not available - attack simulation completed but results were not evaluated"
+                }
+        output_item: Dict[str, Any] = {
+            "object": "eval.run.output_item",
+            "id": output_item_id,
+            "created_time": created_time,
+            "status": status,
+            "sample": sample_payload,
+            "results": results,
+        }
+        if datasource_item_id is not None:
+            output_item["datasource_item_id"] = datasource_item_id
+        if datasource_item:
+            output_item["datasource_item"] = datasource_item
+        return output_item
+    def _build_sample_payload(
+        self,
+        conversation: Dict[str, Any],
+        raw_conversation: Dict[str, Any],
+        eval_row: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """Create the sample payload for an output item."""
+        conversation_payload = raw_conversation.get("conversation")
+        if isinstance(conversation_payload, dict) and "messages" in conversation_payload:
+            messages = conversation_payload.get("messages", [])
+        else:
+            messages = conversation.get("conversation", [])
+        normalized_messages: List[Dict[str, Any]] = []
+        for message in messages:
+            if not isinstance(message, dict):
+                continue
+            normalized = self._normalize_sample_message(message)
+            if not normalized:
+                continue
+            normalized_messages.append(normalized)
+        final_assistant_index: Optional[int] = None
+        for index in range(len(normalized_messages) - 1, -1, -1):
+            if normalized_messages[index].get("role") == "assistant":
+                final_assistant_index = index
+                break
+        output_messages: List[Dict[str, Any]] = []
+        input_messages: List[Dict[str, Any]]
+        if final_assistant_index is not None:
+            output_messages = [normalized_messages[final_assistant_index]]
+            input_messages = normalized_messages[:final_assistant_index]
+        else:
+            input_messages = normalized_messages
+        sample_payload: Dict[str, Any] = {
+            "object": "eval.run.output_item.sample",
+            "input": input_messages,
+            "output": output_messages,
+        }
+        # Extract token usage from raw_conversation messages (from callback target only)
+        conversation_payload = raw_conversation.get("conversation")
+        if isinstance(conversation_payload, dict) and "messages" in conversation_payload:
+            messages_list = conversation_payload.get("messages", [])
+            # Look for token_usage in the assistant (last) message
+            for message in reversed(messages_list):
+                if isinstance(message, dict) and message.get("role") == "assistant":
+                    token_usage_from_msg = message.get("token_usage")
+                    if token_usage_from_msg and isinstance(token_usage_from_msg, dict):
+                        # Use callback format directly (already has prompt_tokens, completion_tokens, total_tokens, model_name, etc.)
+                        usage_dict = {}
+                        if "model_name" in token_usage_from_msg:
+                            usage_dict["model_name"] = token_usage_from_msg["model_name"]
+                        if "prompt_tokens" in token_usage_from_msg:
+                            usage_dict["prompt_tokens"] = token_usage_from_msg["prompt_tokens"]
+                        if "completion_tokens" in token_usage_from_msg:
+                            usage_dict["completion_tokens"] = token_usage_from_msg["completion_tokens"]
+                        if "total_tokens" in token_usage_from_msg:
+                            usage_dict["total_tokens"] = token_usage_from_msg["total_tokens"]
+                        if "cached_tokens" in token_usage_from_msg:
+                            usage_dict["cached_tokens"] = token_usage_from_msg["cached_tokens"]
+                        if usage_dict:
+                            sample_payload["usage"] = usage_dict
+                            break
+        # Exclude risk_sub_type and _eval_run_output_item from metadata
+        metadata = {
+            key: value
+            for key, value in raw_conversation.items()
+            if key not in {"conversation", "risk_sub_type", "_eval_run_output_item"} and not self._is_missing(value)
+        }
+        if metadata:
+            sample_payload["metadata"] = metadata
+        # Add error information if present in conversation or raw_conversation
+        error_info = conversation.get("error") or raw_conversation.get("error")
+        exception_info = conversation.get("exception")
+        if error_info or exception_info:
+            if error_info:
+                if isinstance(error_info, dict):
+                    sample_payload["error"] = error_info
+                else:
+                    sample_payload["error"] = {"message": str(error_info)}
+            # Add exception information if present
+            if exception_info:
+                if "error" not in sample_payload:
+                    sample_payload["error"] = {}
+                # Add exception as a string in the error object
+                if isinstance(exception_info, Exception):
+                    sample_payload["error"]["exception"] = f"{type(exception_info).__name__}: {str(exception_info)}"
+                elif isinstance(exception_info, dict):
+                    sample_payload["error"]["exception"] = exception_info
+                else:
+                    sample_payload["error"]["exception"] = str(exception_info)
+        return sample_payload
+    @staticmethod
+    def _normalize_sample_message(message: Dict[str, Any]) -> Dict[str, Any]:
+        """Return a shallow copy of a message limited to supported fields."""
+        allowed_keys = {"role", "content", "name"}
+        normalized: Dict[str, Any] = {}
+        for key, value in message.items():
+            if key not in allowed_keys or value is None:
+                continue
+            normalized[key] = value
+        # Only include tool_calls for assistant role messages
+        if message.get("role") == "assistant" and "tool_calls" in message:
+            tool_calls_value = message["tool_calls"]
+            if isinstance(tool_calls_value, list):
+                normalized["tool_calls"] = [call for call in tool_calls_value if isinstance(call, dict)]
+        return normalized
+    @staticmethod
+    def _clean_attack_detail_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Clean messages for attack_details in old format files.
+        Removes context field and only includes tool_calls in assistant messages.
+        """
+        cleaned_messages = []
+        for message in messages:
+            if not isinstance(message, dict):
+                continue
+            cleaned = {}
+            # Always include role and content
+            if "role" in message:
+                cleaned["role"] = message["role"]
+            if "content" in message:
+                cleaned["content"] = message["content"]
+            if "name" in message:
+                cleaned["name"] = message["name"]
+            # Only include tool_calls for assistant messages
+            if message.get("role") == "assistant" and "tool_calls" in message:
+                tool_calls_value = message["tool_calls"]
+                if isinstance(tool_calls_value, list):
+                    cleaned["tool_calls"] = [call for call in tool_calls_value if isinstance(call, dict)]
+            # Do NOT include context field in attack_details
+            if cleaned:
+                cleaned_messages.append(cleaned)
+        return cleaned_messages
+    def _build_datasource_item(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+        datasource_item_id: Optional[str],
+    ) -> Optional[Dict[str, Any]]:
+        """Create datasource item metadata if available."""
+        return None
+    def _build_output_result(
+        self,
+        conversation: Dict[str, Any],
+        eval_row: Optional[Dict[str, Any]],
+        sample_payload: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """Assemble result entries for each risk category."""
+        results: List[Dict[str, Any]] = []
+        risk_assessment = conversation.get("risk_assessment") or {}
+        attack_threshold = conversation.get("attack_success_threshold")
+        for risk in self.risk_categories:
+            risk_value = risk.value
+            passed: Optional[bool] = None
+            score: Optional[Union[int, float]] = None
+            threshold: Optional[Union[int, float]] = None
+            reason: Optional[str] = None
+            severity_label: Optional[str] = None
+            if isinstance(eval_row, dict):
+                # For red teaming: passed reflects ASR semantics
+                # passed=True means attack unsuccessful (system defended)
+                # passed=False means attack successful (system compromised)
+                result_value = eval_row.get(f"outputs.{risk_value}.{risk_value}_result")
+                if result_value is not None:
+                    # result_value is typically "pass" or "fail" from evaluator
+                    # "fail" from evaluator means content violated policy (attack succeeded)
+                    # "pass" from evaluator means content is safe (attack failed)
+                    passed = result_value == "pass"  # True if attack unsuccessful
+                score = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_score"))
+                threshold = self._normalize_numeric(eval_row.get(f"outputs.{risk_value}.{risk_value}_threshold"))
+                reason = eval_row.get(f"outputs.{risk_value}.{risk_value}_reason")
+            assessment = risk_assessment.get(risk_value) if risk_assessment else None
+            if isinstance(assessment, dict):
+                severity_label = assessment.get("severity_label")
+                if not reason:
+                    reason = assessment.get("reason")
+            properties: Dict[str, Any] = {}
+            attack_technique = conversation.get("attack_technique")
+            attack_complexity = conversation.get("attack_complexity")
+            attack_success = conversation.get("attack_success")
+            risk_sub_type = conversation.get("risk_sub_type")
+            if attack_technique is not None:
+                properties["attack_technique"] = attack_technique
+            if attack_complexity is not None:
+                properties["attack_complexity"] = attack_complexity
+            if attack_success is not None:
+                properties["attack_success"] = attack_success
+            if risk_sub_type is not None:
+                properties["risk_sub_type"] = risk_sub_type
+            # Extract additional properties from _eval_run_output_item if available
+            if isinstance(eval_row, dict):
+                eval_run_output_item = eval_row.get("_eval_run_output_item")
+                if eval_run_output_item and isinstance(eval_run_output_item, dict):
+                    results_list = eval_run_output_item.get("results", [])
+                    for result_item in results_list:
+                        if isinstance(result_item, dict):
+                            result_name = result_item.get("name") or result_item.get("metric")
+                            if result_name == risk_value:
+                                item_properties = result_item.get("properties", {})
+                                if isinstance(item_properties, dict):
+                                    # Don't include scoreProperties or outcome in output - only use internally
+                                    # But DO include metrics for token usage aggregation
+                                    metrics = item_properties.get("metrics")
+                                    if metrics:
+                                        properties["metrics"] = metrics
+                                    # Include reasoning if present and not already set as reason
+                                    reasoning = item_properties.get("reasoning")
+                                    if reasoning and not reason:
+                                        reason = reasoning
+                                break
+            if (
+                passed is None
+                and score is None
+                and threshold is None
+                and not reason
+                and risk_value != conversation.get("risk_category")
+            ):
+                continue
+            if threshold is None and attack_threshold is not None and risk_value == conversation.get("risk_category"):
+                threshold = attack_threshold
+            # Fallback: If evaluation didn't run (passed is None) but we have attack_success from simulation,
+            # use it to set passed. This handles cases where attack simulation completed but evaluation failed.
+            # If passed was already set from result_value (line 695), this won't override it.
+            # passed=True means attack unsuccessful (system defended)
+            # passed=False means attack successful (system compromised)
+            if passed is None and attack_success is not None and risk_value == conversation.get("risk_category"):
+                passed = not attack_success  # Invert: attack_success=True means passed=False
+            result_entry: Dict[str, Any] = {
+                "object": "eval.run.output_item.result",
+                "type": "azure_ai_evaluator" if isinstance(eval_row, dict) else "azure_ai_red_team",
+                "name": risk_value,
+                "metric": risk_value,
+                "passed": passed,
+                "label": "pass" if passed is True else ("fail" if passed is False else None),
+                "score": score,
+                "threshold": threshold,
+                "reason": reason,
+            }
+            if properties:
+                result_entry["properties"] = properties
+            results.append(result_entry)
+        if not results:
+            risk_value = conversation.get("risk_category")
+            properties: Dict[str, Any] = {}
+            attack_technique = conversation.get("attack_technique")
+            attack_complexity = conversation.get("attack_complexity")
+            attack_success = conversation.get("attack_success")
+            risk_sub_type = conversation.get("risk_sub_type")
+            if attack_technique is not None:
+                properties["attack_technique"] = attack_technique
+            if attack_complexity is not None:
+                properties["attack_complexity"] = attack_complexity
+            if attack_success is not None:
+                properties["attack_success"] = attack_success
+            if risk_sub_type is not None:
+                properties["risk_sub_type"] = risk_sub_type
+            assessment = risk_assessment.get(risk_value) if risk_assessment else None
+            fallback_reason: Optional[str] = None
+            if isinstance(assessment, dict):
+                fallback_reason = assessment.get("reason")
+            fallback_result: Dict[str, Any] = {
+                "object": "eval.run.output_item.result",
+                "type": "azure_ai_red_team",
+                "name": risk_value,
+                "metric": risk_value,
+                "passed": None,
+                "label": None,
+                "score": None,
+                "threshold": attack_threshold,
+                "reason": fallback_reason,
+            }
+            if properties:
+                fallback_result["properties"] = properties
+            results.append(fallback_result)
+        return results
+    def _extract_input_data(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Extract input data from evaluation rows or conversation payload."""
+        input_data: Dict[str, Any] = {}
+        if isinstance(eval_row, dict):
+            for key, value in eval_row.items():
+                if key.startswith("inputs."):
+                    path = key.split(".")[1:]
+                    self._assign_nested_value(input_data, path, value)
+        if not input_data:
+            for key, value in raw_conversation.items():
+                if key == "conversation" or value is None:
+                    continue
+                input_data[key] = value
+        return input_data
+    @staticmethod
+    def _assign_nested_value(container: Dict[str, Any], path: List[str], value: Any) -> None:
+        current = container
+        for part in path[:-1]:
+            current = current.setdefault(part, {})
+        current[path[-1]] = value
+    def _resolve_output_item_id(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        datasource_item_id: Optional[str],
+        conversation_key: str,
+        conversation_index: int,
+    ) -> str:
+        if isinstance(eval_row, dict):
+            for candidate_key in ["id", "output_item_id", "datasource_item_id"]:
+                candidate_value = eval_row.get(candidate_key)
+                if candidate_value:
+                    return str(candidate_value)
+        if datasource_item_id:
+            return datasource_item_id
+        return str(uuid.uuid4())
+    def _resolve_datasource_item_id(
+        self,
+        eval_row: Optional[Dict[str, Any]],
+        raw_conversation: Dict[str, Any],
+        conversation_index: int,
+    ) -> Optional[str]:
+        return None
+    def _resolve_created_time(self, eval_row: Optional[Dict[str, Any]]) -> int:
+        if isinstance(eval_row, dict):
+            for key in ["created_time", "created_at", "timestamp"]:
+                value = eval_row.get(key)
+                if value is None:
+                    continue
+                if isinstance(value, (int, float)):
+                    return int(value)
+                if isinstance(value, str):
+                    try:
+                        return int(datetime.fromisoformat(value).timestamp())
+                    except ValueError:
+                        continue
+        return int(datetime.utcnow().timestamp())
+    def _normalize_numeric(self, value: Any) -> Optional[Union[int, float]]:
+        if value is None:
+            return None
+        if isinstance(value, (int, float)):
+            if isinstance(value, float) and math.isnan(value):
+                return None
+            return value
+        try:
+            if pd.isna(value):
+                return None
+        except Exception:
+            pass
+        if isinstance(value, str):
+            stripped = value.strip()
+            if not stripped:
+                return None
+            try:
+                if "." in stripped:
+                    return float(stripped)
+                return int(stripped)
+            except ValueError:
+                return None
+        return None
+    def _is_missing(self, value: Any) -> bool:
+        if value is None:
+            return True
+        try:
+            return pd.isna(value)
+        except Exception:
+            return False
+    def _create_default_scorecard(self, conversations: List, complexity_levels: List, converters: List) -> tuple:
+        """Create a default scorecard when no evaluation results are available."""
+        scorecard = {
+            "risk_category_summary": [
+                {
+                    "overall_asr": 0.0,
+                    "overall_total": len(conversations),
+                    "overall_successful_attacks": 0,
+                }
+            ],
+            "attack_technique_summary": [
+                {
+                    "overall_asr": 0.0,
+                    "overall_total": len(conversations),
+                    "overall_successful_attacks": 0,
+                }
+            ],
+            "joint_risk_attack_summary": [],
+            "detailed_joint_risk_attack_asr": {},
+        }
+        # Create basic parameters
+        attack_objective_generated_from: Dict[str, Any] = {
+            "application_scenario": self.application_scenario,
+            "risk_categories": [risk.value for risk in self.risk_categories],
+            "policy_document": "",
+        }
+        redteaming_parameters = {
+            "attack_objective_generated_from": attack_objective_generated_from,
+            "attack_complexity": (list(set(complexity_levels)) if complexity_levels else ["baseline", "easy"]),
+            "techniques_used": {},
+            "attack_success_thresholds": self._format_thresholds_for_output(),
+        }
+        for complexity in set(complexity_levels) if complexity_levels else ["baseline", "easy"]:
+            complexity_converters = [
+                conv
+                for i, conv in enumerate(converters)
+                if i < len(complexity_levels) and complexity_levels[i] == complexity
+            ]
+            redteaming_parameters["techniques_used"][complexity] = (
+                list(set(complexity_converters)) if complexity_converters else []
+            )
+        return scorecard, redteaming_parameters
+    def _create_detailed_scorecard(self, results_df: pd.DataFrame, complexity_levels: List, converters: List) -> tuple:
+        """Create a detailed scorecard with evaluation results."""
+        # Calculate risk category summaries
+        risk_category_groups = results_df.groupby("risk_category")
+        risk_category_summary = {}
+        # Overall metrics across all categories
+        try:
+            overall_asr = (
+                round(
+                    list_mean_nan_safe(results_df["attack_success"].tolist()) * 100,
+                    2,
+                )
+                if "attack_success" in results_df.columns
+                else 0.0
+            )
+        except:
+            self.logger.debug("All values in overall attack success array were None or NaN, setting ASR to NaN")
+            overall_asr = math.nan
+        overall_total = len(results_df)
+        overall_successful_attacks = (
+            sum([s for s in results_df["attack_success"].tolist() if not is_none_or_nan(s)])
+            if "attack_success" in results_df.columns
+            else 0
+        )
+        risk_category_summary.update(
+            {
+                "overall_asr": overall_asr,
+                "overall_total": overall_total,
+                "overall_successful_attacks": int(overall_successful_attacks),
+            }
+        )
+        # Per-risk category metrics
+        for risk, group in risk_category_groups:
+            try:
+                asr = (
+                    round(
+                        list_mean_nan_safe(group["attack_success"].tolist()) * 100,
+                        2,
+                    )
+                    if "attack_success" in group.columns
+                    else 0.0
+                )
+            except:
+                self.logger.debug(f"All values in attack success array for {risk} were None or NaN, setting ASR to NaN")
+                asr = math.nan
+            total = len(group)
+            successful_attacks = (
+                sum([s for s in group["attack_success"].tolist() if not is_none_or_nan(s)])
+                if "attack_success" in group.columns
+                else 0
+            )
+            risk_category_summary.update(
+                {
+                    f"{risk}_asr": asr,
+                    f"{risk}_total": total,
+                    f"{risk}_successful_attacks": int(successful_attacks),
+                }
+            )
+        # Calculate attack technique summaries by complexity level
+        baseline_mask = results_df["complexity_level"] == "baseline"
+        easy_mask = results_df["complexity_level"] == "easy"
+        moderate_mask = results_df["complexity_level"] == "moderate"
+        difficult_mask = results_df["complexity_level"] == "difficult"
+        attack_technique_summary_dict = {}
+        # Process each complexity level
+        for complexity, mask in [
+            ("baseline", baseline_mask),
+            ("easy", easy_mask),
+            ("moderate", moderate_mask),
+            ("difficult", difficult_mask),
+        ]:
+            complexity_df = results_df[mask]
+            if not complexity_df.empty:
+                try:
+                    asr = (
+                        round(
+                            list_mean_nan_safe(complexity_df["attack_success"].tolist()) * 100,
+                            2,
+                        )
+                        if "attack_success" in complexity_df.columns
+                        else 0.0
+                    )
+                except:
+                    self.logger.debug(
+                        f"All values in {complexity} attack success array were None or NaN, setting ASR to NaN"
+                    )
+                    asr = math.nan
+                attack_technique_summary_dict.update(
+                    {
+                        f"{complexity}_asr": asr,
+                        f"{complexity}_total": len(complexity_df),
+                        f"{complexity}_successful_attacks": (
+                            sum([s for s in complexity_df["attack_success"].tolist() if not is_none_or_nan(s)])
+                            if "attack_success" in complexity_df.columns
+                            else 0
+                        ),
+                    }
+                )
+        # Overall metrics
+        attack_technique_summary_dict.update(
+            {
+                "overall_asr": overall_asr,
+                "overall_total": overall_total,
+                "overall_successful_attacks": int(overall_successful_attacks),
+            }
+        )
+        attack_technique_summary = [attack_technique_summary_dict]
+        # Create joint risk attack summary and detailed ASR
+        joint_risk_attack_summary, detailed_joint_risk_attack_asr = self._calculate_joint_summaries(results_df)
+        # Compile the scorecard
+        scorecard = {
+            "risk_category_summary": [risk_category_summary],
+            "attack_technique_summary": attack_technique_summary,
+            "joint_risk_attack_summary": joint_risk_attack_summary,
+            "detailed_joint_risk_attack_asr": detailed_joint_risk_attack_asr,
+        }
+        # Create redteaming parameters
+        unique_complexities = sorted([c for c in results_df["complexity_level"].unique() if c != "baseline"])
+        attack_objective_generated_from = {
+            "application_scenario": self.application_scenario,
+            "risk_categories": [risk.value for risk in self.risk_categories],
+            "policy_document": "",
+        }
+        redteaming_parameters = {
+            "attack_objective_generated_from": attack_objective_generated_from,
+            "attack_complexity": [c.capitalize() for c in unique_complexities],
+            "techniques_used": {},
+            "attack_success_thresholds": self._format_thresholds_for_output(),
+        }
+        # Populate techniques used by complexity level
+        for complexity in unique_complexities:
+            complexity_mask = results_df["complexity_level"] == complexity
+            complexity_df = results_df[complexity_mask]
+            if not complexity_df.empty:
+                complexity_converters = complexity_df["converter"].unique().tolist()
+                redteaming_parameters["techniques_used"][complexity] = complexity_converters
+        return scorecard, redteaming_parameters
+    def _calculate_joint_summaries(self, results_df: pd.DataFrame) -> tuple:
+        """Calculate joint risk attack summary and detailed ASR."""
+        joint_risk_attack_summary = []
+        unique_risks = results_df["risk_category"].unique()
+        baseline_mask = results_df["complexity_level"] == "baseline"
+        easy_mask = results_df["complexity_level"] == "easy"
+        moderate_mask = results_df["complexity_level"] == "moderate"
+        difficult_mask = results_df["complexity_level"] == "difficult"
+        for risk in unique_risks:
+            risk_key = risk.replace("-", "_")
+            risk_mask = results_df["risk_category"] == risk
+            joint_risk_dict = {"risk_category": risk_key}
+            # Calculate ASR for each complexity level
+            for complexity, mask in [
+                ("baseline", baseline_mask),
+                ("easy_complexity", easy_mask),
+                ("moderate_complexity", moderate_mask),
+                ("difficult_complexity", difficult_mask),
+            ]:
+                complexity_risk_df = results_df[risk_mask & mask]
+                if not complexity_risk_df.empty:
+                    try:
+                        joint_risk_dict[f"{complexity}_asr"] = (
+                            round(
+                                list_mean_nan_safe(complexity_risk_df["attack_success"].tolist()) * 100,
+                                2,
+                            )
+                            if "attack_success" in complexity_risk_df.columns
+                            else 0.0
+                        )
+                    except:
+                        self.logger.debug(
+                            f"All values in {complexity} attack success array for {risk_key} were None or NaN, setting ASR to NaN"
+                        )
+                        joint_risk_dict[f"{complexity}_asr"] = math.nan
+            joint_risk_attack_summary.append(joint_risk_dict)
+        # Calculate detailed joint risk attack ASR
+        detailed_joint_risk_attack_asr = {}
+        unique_complexities = sorted([c for c in results_df["complexity_level"].unique() if c != "baseline"])
+        for complexity in unique_complexities:
+            complexity_mask = results_df["complexity_level"] == complexity
+            if results_df[complexity_mask].empty:
+                continue
+            detailed_joint_risk_attack_asr[complexity] = {}
+            for risk in unique_risks:
+                risk_key = risk.replace("-", "_")
+                risk_mask = results_df["risk_category"] == risk
+                detailed_joint_risk_attack_asr[complexity][risk_key] = {}
+                # Group by converter within this complexity and risk
+                complexity_risk_df = results_df[complexity_mask & risk_mask]
+                if complexity_risk_df.empty:
+                    continue
+                converter_groups = complexity_risk_df.groupby("converter")
+                for converter_name, converter_group in converter_groups:
+                    try:
+                        asr_value = (
+                            round(
+                                list_mean_nan_safe(converter_group["attack_success"].tolist()) * 100,
+                                2,
+                            )
+                            if "attack_success" in converter_group.columns
+                            else 0.0
+                        )
+                    except:
+                        self.logger.debug(
+                            f"All values in attack success array for {converter_name} in {complexity}/{risk_key} were None or NaN, setting ASR to NaN"
+                        )
+                        asr_value = math.nan
+                    detailed_joint_risk_attack_asr[complexity][risk_key][f"{converter_name}_ASR"] = asr_value
+        return joint_risk_attack_summary, detailed_joint_risk_attack_asr
+    def _format_thresholds_for_output(self) -> Dict[str, Any]:
+        """Format attack success thresholds for inclusion in result parameters."""
+        formatted_thresholds = {}
+        # If custom thresholds are specified, include them
+        if self.attack_success_thresholds:
+            for key, value in self.attack_success_thresholds.items():
+                # Skip internal keys
+                if key.startswith("_"):
+                    continue
+                # Convert RiskCategory enum to string if needed
+                key_str = key.value if hasattr(key, "value") else str(key)
+                formatted_thresholds[key_str] = value
+        # If we have risk categories configured and evaluations were performed,
+        # include the default thresholds for those categories
+        if hasattr(self, "risk_categories") and self.risk_categories:
+            for risk_category in self.risk_categories:
+                risk_cat_value = risk_category.value
+                # Only add default if not already present as a custom threshold
+                if risk_cat_value not in formatted_thresholds:
+                    # Get pattern-specific default threshold for this evaluator
+                    formatted_thresholds[risk_cat_value] = get_default_threshold_for_evaluator(risk_cat_value)
+        return formatted_thresholds
+    @staticmethod
+    def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
+        """Aggregate run-level pass/fail counts from individual output items.
+        Counts reflect attack success rate (ASR) semantics:
+        - passed: attacks that were unsuccessful (system defended successfully)
+        - failed: attacks that were successful (system was compromised)
+        - errored: rows that failed to process due to errors
+        """
+        total = len(output_items)
+        passed = failed = errored = 0
+        for item in output_items:
+            # Check if this item errored (has error in sample)
+            # Note: _build_output_item adds error to sample when there are no results,
+            # so this check catches both explicit errors and missing results cases
+            sample = item.get("sample", {})
+            if isinstance(sample, dict) and sample.get("error"):
+                errored += 1
+                continue
+            # Look at results to determine if attack succeeded or failed
+            # This condition should rarely be true since _build_output_item adds error to sample
+            # when results are missing, but we check defensively
+            results = item.get("results", [])
+            if not results:
+                errored += 1
+                continue
+            # Count based on passed field from results (ASR semantics)
+            # passed=True means attack unsuccessful, passed=False means attack successful
+            has_passed = False
+            has_failed = False
+            for result in results:
+                if isinstance(result, dict):
+                    result_passed = result.get("passed")
+                    if result_passed is True:
+                        has_passed = True
+                    elif result_passed is False:
+                        has_failed = True
+            # If any result shows attack succeeded (passed=False), count as failed
+            # Otherwise if any result shows attack failed (passed=True), count as passed
+            if has_failed:
+                failed += 1
+            elif has_passed:
+                passed += 1
+            else:
+                errored += 1
+        return {
+            "total": total,
+            "passed": passed,
+            "failed": failed,
+            "errored": errored,
+        }
+    @staticmethod
+    def _compute_per_model_usage(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Compute aggregated token usage across all output items.
+        :param output_items: List of output items
+        :return: List containing model usage statistics grouped by model_name
+        """
+        # Track usage by model name
+        model_usage: Dict[str, Dict[str, int]] = {}
+        for item in output_items:
+            if not isinstance(item, dict):
+                continue
+            # Aggregate usage from sample (callback target)
+            sample = item.get("sample")
+            if isinstance(sample, dict):
+                usage = sample.get("usage")
+                if isinstance(usage, dict):
+                    # Get model name from usage if present, otherwise use default
+                    model_name = usage.get("model_name", "azure_ai_system_model")
+                    if model_name not in model_usage:
+                        model_usage[model_name] = {
+                            "invocation_count": 0,
+                            "prompt_tokens": 0,
+                            "completion_tokens": 0,
+                            "total_tokens": 0,
+                            "cached_tokens": 0,
+                        }
+                    model_usage[model_name]["invocation_count"] += 1
+                    # Convert to int to handle cases where values come as strings
+                    model_usage[model_name]["prompt_tokens"] += int(usage.get("prompt_tokens", 0) or 0)
+                    model_usage[model_name]["completion_tokens"] += int(usage.get("completion_tokens", 0) or 0)
+                    model_usage[model_name]["total_tokens"] += int(usage.get("total_tokens", 0) or 0)
+                    model_usage[model_name]["cached_tokens"] += int(usage.get("cached_tokens", 0) or 0)
+            # Always aggregate evaluator usage from results (separate from target usage)
+            results_list = item.get("results", [])
+            for result in results_list:
+                if not isinstance(result, dict):
+                    continue
+                properties = result.get("properties", {})
+                if not isinstance(properties, dict):
+                    continue
+                metrics = properties.get("metrics", {})
+                if isinstance(metrics, dict) and metrics:
+                    # Evaluator usage uses azure_ai_system_model
+                    model_name = "azure_ai_system_model"
+                    if model_name not in model_usage:
+                        model_usage[model_name] = {
+                            "invocation_count": 0,
+                            "prompt_tokens": 0,
+                            "completion_tokens": 0,
+                            "total_tokens": 0,
+                            "cached_tokens": 0,
+                        }
+                    prompt_tokens = metrics.get("promptTokens", 0)
+                    completion_tokens = metrics.get("completionTokens", 0)
+                    if prompt_tokens or completion_tokens:
+                        model_usage[model_name]["invocation_count"] += 1
+                        # Convert to int to handle cases where values come as strings
+                        model_usage[model_name]["prompt_tokens"] += int(prompt_tokens or 0)
+                        model_usage[model_name]["completion_tokens"] += int(completion_tokens or 0)
+                        model_usage[model_name]["total_tokens"] += int(prompt_tokens or 0) + int(completion_tokens or 0)
+        if not model_usage:
+            return []
+        # Convert to list format with model_name as a field
+        return [
+            {
+                "model_name": model_name,
+                **stats,
+            }
+            for model_name, stats in sorted(model_usage.items())
+        ]
+    @staticmethod
+    def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy).
+        Uses ASR semantics:
+        - passed: attack was unsuccessful (system defended)
+        - failed: attack was successful (system compromised)
+        """
+        # Track by risk category (testing_criteria)
+        criteria: Dict[str, Dict[str, int]] = {}
+        # Track by attack strategy
+        strategy_criteria: Dict[str, Dict[str, int]] = {}
+        for item in output_items:
+            for result in item.get("results", []):
+                if not isinstance(result, dict):
+                    continue
+                name = result.get("name")
+                if not name:
+                    continue
+                passed_value = result.get("passed")
+                if passed_value is None:
+                    continue
+                # Track by risk category
+                # passed_value=True means attack unsuccessful (count as passed)
+                # passed_value=False means attack successful (count as failed)
+                bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
+                if passed_value:
+                    bucket["passed"] += 1
+                else:
+                    bucket["failed"] += 1
+                # Track by attack strategy from properties
+                properties = result.get("properties", {})
+                if isinstance(properties, dict):
+                    attack_technique = properties.get("attack_technique")
+                    if attack_technique:
+                        strategy_bucket = strategy_criteria.setdefault(
+                            str(attack_technique), {"passed": 0, "failed": 0}
+                        )
+                        if passed_value:
+                            strategy_bucket["passed"] += 1
+                        else:
+                            strategy_bucket["failed"] += 1
+        # Build results list with risk categories
+        results = [
+            {
+                "testing_criteria": criteria_name,
+                "passed": counts["passed"],
+                "failed": counts["failed"],
+            }
+            for criteria_name, counts in sorted(criteria.items())
+        ]
+        # Add attack strategy summaries
+        for strategy_name, counts in sorted(strategy_criteria.items()):
+            results.append(
+                {
+                    "testing_criteria": strategy_name,
+                    "attack_strategy": strategy_name,
+                    "passed": counts["passed"],
+                    "failed": counts["failed"],
+                }
+            )
+        return results
+    @staticmethod
+    def _build_data_source_section(parameters: Dict[str, Any], red_team_info: Optional[Dict]) -> Dict[str, Any]:
+        """Build the data_source portion of the run payload for red-team scans."""
+        attack_strategies: List[str] = []
+        if isinstance(red_team_info, dict):
+            attack_strategies = sorted(str(strategy) for strategy in red_team_info.keys())
+        item_generation_params: Dict[str, Any] = {"type": "red_team"}
+        if attack_strategies:
+            item_generation_params["attack_strategies"] = attack_strategies
+        # Attempt to infer turns from parameters if available
+        num_turns = parameters.get("max_turns") if isinstance(parameters, dict) else None
+        if isinstance(num_turns, int) and num_turns > 0:
+            item_generation_params["num_turns"] = num_turns
+        data_source: Dict[str, Any] = {"type": "azure_ai_red_team", "target": {}}
+        if item_generation_params:
+            data_source["item_generation_params"] = item_generation_params
+        return data_source
+    def _determine_run_status(
+        self,
+        scan_result: Dict[str, Any],
+        red_team_info: Optional[Dict],
+        output_items: List[Dict[str, Any]],
+    ) -> str:
+        """Determine the run-level status based on red team info status values."""
+        # Check if any tasks are still incomplete/failed
+        if isinstance(red_team_info, dict):
+            for risk_data in red_team_info.values():
+                if not isinstance(risk_data, dict):
+                    continue
+                for details in risk_data.values():
+                    if not isinstance(details, dict):
+                        continue
+                    status = details.get("status", "").lower()
+                    if status in ("incomplete", "failed", "timeout"):
+                        return "failed"
+                    elif status in ("running", "pending"):
+                        return "in_progress"
+        return "completed"
+    def _build_results_payload(
+        self,
+        redteam_result: RedTeamResult,
+        output_items: List[Dict[str, Any]],
+        eval_run: Optional[Any] = None,
+        red_team_info: Optional[Dict] = None,
+        scan_name: Optional[str] = None,
+        run_id_override: Optional[str] = None,
+        eval_id_override: Optional[str] = None,
+        created_at_override: Optional[int] = None,
+    ) -> RedTeamRun:
+        """Assemble the new structure for results.json with eval.run format.
+        :param redteam_result: The red team result containing scan data
+        :param output_items: List of output items containing results for each conversation
+        :param eval_run: The MLFlow run object (optional)
+        :param red_team_info: Red team tracking information (optional)
+        :param scan_name: Name of the scan (optional)
+        :param run_id_override: Override for run ID (optional)
+        :param eval_id_override: Override for eval ID (optional)
+        :param created_at_override: Override for created timestamp (optional)
+        :return: RedTeamRun payload
+        """
+        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
+        scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
+        parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
+        run_id = run_id_override
+        eval_id = eval_id_override
+        run_name: Optional[str] = None
+        created_at = created_at_override
+        if eval_run is not None:
+            run_info = getattr(eval_run, "info", None)
+            if run_id is None:
+                candidate_run_id = (
+                    getattr(run_info, "run_id", None)
+                    or getattr(eval_run, "run_id", None)
+                    or getattr(eval_run, "id", None)
+                )
+                if candidate_run_id is not None:
+                    run_id = str(candidate_run_id)
+            if eval_id is None:
+                candidate_eval_id = (
+                    getattr(run_info, "experiment_id", None)
+                    or getattr(eval_run, "experiment_id", None)
+                    or getattr(eval_run, "eval_id", None)
+                )
+                if candidate_eval_id is not None:
+                    eval_id = str(candidate_eval_id)
+            if run_name is None:
+                candidate_run_name = (
+                    getattr(run_info, "run_name", None)
+                    or getattr(eval_run, "run_name", None)
+                    or getattr(eval_run, "display_name", None)
+                    or getattr(eval_run, "name", None)
+                )
+                if candidate_run_name is not None:
+                    run_name = str(candidate_run_name)
+            if created_at is None:
+                raw_created = (
+                    getattr(run_info, "created_time", None)
+                    or getattr(eval_run, "created_at", None)
+                    or getattr(eval_run, "created_time", None)
+                )
+                if isinstance(raw_created, datetime):
+                    created_at = int(raw_created.timestamp())
+                elif isinstance(raw_created, (int, float)):
+                    created_at = int(raw_created)
+                elif isinstance(raw_created, str):
+                    try:
+                        created_at = int(float(raw_created))
+                    except ValueError:
+                        created_at = None
+        if run_id is None:
+            run_id = str(uuid.uuid4())
+        if eval_id is None:
+            eval_id = str(uuid.uuid4())
+        if created_at is None:
+            created_at = int(datetime.now().timestamp())
+        if run_name is None:
+            run_name = scan_name or f"redteam-run-{run_id[:8]}"
+        result_count = self._compute_result_count(output_items)
+        per_testing_results = self._compute_per_testing_criteria(output_items)
+        data_source = self._build_data_source_section(parameters, red_team_info)
+        status = self._determine_run_status(scan_result, red_team_info, output_items)
+        per_model_usage = self._compute_per_model_usage(output_items)
+        list_wrapper: OutputItemsList = {
+            "object": "list",
+            "data": output_items,
+        }
+        run_payload: RedTeamRun = {
+            "object": "eval.run",
+            "id": run_id,
+            "eval_id": eval_id,
+            "created_at": created_at,
+            "status": status,
+            "name": run_name,
+            "report_url": scan_result.get("studio_url") or self.ai_studio_url,
+            "data_source": data_source,
+            "metadata": {},
+            "result_counts": result_count,
+            "per_model_usage": per_model_usage,
+            "per_testing_criteria_results": per_testing_results,
+            "output_items": list_wrapper,
+        }
+        return run_payload
+    def get_app_insights_redacted_results(self, results: List[Dict]) -> List[Dict]:
+        """
+        Creates a redacted copy of results specifically for App Insights logging.
+        User messages are redacted for sensitive risk categories to prevent logging
+        of adversarial prompts.
+        Args:
+            results: List of evaluation result dictionaries
+        Returns:
+            A deep copy of results with user messages redacted for applicable risk categories
+        """
+        # Create a deep copy to avoid modifying the original data
+        redacted_results = copy.deepcopy(results)
+        for result in redacted_results:
+            if "results" not in result or not isinstance(result["results"], list):
+                continue
+            # Get risk category and attack technique from the first result
+            if len(result["results"]) > 0:
+                first_result = result["results"][0]
+                risk_category = first_result.get("name", "unknown")
+                # Only redact if this is a sensitive risk category
+                if self._should_redact_for_risk_category(risk_category):
+                    # Extract additional properties for redaction message
+                    attack_technique = "unknown"
+                    risk_sub_type = None
+                    if "properties" in first_result and isinstance(first_result["properties"], dict):
+                        attack_technique = first_result["properties"].get("attack_technique", "unknown")
+                        risk_sub_type = first_result["properties"].get("risk_sub_type", None)
+                    # Redact user messages in the sample input
+                    if "sample" in result and "input" in result["sample"]:
+                        sample_input = result["sample"]["input"]
+                        if isinstance(sample_input, list):
+                            for message in sample_input:
+                                if isinstance(message, dict) and message.get("role") == "user":
+                                    message["content"] = self._get_redacted_input_message(
+                                        risk_category, attack_technique, risk_sub_type
+                                    )
+        return redacted_results
+    def _should_redact_for_risk_category(self, risk_category: str) -> bool:
+        """
+        Determines if user messages should be redacted for the given risk category.
+        Args:
+            risk_category: The risk category name to check
+        Returns:
+            True if the risk category requires redaction, False otherwise
+        """
+        redaction_required_categories = {
+            EvaluationMetrics.PROHIBITED_ACTIONS,
+            EvaluationMetrics.TASK_ADHERENCE,
+            EvaluationMetrics.SENSITIVE_DATA_LEAKAGE,
+        }
+        return risk_category in redaction_required_categories
+    def _get_redacted_input_message(self, risk_category: str, attack_technique: str, risk_sub_type: str = None) -> str:
+        """
+        Generates a redacted message for adversarial prompts based on risk category and attack technique.
+        Args:
+            risk_category: The risk category of the adversarial prompt
+            attack_technique: The attack technique used
+            risk_sub_type: Optional sub-type of the risk category
+        Returns:
+            A redacted message string
+        """
+        # Convert snake_case to Title Case for readability
+        risk_category_readable = risk_category.replace("_", " ").replace("-", " ").title()
+        attack_technique_readable = attack_technique.replace("_", " ").replace("-", " ").title()
+        if risk_sub_type:
+            risk_sub_type_readable = risk_sub_type.replace("_", " ").replace("-", " ").title()
+            return f"[Redacted adversarial prompt probing for {risk_category_readable} with {risk_sub_type_readable} using {attack_technique_readable} attack strategy.]"
+        else:
+            return f"[Redacted adversarial prompt probing for {risk_category_readable} using {attack_technique_readable} attack strategy.]"