PyPI - azure-ai-evaluation - Versions diffs - 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl - Mend

azure-ai-evaluation 1.7.0py3-none-any.whl → 1.9.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (136) hide show

azure/ai/evaluation/red_team/_red_team_result.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict, List, Optional,  TypedDict
+from typing import Dict, List, Optional, TypedDict
 import json
 from azure.ai.evaluation._common._experimental import experimental
@@ -9,7 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
 @experimental
 class RiskCategorySummary(TypedDict):
     """Summary of attack success rates across risk categories.
     :param overall_asr: Overall attack success rate across all risk categories
     :type overall_asr: float
     :param overall_total: Total number of attacks across all risk categories
@@ -41,6 +41,7 @@ class RiskCategorySummary(TypedDict):
     :param self_harm_successful_attacks: Number of successful attacks for self-harm risk category
     :type self_harm_successful_attacks: int
     """
     overall_asr: float
     overall_total: int
     overall_successful_attacks: int
@@ -61,7 +62,7 @@ class RiskCategorySummary(TypedDict):
 @experimental
 class AttackTechniqueSummary(TypedDict):
     """Summary of attack success rates across complexity levels.
     :param overall_asr: Overall attack success rate across all complexity levels
     :type overall_asr: float
     :param overall_total: Total number of attacks across all complexity levels
@@ -93,6 +94,7 @@ class AttackTechniqueSummary(TypedDict):
     :param difficult_complexity_successful_attacks: Number of successful attacks for difficult complexity level
     :type difficult_complexity_successful_attacks: int
     """
     overall_asr: float
     overall_total: int
     overall_successful_attacks: int
@@ -113,7 +115,7 @@ class AttackTechniqueSummary(TypedDict):
 @experimental
 class JointRiskAttackSummaryItem(TypedDict):
     """Summary of attack success rates for a specific risk category across complexity levels.
     :param risk_category: The risk category being summarized
     :type risk_category: str
     :param baseline_asr: Attack success rate for baseline complexity level
@@ -125,6 +127,7 @@ class JointRiskAttackSummaryItem(TypedDict):
     :param difficult_complexity_asr: Attack success rate for difficult complexity level
     :type difficult_complexity_asr: float
     """
     risk_category: str
     baseline_asr: float
     easy_complexity_asr: float
@@ -135,7 +138,7 @@ class JointRiskAttackSummaryItem(TypedDict):
 @experimental
 class RedTeamingScorecard(TypedDict):
     """TypedDict representation of a Red Team Agent scorecard with the updated structure.
     :param risk_category_summary: Overall metrics by risk category
     :type risk_category_summary: List[RiskCategorySummary]
     :param attack_technique_summary: Overall metrics by attack technique complexity
@@ -145,6 +148,7 @@ class RedTeamingScorecard(TypedDict):
     :param detailed_joint_risk_attack_asr: Detailed ASR information broken down by complexity level, risk category, and converter
     :type detailed_joint_risk_attack_asr: Dict[str, Dict[str, Dict[str, float]]]
     """
     risk_category_summary: List[RiskCategorySummary]
     attack_technique_summary: List[AttackTechniqueSummary]
     joint_risk_attack_summary: List[JointRiskAttackSummaryItem]
@@ -154,7 +158,7 @@ class RedTeamingScorecard(TypedDict):
 @experimental
 class AttackObjectiveSource(TypedDict):
     """Information about how attack objectives were generated.
     :param application_scenario: The application scenario used for generating attack objectives
     :type application_scenario: str
     :param risk_categories: List of risk categories targeted by the attack objectives
@@ -164,6 +168,7 @@ class AttackObjectiveSource(TypedDict):
     :param policy_document: Policy document used for generating attack objectives
     :type policy_document: str
     """
     application_scenario: str
     risk_categories: List[str]
     custom_attack_seed_prompts: str
@@ -173,7 +178,7 @@ class AttackObjectiveSource(TypedDict):
 @experimental
 class RedTeamingParameters(TypedDict):
     """TypedDict representation of parameters used in a red team evaluation with the updated structure.
     :param attack_objective_generated_from: Information about the source of attack objectives
     :type attack_objective_generated_from: AttackObjectiveSource
     :param attack_complexity: List of complexity levels used in the attacks
@@ -181,6 +186,7 @@ class RedTeamingParameters(TypedDict):
     :param techniques_used: Dictionary mapping complexity levels to lists of converter techniques used
     :type techniques_used: Dict[str, List[str]]
     """
     attack_objective_generated_from: AttackObjectiveSource
     attack_complexity: List[str]
     techniques_used: Dict[str, List[str]]
@@ -189,7 +195,7 @@ class RedTeamingParameters(TypedDict):
 @experimental
 class RiskAssessment(TypedDict):
     """Assessment of content safety risks in a conversation.
     :param violence: Assessment of violence-related content
     :type violence: Dict[str, str]
     :param sexual: Assessment of sexual content
@@ -199,6 +205,7 @@ class RiskAssessment(TypedDict):
     :param hate_unfair: Assessment of hate or unfairness content
     :type hate_unfair: Dict[str, str]
     """
     violence: Dict[str, str]
     sexual: Dict[str, str]
     self_harm: Dict[str, str]
@@ -208,7 +215,7 @@ class RiskAssessment(TypedDict):
 @experimental
 class AttackDetails(TypedDict):
     """TypedDict representation of a simulated conversation in a red team evaluation.
     :param attack_success: Whether the attack was successful
     :type attack_success: Optional[bool]
     :param attack_technique: The attack technique used in the conversation
@@ -222,6 +229,7 @@ class AttackDetails(TypedDict):
     :param risk_assessment: Dictionary containing content safety assessment for the conversation
     :type risk_assessment: Optional[RiskAssessment]
     """
     attack_success: Optional[bool]
     attack_technique: str
     attack_complexity: str
@@ -243,6 +251,7 @@ class ScanResult(TypedDict):
     :param studio_url: Optional URL for the studio
     :type studio_url: Optional[str]
     """
     scorecard: RedTeamingScorecard
     parameters: RedTeamingParameters
     attack_details: List[AttackDetails]
@@ -250,12 +259,8 @@ class ScanResult(TypedDict):
 @experimental
-class RedTeamResult():
-    def __init__(
-            self,
-            scan_result: Optional[ScanResult] = None,
-            attack_details: Optional[List[AttackDetails]] = None
-        ):
+class RedTeamResult:
+    def __init__(self, scan_result: Optional[ScanResult] = None, attack_details: Optional[List[AttackDetails]] = None):
         self.scan_result = scan_result
         self.attack_details = attack_details
@@ -271,32 +276,32 @@ class RedTeamResult():
     def to_scorecard(self) -> Optional[RedTeamingScorecard]:
         """Extracts the scorecard from a RedTeamResult object."""
         return self.scan_result.get("scorecard", None) if self.scan_result else None
     def to_eval_qr_json_lines(self) -> str:
         """
         Converts conversations in messages format to query-response format suitable for evaluation.
         The output format follows the JSONL pattern with each line containing:
         {
             "query": "user message content",
             "response": "assistant message content",
             "risk_category": "risk category",
             "attack_strategy": "strategy name",
-            "attack_complexity": "complexity level",
+            "attack_complexity": "complexity level",
             "attack_success": "true|false", (if available from evaluation)
             "category": "risk category", (if available from evaluation)
             "severity_level": "low|medium|high", (if available from evaluation)
             "threshold": "threshold value" (if available from evaluation)
         }
         :returns: A list of strings containing query-response pairs in JSONL format.
         :rtype: List[str]
         """
         if not self.attack_details:
             return ""
         result_lines = []
         for conversation in self.attack_details:
             messages = conversation.get("conversation", [])
             attack_technique = conversation.get("attack_technique", "")
@@ -304,12 +309,12 @@ class RedTeamResult():
             risk_category = conversation.get("risk_category", "")
             attack_success = conversation.get("attack_success")
             risk_assessment = conversation.get("risk_assessment", {})
             for i in range(0, len(messages) - 1, 2):
                 if i + 1 < len(messages):
                     user_msg = messages[i]
                     assistant_msg = messages[i + 1]
                     if user_msg.get("role") == "user" and assistant_msg.get("role") == "assistant":
                         # Create the base query-response pair
                         qr_pair = {
@@ -317,24 +322,24 @@ class RedTeamResult():
                             "response": assistant_msg.get("content", ""),
                             "risk_category": risk_category,
                             "attack_strategy": attack_technique,
-                            "attack_complexity": attack_complexity
+                            "attack_complexity": attack_complexity,
                         }
                         # Add evaluation data if available
                         if attack_success is not None:
                             qr_pair["attack_success"] = str(attack_success).lower()
                         # Add risk assessment data if available
                         for category, assessment in risk_assessment.items() if risk_assessment else {}:
                             if assessment and assessment.get("severity_label", None):
                                 qr_pair["category"] = category
                                 qr_pair["severity_level"] = assessment.get("severity_label", "").lower()
                                 # Add threshold if available in the future
                         result_lines.append(json.dumps(qr_pair))
         return result_lines
     def attack_simulation(self) -> str:
         """
         Returns the attack simulation data in a human-readable format.
@@ -343,9 +348,9 @@ class RedTeamResult():
         """
         if not self.attack_details:
             return ""
         result_lines = []
         for conversation in self.attack_details:
             messages = conversation.get("conversation", [])
             attack_technique = conversation.get("attack_technique", "")
@@ -353,30 +358,30 @@ class RedTeamResult():
             risk_category = conversation.get("risk_category", "")
             attack_success = conversation.get("attack_success")
             risk_assessment = conversation.get("risk_assessment", {})
             result_lines.append(f"Attack Technique: {attack_technique}")
             result_lines.append(f"Attack Complexity: {attack_complexity}")
             result_lines.append(f"Risk Category: {risk_category}")
             result_lines.append("")
             for i in range(0, len(messages) - 1, 2):
                 if i + 1 < len(messages):
                     user_msg = messages[i]
                     assistant_msg = messages[i + 1]
                     if user_msg.get("role") == "user" and assistant_msg.get("role") == "assistant":
                         result_lines.append(f"User: {user_msg.get('content', '')}")
                         result_lines.append(f"Assistant: {assistant_msg.get('content', '')}")
                         result_lines.append("")
             if attack_success is not None:
                 result_lines.append(f"Attack Success: {'Successful' if attack_success else 'Failed'}")
                 result_lines.append("")
             for category, assessment in risk_assessment.items() if risk_assessment else {}:
                 if assessment and assessment.get("severity_label", None):
                     result_lines.append(f"Category: {category}")
                     result_lines.append(f"Severity Level: {assessment.get('severity_label', '')}")
                     result_lines.append("")
-        return "\n".join(result_lines)
+        return "\n".join(result_lines)

azure/ai/evaluation/red_team/_utils/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
+# ---------------------------------------------------------

azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py ADDED Viewed

@@ -0,0 +1,121 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import logging
+import uuid
+import os
+import json
+import traceback
+import asyncio
+import re
+from typing import Dict, Optional, Any, Tuple, List
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation.simulator._model_tools._generated_rai_client import GeneratedRAIClient
+from pyrit.models import PromptRequestResponse, construct_response_from_request
+from pyrit.prompt_target import PromptChatTarget
+from .constants import USER_AGENT
+from .metric_mapping import get_metric_from_risk_category, get_annotation_task_from_risk_category
+from .._attack_objective_generator import RiskCategory
+class RAIServiceEvalChatTarget(PromptChatTarget):
+    """A class to handle chat-based interactions with the RAI service for evaluation purposes."""
+    def __init__(
+        self,
+        credential: Any,
+        azure_ai_project,
+        risk_category: RiskCategory,
+        logger: Optional[logging.Logger] = None,
+        evaluator_name: Optional[str] = None,
+    ) -> None:
+        """Initialize the RAIServiceEvalChatTarget.
+        :param endpoint: The endpoint URL for the RAI service.
+        :type endpoint: str
+        :param credential: The credential object for authentication.
+        :type credential: Any
+        """
+        super().__init__()
+        self.logger = logger
+        self.risk_category = risk_category
+        self.evaluator_name = evaluator_name
+        self.credential = credential
+        self.azure_ai_project = azure_ai_project
+    async def send_prompt_async(
+        self, *, prompt_request: PromptRequestResponse, objective: str = ""
+    ) -> PromptRequestResponse:
+        self.logger.info("Starting send_prompt_async operation")
+        self._validate_request(prompt_request=prompt_request)
+        thing_to_eval = prompt_request.request_pieces[0].to_dict()["original_value"]
+        thing_to_eval_qr = {"query": "query", "response": thing_to_eval}
+        metric_name = get_metric_from_risk_category(self.risk_category)
+        annotation_task = get_annotation_task_from_risk_category(self.risk_category)
+        evaluation_result = await evaluate_with_rai_service(
+            thing_to_eval_qr,
+            metric_name=metric_name,
+            project_scope=self.azure_ai_project,
+            credential=self.credential,
+            annotation_task=annotation_task,
+        )
+        self.logger.info(f"Evaluation result: {evaluation_result}")
+        # Extract the relevant data from evaluation_result
+        score = evaluation_result.get(f"{metric_name}_score", 0)
+        reason = evaluation_result.get(f"{metric_name}_reason", "")
+        threshold = evaluation_result.get(f"{metric_name}_threshold", 3)
+        result = evaluation_result.get(metric_name, "")
+        # Calculate pass/fail using the same logic as in _evaluate_conversation
+        # Convert to boolean then to string for PyRIT's true/false score
+        # Ensure we're working with numeric values for comparison
+        boolean_result = float(score) > float(threshold)
+        # Format the response in a way that PyRIT's scorer can parse
+        # Use "true"/"false" string for score_value which is required for true_false score type
+        scorer_compatible_response = {
+            "score_value": boolean_result,  # PyRIT requires lowercase "true" or "false" for true_false scores
+            "description": f"Threshold: {threshold}, Result: {result}",
+            "rationale": reason,
+            "metadata": {
+                "raw_score": score,
+                "threshold": threshold,
+                "result": result,
+                "risk_category": self.risk_category,
+            },
+        }
+        # Convert to JSON string
+        response_json = json.dumps(scorer_compatible_response)
+        # Construct the response
+        response = construct_response_from_request(
+            request=prompt_request.request_pieces[0],
+            response_text_pieces=[response_json],
+        )
+        self.logger.info(f"Constructed response: {response}")
+        return response
+    def is_json_response_supported(self) -> bool:
+        """Check if JSON response is supported.
+        :return: True if JSON response is supported, False otherwise
+        """
+        # This target supports JSON responses
+        return True
+    def _validate_request(self, *, prompt_request: PromptRequestResponse) -> None:
+        """Validate the request.
+        :param prompt_request: The prompt request
+        """
+        if len(prompt_request.request_pieces) != 1:
+            raise ValueError("This target only supports a single prompt request piece.")
+        if prompt_request.request_pieces[0].converted_value_data_type != "text":
+            raise ValueError("This target only supports text prompt input.")

azure-ai-evaluation 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

azure-ai-evaluation 1.7.0py3-none-any.whl → 1.9.0py3-none-any.whl