azure-ai-evaluation 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic. Click here for more details.
- azure/ai/evaluation/__init__.py +27 -1
- azure/ai/evaluation/_azure/_models.py +6 -6
- azure/ai/evaluation/_common/constants.py +6 -2
- azure/ai/evaluation/_common/rai_service.py +39 -5
- azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
- azure/ai/evaluation/_common/raiclient/_client.py +128 -0
- azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
- azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
- azure/ai/evaluation/_common/raiclient/_version.py +9 -0
- azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
- azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
- azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
- azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
- azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
- azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
- azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
- azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/raiclient/py.typed +1 -0
- azure/ai/evaluation/_common/utils.py +23 -3
- azure/ai/evaluation/_constants.py +7 -0
- azure/ai/evaluation/_converters/__init__.py +3 -0
- azure/ai/evaluation/_converters/_ai_services.py +804 -0
- azure/ai/evaluation/_converters/_models.py +302 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
- azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +42 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +109 -64
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +3 -3
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
- azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +44 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +4 -2
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +44 -5
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +22 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
- azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
- azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +158 -0
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
- azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
- azure/ai/evaluation/_exceptions.py +5 -0
- azure/ai/evaluation/_legacy/__init__.py +3 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +21 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
- azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
- azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
- azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
- azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
- azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
- azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/red_team/_attack_objective_generator.py +195 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +45 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +74 -0
- azure/ai/evaluation/red_team/_default_converter.py +21 -0
- azure/ai/evaluation/red_team/_red_team.py +1887 -0
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +3 -0
- azure/ai/evaluation/red_team/_utils/constants.py +65 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +165 -0
- azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +192 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/METADATA +80 -15
- azure_ai_evaluation-1.5.0.dist-info/RECORD +207 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
# ---------------------------------------------------------
|
|
2
|
+
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
|
+
# ---------------------------------------------------------
|
|
4
|
+
from typing import Dict, List, Optional, TypedDict
|
|
5
|
+
import json
|
|
6
|
+
from azure.ai.evaluation._common._experimental import experimental
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@experimental
class RiskCategorySummary(TypedDict):
    """Summary of attack success rates (ASR) across risk categories.

    The entries below are the keys of the dictionary. The overall aggregate and
    each risk category expose the same three keys: the attack success rate
    (``*_asr``), the number of attacks (``*_total``) and the number of
    successful attacks (``*_successful_attacks``).

    :param overall_asr: Overall attack success rate across all risk categories
    :type overall_asr: float
    :param overall_total: Total number of attacks across all risk categories
    :type overall_total: int
    :param overall_successful_attacks: Number of successful attacks across all risk categories
    :type overall_successful_attacks: int
    :param hate_unfairness_asr: Attack success rate for hate/unfairness risk category
    :type hate_unfairness_asr: float
    :param hate_unfairness_total: Total number of attacks for hate/unfairness risk category
    :type hate_unfairness_total: int
    :param hate_unfairness_successful_attacks: Number of successful attacks for hate/unfairness risk category
    :type hate_unfairness_successful_attacks: int
    :param violence_asr: Attack success rate for violence risk category
    :type violence_asr: float
    :param violence_total: Total number of attacks for violence risk category
    :type violence_total: int
    :param violence_successful_attacks: Number of successful attacks for violence risk category
    :type violence_successful_attacks: int
    :param sexual_asr: Attack success rate for sexual risk category
    :type sexual_asr: float
    :param sexual_total: Total number of attacks for sexual risk category
    :type sexual_total: int
    :param sexual_successful_attacks: Number of successful attacks for sexual risk category
    :type sexual_successful_attacks: int
    :param self_harm_asr: Attack success rate for self-harm risk category
    :type self_harm_asr: float
    :param self_harm_total: Total number of attacks for self-harm risk category
    :type self_harm_total: int
    :param self_harm_successful_attacks: Number of successful attacks for self-harm risk category
    :type self_harm_successful_attacks: int
    """
    # Aggregate over every risk category
    overall_asr: float
    overall_total: int
    overall_successful_attacks: int
    # Hate/unfairness
    hate_unfairness_asr: float
    hate_unfairness_total: int
    hate_unfairness_successful_attacks: int
    # Violence
    violence_asr: float
    violence_total: int
    violence_successful_attacks: int
    # Sexual
    sexual_asr: float
    sexual_total: int
    sexual_successful_attacks: int
    # Self-harm
    self_harm_asr: float
    self_harm_total: int
    self_harm_successful_attacks: int
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@experimental
class AttackTechniqueSummary(TypedDict):
    """Summary of attack success rates (ASR) across attack complexity levels.

    The entries below are the keys of the dictionary. The overall aggregate and
    each complexity level (baseline, easy, moderate, difficult) expose the same
    three keys: the attack success rate (``*_asr``), the number of attacks
    (``*_total``) and the number of successful attacks
    (``*_successful_attacks``).

    :param overall_asr: Overall attack success rate across all complexity levels
    :type overall_asr: float
    :param overall_total: Total number of attacks across all complexity levels
    :type overall_total: int
    :param overall_successful_attacks: Number of successful attacks across all complexity levels
    :type overall_successful_attacks: int
    :param baseline_asr: Attack success rate for baseline complexity level
    :type baseline_asr: float
    :param baseline_total: Total number of attacks for baseline complexity level
    :type baseline_total: int
    :param baseline_successful_attacks: Number of successful attacks for baseline complexity level
    :type baseline_successful_attacks: int
    :param easy_complexity_asr: Attack success rate for easy complexity level
    :type easy_complexity_asr: float
    :param easy_complexity_total: Total number of attacks for easy complexity level
    :type easy_complexity_total: int
    :param easy_complexity_successful_attacks: Number of successful attacks for easy complexity level
    :type easy_complexity_successful_attacks: int
    :param moderate_complexity_asr: Attack success rate for moderate complexity level
    :type moderate_complexity_asr: float
    :param moderate_complexity_total: Total number of attacks for moderate complexity level
    :type moderate_complexity_total: int
    :param moderate_complexity_successful_attacks: Number of successful attacks for moderate complexity level
    :type moderate_complexity_successful_attacks: int
    :param difficult_complexity_asr: Attack success rate for difficult complexity level
    :type difficult_complexity_asr: float
    :param difficult_complexity_total: Total number of attacks for difficult complexity level
    :type difficult_complexity_total: int
    :param difficult_complexity_successful_attacks: Number of successful attacks for difficult complexity level
    :type difficult_complexity_successful_attacks: int
    """
    # Aggregate over every complexity level
    overall_asr: float
    overall_total: int
    overall_successful_attacks: int
    # Baseline
    baseline_asr: float
    baseline_total: int
    baseline_successful_attacks: int
    # Easy
    easy_complexity_asr: float
    easy_complexity_total: int
    easy_complexity_successful_attacks: int
    # Moderate
    moderate_complexity_asr: float
    moderate_complexity_total: int
    moderate_complexity_successful_attacks: int
    # Difficult
    difficult_complexity_asr: float
    difficult_complexity_total: int
    difficult_complexity_successful_attacks: int
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@experimental
class JointRiskAttackSummaryItem(TypedDict):
    """Attack success rates (ASR) for one risk category, broken down by complexity level.

    The entries below are the keys of the dictionary.

    :param risk_category: The risk category being summarized
    :type risk_category: str
    :param baseline_asr: Attack success rate for baseline complexity level
    :type baseline_asr: float
    :param easy_complexity_asr: Attack success rate for easy complexity level
    :type easy_complexity_asr: float
    :param moderate_complexity_asr: Attack success rate for moderate complexity level
    :type moderate_complexity_asr: float
    :param difficult_complexity_asr: Attack success rate for difficult complexity level
    :type difficult_complexity_asr: float
    """
    risk_category: str
    baseline_asr: float
    easy_complexity_asr: float
    moderate_complexity_asr: float
    difficult_complexity_asr: float
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@experimental
class RedTeamingScorecard(TypedDict):
    """TypedDict representation of a Red Team Agent scorecard.

    The entries below are the keys of the dictionary.

    :param risk_category_summary: Overall metrics by risk category
    :type risk_category_summary: List[RiskCategorySummary]
    :param attack_technique_summary: Overall metrics by attack technique complexity
    :type attack_technique_summary: List[AttackTechniqueSummary]
    :param joint_risk_attack_summary: Detailed metrics by risk category and complexity level
    :type joint_risk_attack_summary: List[JointRiskAttackSummaryItem]
    :param detailed_joint_risk_attack_asr: Detailed ASR information broken down by complexity level,
        risk category, and converter (three nested string-keyed dictionaries ending in a float)
    :type detailed_joint_risk_attack_asr: Dict[str, Dict[str, Dict[str, float]]]
    """
    risk_category_summary: List[RiskCategorySummary]
    attack_technique_summary: List[AttackTechniqueSummary]
    joint_risk_attack_summary: List[JointRiskAttackSummaryItem]
    detailed_joint_risk_attack_asr: Dict[str, Dict[str, Dict[str, float]]]
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@experimental
class AttackObjectiveSource(TypedDict):
    """Information about how attack objectives were generated.

    The entries below are the keys of the dictionary.

    :param application_scenario: The application scenario used for generating attack objectives
    :type application_scenario: str
    :param risk_categories: List of risk categories targeted by the attack objectives
    :type risk_categories: List[str]
    :param custom_attack_seed_prompts: Custom prompts used to seed attack objective generation
    :type custom_attack_seed_prompts: str
    :param policy_document: Policy document used for generating attack objectives
    :type policy_document: str
    """
    application_scenario: str
    risk_categories: List[str]
    custom_attack_seed_prompts: str
    policy_document: str
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
@experimental
class RedTeamingParameters(TypedDict):
    """TypedDict representation of the parameters used in a red team evaluation.

    The entries below are the keys of the dictionary.

    :param attack_objective_generated_from: Information about the source of attack objectives
    :type attack_objective_generated_from: AttackObjectiveSource
    :param attack_complexity: List of complexity levels used in the attacks
    :type attack_complexity: List[str]
    :param techniques_used: Dictionary mapping complexity levels to lists of converter techniques used
    :type techniques_used: Dict[str, List[str]]
    """
    attack_objective_generated_from: AttackObjectiveSource
    attack_complexity: List[str]
    techniques_used: Dict[str, List[str]]
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
@experimental
class RiskAssessment(TypedDict):
    """Assessment of content safety risks in a conversation.

    The entries below are the keys of the dictionary; each maps to a
    string-to-string dictionary describing one risk category. The formatting
    methods of ``RedTeamResult`` read a ``severity_label`` key from each of
    these per-category dictionaries.

    :param violence: Assessment of violence-related content
    :type violence: Dict[str, str]
    :param sexual: Assessment of sexual content
    :type sexual: Dict[str, str]
    :param self_harm: Assessment of self-harm content
    :type self_harm: Dict[str, str]
    :param hate_unfair: Assessment of hate or unfairness content
    :type hate_unfair: Dict[str, str]
    """
    violence: Dict[str, str]
    sexual: Dict[str, str]
    self_harm: Dict[str, str]
    hate_unfair: Dict[str, str]
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
@experimental
class AttackDetails(TypedDict):
    """TypedDict representation of a simulated conversation in a red team evaluation.

    The entries below are the keys of the dictionary.

    :param attack_success: Whether the attack was successful; may be ``None``, in which case
        ``RedTeamResult`` leaves the attack-success information out of its formatted output
    :type attack_success: Optional[bool]
    :param attack_technique: The attack technique used in the conversation
    :type attack_technique: str
    :param attack_complexity: The complexity level of the attack
    :type attack_complexity: str
    :param risk_category: The risk category targeted by the attack
    :type risk_category: str
    :param conversation: List of messages exchanged in the conversation; ``RedTeamResult`` reads
        the ``role`` and ``content`` keys of each message
    :type conversation: List[Dict[str, str]]
    :param risk_assessment: Dictionary containing content safety assessment for the conversation
    :type risk_assessment: Optional[RiskAssessment]
    """
    attack_success: Optional[bool]
    attack_technique: str
    attack_complexity: str
    risk_category: str
    conversation: List[Dict[str, str]]
    risk_assessment: Optional[RiskAssessment]
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@experimental
class ScanResult(TypedDict):
    """TypedDict representation of a Red Team Agent evaluation result.

    The entries below are the keys of the dictionary.

    :param scorecard: Scorecard containing summary and detailed ASR information
    :type scorecard: RedTeamingScorecard
    :param parameters: Parameters containing metadata about the evaluation run
    :type parameters: RedTeamingParameters
    :param attack_details: List of AttackDetails objects representing the conversations in the evaluation
    :type attack_details: List[AttackDetails]
    :param studio_url: Optional URL for the studio
    :type studio_url: Optional[str]
    """
    scorecard: RedTeamingScorecard
    parameters: RedTeamingParameters
    attack_details: List[AttackDetails]
    studio_url: Optional[str]
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
@experimental
class RedTeamResult:
    """Holds the outcome of a red team scan and renders it in several formats.

    :param scan_result: The full scan result (scorecard, parameters, attack details, studio URL), if any
    :type scan_result: Optional[ScanResult]
    :param attack_details: The per-attack conversation records, if any
    :type attack_details: Optional[List[AttackDetails]]
    """

    def __init__(
        self,
        scan_result: Optional[ScanResult] = None,
        attack_details: Optional[List[AttackDetails]] = None
    ):
        self.scan_result = scan_result
        self.attack_details = attack_details

    @staticmethod
    def _iter_user_assistant_pairs(messages):
        """Yield ``(user_msg, assistant_msg)`` for every well-formed turn in *messages*.

        Messages are read two at a time from positions (0, 1), (2, 3), ...; a pair is
        yielded only when the first message has role ``"user"`` and the second has role
        ``"assistant"``. A trailing unpaired message is ignored.
        """
        for start in range(0, len(messages) - 1, 2):
            user_msg, assistant_msg = messages[start], messages[start + 1]
            if user_msg.get("role") == "user" and assistant_msg.get("role") == "assistant":
                yield user_msg, assistant_msg

    @staticmethod
    def _iter_labelled_assessments(risk_assessment):
        """Yield ``(category, assessment)`` for assessments carrying a non-empty ``severity_label``.

        ``risk_assessment`` may be ``None`` or empty, in which case nothing is yielded.
        """
        for category, assessment in (risk_assessment or {}).items():
            if assessment and assessment.get("severity_label", None):
                yield category, assessment

    def to_json(self) -> str:
        """
        Serializes the scan result to a JSON string.

        :returns: A string containing the scan result in JSON format, or an empty string
            when there is no scan result.
        :rtype: str
        """
        return json.dumps(self.scan_result) if self.scan_result else ""

    def to_scorecard(self) -> Optional[RedTeamingScorecard]:
        """Extracts the scorecard from a RedTeamResult object.

        :returns: The ``scorecard`` entry of the scan result, or ``None`` when there is no
            scan result or it has no scorecard.
        :rtype: Optional[RedTeamingScorecard]
        """
        return self.scan_result.get("scorecard", None) if self.scan_result else None

    def to_eval_qr_json_lines(self) -> List[str]:
        """
        Converts conversations in messages format to query-response format suitable for evaluation.

        The output format follows the JSONL pattern with each line containing:
        {
            "query": "user message content",
            "response": "assistant message content",
            "risk_category": "risk category",
            "attack_strategy": "strategy name",
            "attack_complexity": "complexity level",
            "attack_success": "true|false", (if available from evaluation)
            "category": "risk category", (if available from evaluation)
            "severity_level": "low|medium|high", (if available from evaluation)
            "threshold": "threshold value" (if available from evaluation)
        }

        :returns: A list of strings containing query-response pairs in JSONL format
            (one JSON document per element); an empty list when there are no attack details.
        :rtype: List[str]
        """
        if not self.attack_details:
            # Same (empty, falsy) result as before, but with the documented list type
            # instead of an empty string.
            return []

        result_lines = []

        for conversation in self.attack_details:
            messages = conversation.get("conversation", [])
            attack_technique = conversation.get("attack_technique", "")
            attack_complexity = conversation.get("attack_complexity", "")
            risk_category = conversation.get("risk_category", "")
            attack_success = conversation.get("attack_success")
            risk_assessment = conversation.get("risk_assessment", {})

            for user_msg, assistant_msg in self._iter_user_assistant_pairs(messages):
                # Create the base query-response pair
                qr_pair = {
                    "query": user_msg.get("content", ""),
                    "response": assistant_msg.get("content", ""),
                    "risk_category": risk_category,
                    "attack_strategy": attack_technique,
                    "attack_complexity": attack_complexity
                }

                # Add evaluation data if available
                if attack_success is not None:
                    qr_pair["attack_success"] = str(attack_success).lower()

                # Add risk assessment data if available. When several categories carry a
                # severity label, the last one iterated overwrites the earlier ones.
                for category, assessment in self._iter_labelled_assessments(risk_assessment):
                    qr_pair["category"] = category
                    qr_pair["severity_level"] = assessment.get("severity_label", "").lower()
                    # Add threshold if available in the future

                result_lines.append(json.dumps(qr_pair))

        return result_lines

    def attack_simulation(self) -> str:
        """
        Returns the attack simulation data in a human-readable format.

        For every attack the output lists the technique, complexity and risk category,
        then each user/assistant exchange, then the attack outcome (when known) and any
        risk assessments that carry a severity label. Sections are separated by blank lines.

        :returns: A string containing the attack simulation data in a human-readable format,
            or an empty string when there are no attack details.
        :rtype: str
        """
        if not self.attack_details:
            return ""

        result_lines = []

        for conversation in self.attack_details:
            messages = conversation.get("conversation", [])
            attack_technique = conversation.get("attack_technique", "")
            attack_complexity = conversation.get("attack_complexity", "")
            risk_category = conversation.get("risk_category", "")
            attack_success = conversation.get("attack_success")
            risk_assessment = conversation.get("risk_assessment", {})

            result_lines.append(f"Attack Technique: {attack_technique}")
            result_lines.append(f"Attack Complexity: {attack_complexity}")
            result_lines.append(f"Risk Category: {risk_category}")
            result_lines.append("")

            for user_msg, assistant_msg in self._iter_user_assistant_pairs(messages):
                result_lines.append(f"User: {user_msg.get('content', '')}")
                result_lines.append(f"Assistant: {assistant_msg.get('content', '')}")
                result_lines.append("")

            if attack_success is not None:
                result_lines.append(f"Attack Success: {'Successful' if attack_success else 'Failed'}")
                result_lines.append("")

            for category, assessment in self._iter_labelled_assessments(risk_assessment):
                result_lines.append(f"Category: {category}")
                result_lines.append(f"Severity Level: {assessment.get('severity_label', '')}")
                result_lines.append("")

        return "\n".join(result_lines)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constants used in Red Team Agent.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
# Evaluators are imported directly from the top-level azure.ai.evaluation package
|
|
7
|
+
from azure.ai.evaluation import ViolenceEvaluator, HateUnfairnessEvaluator, SexualEvaluator, SelfHarmEvaluator
|
|
8
|
+
from .._attack_strategy import AttackStrategy
|
|
9
|
+
from .._attack_objective_generator import RiskCategory
|
|
10
|
+
|
|
11
|
+
# Baseline identifier and file extensions
BASELINE_IDENTIFIER = "baseline"
DATA_EXT = ".jsonl"
RESULTS_EXT = ".json"

# Strategies rated "easy", in the order in which they are inserted into the map below
_EASY_STRATEGIES = (
    AttackStrategy.AnsiAttack,
    AttackStrategy.AsciiArt,
    AttackStrategy.AsciiSmuggler,
    AttackStrategy.Atbash,
    AttackStrategy.Base64,
    AttackStrategy.Binary,
    AttackStrategy.Caesar,
    AttackStrategy.CharacterSpace,
    AttackStrategy.CharSwap,
    AttackStrategy.Diacritic,
    AttackStrategy.Flip,
    AttackStrategy.Leetspeak,
    AttackStrategy.Morse,
    AttackStrategy.ROT13,
    AttackStrategy.SuffixAppend,
    AttackStrategy.StringJoin,
    AttackStrategy.UnicodeConfusable,
    AttackStrategy.UnicodeSubstitution,
    AttackStrategy.Url,
    AttackStrategy.EASY,
)

# Mapping of attack strategies (keyed by the string form of each strategy's value)
# to complexity levels. Insertion order is kept: baseline, the easy group, the
# moderate and difficult entries, and finally Jailbreak.
ATTACK_STRATEGY_COMPLEXITY_MAP = {
    str(AttackStrategy.Baseline.value): "baseline",
    **{str(member.value): "easy" for member in _EASY_STRATEGIES},
    str(AttackStrategy.Tense.value): "moderate",
    str(AttackStrategy.MODERATE.value): "moderate",
    str(AttackStrategy.DIFFICULT.value): "difficult",
    str(AttackStrategy.Jailbreak.value): "easy",
}

# Mapping of risk categories to their evaluators
RISK_CATEGORY_EVALUATOR_MAP = dict(
    [
        (RiskCategory.Violence, ViolenceEvaluator),
        (RiskCategory.HateUnfairness, HateUnfairnessEvaluator),
        (RiskCategory.Sexual, SexualEvaluator),
        (RiskCategory.SelfHarm, SelfHarmEvaluator),
    ]
)

# Task timeouts and status codes
INTERNAL_TASK_TIMEOUT = 120

# Task status definitions: each upper-case key maps to its lower-case status string
TASK_STATUS = {
    status.upper(): status
    for status in ("pending", "running", "completed", "failed", "timeout", "incomplete")
}
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utility functions for formatting, conversion, and processing in Red Team Agent.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import math
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from typing import Dict, List, Union, Any, Optional, cast
|
|
10
|
+
from .._attack_strategy import AttackStrategy
|
|
11
|
+
from .._red_team_result import RedTeamResult
|
|
12
|
+
from pyrit.models import ChatMessage
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def message_to_dict(message: ChatMessage) -> Dict[str, str]:
    """Build a plain dictionary from a ChatMessage.

    Only the ``role`` and ``content`` attributes of the message are read;
    anything else on the message is ignored.

    :param message: The chat message to convert
    :type message: ChatMessage
    :return: Dictionary with the keys ``"role"`` and ``"content"``
    :rtype: Dict[str, str]
    """
    role = message.role
    content = message.content
    return dict(role=role, content=content)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_strategy_name(attack_strategy: Union[AttackStrategy, List[AttackStrategy]]) -> str:
    """Return the string name of a strategy or of a composed strategy list.

    A single strategy yields ``str(strategy.value)``. A list of strategies
    yields the ``str()`` of each member's value, joined with underscores in
    list order.

    :param attack_strategy: The attack strategy or list of strategies
    :type attack_strategy: Union[AttackStrategy, List[AttackStrategy]]
    :return: A string name for the strategy
    :rtype: str
    """
    # Single strategy: its value is the whole name.
    if not isinstance(attack_strategy, list):
        return str(attack_strategy.value)

    # Composed strategy: one underscore-separated segment per member.
    return "_".join(str(member.value) for member in attack_strategy)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_flattened_attack_strategies(attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]]) -> List[Union[AttackStrategy, List[AttackStrategy]]]:
    """Expand complexity-group markers and de-duplicate attack strategies.

    The group markers ``AttackStrategy.EASY``, ``AttackStrategy.MODERATE`` and
    ``AttackStrategy.DIFFICULT`` are replaced by the concrete strategies they
    stand for:

    * EASY      -> Base64, Flip, Morse
    * MODERATE  -> Tense
    * DIFFICULT -> the composition of Tense and Base64

    The expansions are appended after the strategies the caller listed
    explicitly, and ``AttackStrategy.Baseline`` is always appended last.
    Duplicates are then dropped, keeping the first occurrence. Composed
    strategies (lists) are compared by their ordered contents and are returned
    as fresh list copies. The input is not modified.

    Every occurrence of a group marker is removed, so a marker passed more
    than once never appears in the returned list.

    :param attack_strategies: List of attack strategies to flatten
    :type attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]]
    :return: Flattened list of attack strategies
    :rtype: List[Union[AttackStrategy, List[AttackStrategy]]]
    """
    # Work on a copy so the caller's list is never mutated.
    pending = list(attack_strategies)

    # Each group marker is paired with a factory for its concrete strategies.
    # Factories (rather than ready-made lists) keep AttackStrategy.Compose from
    # being called unless DIFFICULT was actually requested.
    group_expansions = (
        (
            AttackStrategy.EASY,
            lambda: [AttackStrategy.Base64, AttackStrategy.Flip, AttackStrategy.Morse],
        ),
        (
            AttackStrategy.MODERATE,
            lambda: [AttackStrategy.Tense],
        ),
        (
            AttackStrategy.DIFFICULT,
            lambda: [AttackStrategy.Compose([AttackStrategy.Tense, AttackStrategy.Base64])],
        ),
    )
    for marker, build_members in group_expansions:
        if marker in pending:
            # Filter instead of list.remove(): remove() drops only the first
            # match, which would let a repeated marker leak into the result.
            pending = [item for item in pending if item != marker]
            pending.extend(build_members())

    # Baseline is always included.
    pending.append(AttackStrategy.Baseline)

    flattened_strategies = []
    seen_strategies = set()
    for strategy in pending:
        if isinstance(strategy, list):
            # Composed strategy: lists are unhashable, so track them as tuples.
            dedupe_key = tuple(strategy)
            entry = list(strategy)
        elif isinstance(strategy, AttackStrategy):
            dedupe_key = strategy
            entry = strategy
        else:
            # Anything that is neither a strategy nor a list of them is skipped.
            continue

        if dedupe_key not in seen_strategies:
            seen_strategies.add(dedupe_key)
            flattened_strategies.append(entry)

    return flattened_strategies
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_attack_success(result: str) -> bool:
    """Tell whether an evaluation result denotes a successful attack.

    The result counts as an attack success when it equals the value stored
    under the key ``False`` in ``EVALUATION_PASS_FAIL_MAPPING``. If the mapping
    has no ``False`` key, the result is compared against ``"true"`` instead.

    :param result: The evaluation result label to check
    :type result: str
    :return: True if the result matches the label stored under ``False``
    :rtype: bool
    """
    # Imported at call time rather than at module level.
    # NOTE(review): presumably to avoid a circular import -- confirm.
    from .._red_team import EVALUATION_PASS_FAIL_MAPPING

    failure_label = EVALUATION_PASS_FAIL_MAPPING.get(False, "true")
    return result == failure_label
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def format_scorecard(redteam_result: RedTeamResult) -> str:
    """Render the scorecard of a RedTeamResult as a plain-text table.

    The text starts with the overall attack success rate (ASR) and the
    successful/total attack counts, both read from the first entry of
    ``scorecard["risk_category_summary"]`` (each defaults to 0 when that list
    is empty or the key is missing). A table follows with one row per entry of
    ``scorecard["joint_risk_attack_summary"]``, showing the baseline, easy,
    moderate and difficult complexity ASR of each risk category. Missing,
    ``None`` or NaN rates are shown as ``N/A``.

    :param redteam_result: The RedTeamResult object to format
    :type redteam_result: RedTeamResult
    :return: A formatted scorecard as a string
    :rtype: str
    """

    def _as_percent(rate: Any) -> str:
        # One table cell: "N/A" for absent rates, otherwise "<rate>%".
        if is_none_or_nan(rate):
            return "N/A"
        return f"{rate}%"

    scorecard = redteam_result["scorecard"]
    category_summaries = scorecard["risk_category_summary"]
    risk_summary = category_summaries[0] if category_summaries else {}

    overall_asr = risk_summary.get("overall_asr", 0)
    overall_successes = risk_summary.get("overall_attack_successes", 0)
    overall_total = risk_summary.get("overall_total", 0)

    rule = "-" * 108
    header = f"{'Risk Category':<15}| {'Baseline ASR':<14} | {'Easy-Complexity Attacks ASR':<28} | {'Moderate-Complexity Attacks ASR':<30} | {'Difficult-Complexity Attacks ASR':<30}"
    lines = [
        f"Overall ASR: {overall_asr}%",
        f"Attack Success: {overall_successes}/{overall_total} attacks were successful",
        rule,
        header,
        rule,
    ]

    rate_keys = (
        "baseline_asr",
        "easy_complexity_asr",
        "moderate_complexity_asr",
        "difficult_complexity_asr",
    )
    for entry in scorecard["joint_risk_attack_summary"]:
        risk_category = entry["risk_category"].replace("_", "-").capitalize()
        baseline, easy, moderate, difficult = [_as_percent(entry.get(key)) for key in rate_keys]
        # The moderate cell is padded to 31 because its header text is 31
        # characters long and overflows the 30 requested in the header row.
        lines.append(f"{risk_category:<15}| {baseline:<14} | {easy:<28} | {moderate:<31} | {difficult:<30}")

    return "\n".join(lines)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def is_none_or_nan(value: Any) -> bool:
    """Report whether a value is ``None`` or a floating-point NaN.

    Only ``float`` instances are tested for NaN. Every other non-``None``
    value, including the string ``"nan"``, yields False.

    :param value: The value to inspect
    :type value: Any
    :return: True if the value is None or a float NaN, otherwise False
    :rtype: bool
    """
    return value is None or (isinstance(value, float) and math.isnan(value))
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def list_mean_nan_safe(data_list: List[Any]) -> float:
    """Average the usable entries of a list, skipping None and NaN values.

    Entries for which ``is_none_or_nan`` returns True are left out of both the
    sum and the count.

    :param data_list: List of values to calculate mean for
    :type data_list: List[Any]
    :return: Mean of the remaining values, or 0.0 if none remain
    :rtype: float
    """
    usable = [entry for entry in data_list if not is_none_or_nan(entry)]
    # Nothing left to average: return 0.0 rather than dividing by zero.
    return sum(usable) / len(usable) if usable else 0.0
|