azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
|
@@ -39,6 +39,7 @@ ATTACK_STRATEGY_COMPLEXITY_MAP = {
|
|
|
39
39
|
str(AttackStrategy.MODERATE.value): "moderate",
|
|
40
40
|
str(AttackStrategy.DIFFICULT.value): "difficult",
|
|
41
41
|
str(AttackStrategy.Jailbreak.value): "easy",
|
|
42
|
+
str(AttackStrategy.IndirectJailbreak.value): "easy",
|
|
42
43
|
str(AttackStrategy.MultiTurn.value): "difficult",
|
|
43
44
|
str(AttackStrategy.Crescendo.value): "difficult",
|
|
44
45
|
}
|
|
@@ -15,17 +15,26 @@ from .._attack_strategy import AttackStrategy
|
|
|
15
15
|
from .._red_team_result import RedTeamResult
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
def message_to_dict(
|
|
18
|
+
def message_to_dict(
|
|
19
|
+
message: ChatMessage, context: str = None, tool_calls: List[Any] = None, token_usage: Dict[str, Any] = None
|
|
20
|
+
) -> Dict[str, Any]:
|
|
19
21
|
"""Convert a ChatMessage and context to dictionary format.
|
|
20
22
|
|
|
21
23
|
:param message: The chat message to convert
|
|
22
24
|
:type message: ChatMessage
|
|
23
25
|
:param context: Additional context to include in the dictionary
|
|
24
26
|
:type context: str
|
|
27
|
+
:param tool_calls: List of tool calls to include in the dictionary
|
|
28
|
+
:type tool_calls: List[Any]
|
|
29
|
+
:param token_usage: Token usage information from the callback
|
|
30
|
+
:type token_usage: Dict[str, Any]
|
|
25
31
|
:return: Dictionary representation with role and content
|
|
26
|
-
:rtype: Dict[str,
|
|
32
|
+
:rtype: Dict[str, Any]
|
|
27
33
|
"""
|
|
28
|
-
|
|
34
|
+
msg_dict = {"role": message.role, "content": message.content, "context": context, "tool_calls": tool_calls}
|
|
35
|
+
if token_usage:
|
|
36
|
+
msg_dict["token_usage"] = token_usage
|
|
37
|
+
return msg_dict
|
|
29
38
|
|
|
30
39
|
|
|
31
40
|
def get_strategy_name(attack_strategy: Union[AttackStrategy, List[AttackStrategy]]) -> str:
|
|
@@ -164,6 +173,82 @@ def list_mean_nan_safe(data_list: List[Any]) -> float:
|
|
|
164
173
|
return sum(filtered_list) / len(filtered_list)
|
|
165
174
|
|
|
166
175
|
|
|
176
|
+
def format_content_by_modality(content: str, modality: str) -> str:
|
|
177
|
+
"""Format content based on the specified modality, returning one random format.
|
|
178
|
+
|
|
179
|
+
:param content: The content to format
|
|
180
|
+
:type content: str
|
|
181
|
+
:param modality: The modality type (html, markdown, footnote, code, text)
|
|
182
|
+
:type modality: str
|
|
183
|
+
:return: Formatted content
|
|
184
|
+
:rtype: str
|
|
185
|
+
"""
|
|
186
|
+
import html
|
|
187
|
+
import random
|
|
188
|
+
|
|
189
|
+
def format_as_html(text):
|
|
190
|
+
html_text = html.escape(text)
|
|
191
|
+
return [
|
|
192
|
+
f"<p>{html_text}</p>",
|
|
193
|
+
f'<div style="display:none">{html_text}</div>',
|
|
194
|
+
f'<span style="display:none">{html_text}</span>',
|
|
195
|
+
f"<div hidden>{html_text}</div>",
|
|
196
|
+
f'<meta content="{html_text}">',
|
|
197
|
+
]
|
|
198
|
+
|
|
199
|
+
def format_as_markdown(text):
|
|
200
|
+
markdown_text = text.replace("\n", " \n") # Convert newlines to Markdown line breaks
|
|
201
|
+
return [
|
|
202
|
+
f"\n**{markdown_text}**\n", # Bold
|
|
203
|
+
f"\n*{markdown_text}*\n", # Italic
|
|
204
|
+
f"\n`{markdown_text}`\n", # Inline code
|
|
205
|
+
f"\n```{markdown_text}```\n", # Code block
|
|
206
|
+
f"\n[Link]({markdown_text})\n", # Link
|
|
207
|
+
]
|
|
208
|
+
|
|
209
|
+
def format_as_footnote(text):
|
|
210
|
+
footnote_text = text.replace("\n", " ")
|
|
211
|
+
return [
|
|
212
|
+
f"[^{footnote_text}]", # Footnote reference
|
|
213
|
+
f"\n[1] {footnote_text}\n", # Footnote reference with number
|
|
214
|
+
f"<sup>{footnote_text}</sup>", # Superscript
|
|
215
|
+
f'<span class="footnote">{footnote_text}</span>', # Custom footnote span
|
|
216
|
+
f'<div class="footnote">{footnote_text}</div>', # Custom footnote div
|
|
217
|
+
]
|
|
218
|
+
|
|
219
|
+
def format_as_code(text):
|
|
220
|
+
code_text = text.replace("\n", " ")
|
|
221
|
+
return [
|
|
222
|
+
f"`{code_text}`", # Inline code
|
|
223
|
+
f"```\n{code_text}\n```", # Code block
|
|
224
|
+
f'"""\n{code_text}\n"""', # Code block
|
|
225
|
+
f"# {code_text}", # Inline comment
|
|
226
|
+
f'def function():\n print("{code_text}")', # Function call
|
|
227
|
+
]
|
|
228
|
+
|
|
229
|
+
def format_as_text(text):
|
|
230
|
+
return [f"<document>{text}</document>"] # Return text in document tags
|
|
231
|
+
|
|
232
|
+
# Mapping of modality types to formatting functions
|
|
233
|
+
modality_formatters = {
|
|
234
|
+
"html": format_as_html,
|
|
235
|
+
"markdown": format_as_markdown,
|
|
236
|
+
"footnote": format_as_footnote,
|
|
237
|
+
"code": format_as_code,
|
|
238
|
+
"text": format_as_text,
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
# Get formatter based on modality type
|
|
242
|
+
if modality and modality.lower() in modality_formatters:
|
|
243
|
+
formatter = modality_formatters[modality.lower()]
|
|
244
|
+
formats = formatter(content)
|
|
245
|
+
# Return one random format from the available options
|
|
246
|
+
return random.choice(formats)
|
|
247
|
+
else:
|
|
248
|
+
# Return plain text if modality not recognized
|
|
249
|
+
return content
|
|
250
|
+
|
|
251
|
+
|
|
167
252
|
def write_pyrit_outputs_to_file(
|
|
168
253
|
*,
|
|
169
254
|
output_path: str,
|
|
@@ -194,7 +279,13 @@ def write_pyrit_outputs_to_file(
|
|
|
194
279
|
|
|
195
280
|
conversations = [
|
|
196
281
|
[
|
|
197
|
-
(
|
|
282
|
+
(
|
|
283
|
+
item.to_chat_message(),
|
|
284
|
+
prompt_to_context.get(item.original_value, "") or item.labels.get("context", ""),
|
|
285
|
+
item.labels.get("tool_calls", []),
|
|
286
|
+
item.labels.get("risk_sub_type"),
|
|
287
|
+
item.labels.get("token_usage"),
|
|
288
|
+
)
|
|
198
289
|
for item in group
|
|
199
290
|
]
|
|
200
291
|
for conv_id, group in itertools.groupby(prompts_request_pieces, key=lambda x: x.conversation_id)
|
|
@@ -217,16 +308,22 @@ def write_pyrit_outputs_to_file(
|
|
|
217
308
|
if conversation[0][0].role == "system":
|
|
218
309
|
# Skip system messages in the output
|
|
219
310
|
continue
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
311
|
+
conv_dict = {
|
|
312
|
+
"conversation": {
|
|
313
|
+
"messages": [
|
|
314
|
+
message_to_dict(
|
|
315
|
+
message[0], message[1], message[2], message[4] if len(message) > 4 else None
|
|
316
|
+
)
|
|
317
|
+
for message in conversation
|
|
318
|
+
]
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
# Add risk_sub_type if present (check first message for the label)
|
|
322
|
+
if conversation and len(conversation) > 0 and len(conversation[0]) > 3:
|
|
323
|
+
risk_sub_type = conversation[0][3]
|
|
324
|
+
if risk_sub_type:
|
|
325
|
+
conv_dict["risk_sub_type"] = risk_sub_type
|
|
326
|
+
json_lines += json.dumps(conv_dict) + "\n"
|
|
230
327
|
with Path(output_path).open("w") as f:
|
|
231
328
|
f.writelines(json_lines)
|
|
232
329
|
logger.debug(
|
|
@@ -248,16 +345,20 @@ def write_pyrit_outputs_to_file(
|
|
|
248
345
|
if conversation[0][0].role == "system":
|
|
249
346
|
# Skip system messages in the output
|
|
250
347
|
continue
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
)
|
|
348
|
+
conv_dict = {
|
|
349
|
+
"conversation": {
|
|
350
|
+
"messages": [
|
|
351
|
+
message_to_dict(message[0], message[1], message[2], message[4] if len(message) > 4 else None)
|
|
352
|
+
for message in conversation
|
|
353
|
+
]
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
# Add risk_sub_type if present (check first message for the label)
|
|
357
|
+
if conversation and len(conversation) > 0 and len(conversation[0]) > 3:
|
|
358
|
+
risk_sub_type = conversation[0][3]
|
|
359
|
+
if risk_sub_type:
|
|
360
|
+
conv_dict["risk_sub_type"] = risk_sub_type
|
|
361
|
+
json_lines += json.dumps(conv_dict) + "\n"
|
|
261
362
|
with Path(output_path).open("w") as f:
|
|
262
363
|
f.writelines(json_lines)
|
|
263
364
|
logger.debug(f"Successfully wrote {len(conversations)} conversations to {output_path}")
|
|
@@ -15,9 +15,11 @@ RISK_CATEGORY_METRIC_MAP = {
|
|
|
15
15
|
RiskCategory.SelfHarm: EvaluationMetrics.SELF_HARM,
|
|
16
16
|
RiskCategory.ProtectedMaterial: EvaluationMetrics.PROTECTED_MATERIAL,
|
|
17
17
|
RiskCategory.UngroundedAttributes: EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
|
|
18
|
-
RiskCategory.IndirectAttack: EvaluationMetrics.XPIA,
|
|
19
18
|
_InternalRiskCategory.ECI: _InternalEvaluationMetrics.ECI,
|
|
20
19
|
RiskCategory.CodeVulnerability: EvaluationMetrics.CODE_VULNERABILITY,
|
|
20
|
+
RiskCategory.SensitiveDataLeakage: EvaluationMetrics.SENSITIVE_DATA_LEAKAGE,
|
|
21
|
+
RiskCategory.TaskAdherence: EvaluationMetrics.TASK_ADHERENCE,
|
|
22
|
+
RiskCategory.ProhibitedActions: EvaluationMetrics.PROHIBITED_ACTIONS,
|
|
21
23
|
}
|
|
22
24
|
|
|
23
25
|
RISK_CATEGORY_ANNOTATION_TASK_MAP = {
|
|
@@ -27,9 +29,11 @@ RISK_CATEGORY_ANNOTATION_TASK_MAP = {
|
|
|
27
29
|
RiskCategory.SelfHarm: Tasks.CONTENT_HARM,
|
|
28
30
|
RiskCategory.ProtectedMaterial: Tasks.PROTECTED_MATERIAL,
|
|
29
31
|
RiskCategory.UngroundedAttributes: Tasks.UNGROUNDED_ATTRIBUTES,
|
|
30
|
-
RiskCategory.IndirectAttack: Tasks.XPIA,
|
|
31
32
|
_InternalRiskCategory.ECI: _InternalAnnotationTasks.ECI,
|
|
32
33
|
RiskCategory.CodeVulnerability: Tasks.CODE_VULNERABILITY,
|
|
34
|
+
RiskCategory.SensitiveDataLeakage: Tasks.SENSITIVE_DATA_LEAKAGE,
|
|
35
|
+
RiskCategory.TaskAdherence: Tasks.TASK_ADHERENCE,
|
|
36
|
+
RiskCategory.ProhibitedActions: Tasks.PROHIBITED_ACTIONS,
|
|
33
37
|
}
|
|
34
38
|
|
|
35
39
|
|
|
@@ -63,8 +67,7 @@ def get_attack_objective_from_risk_category(risk_category: Union[RiskCategory])
|
|
|
63
67
|
:return: The corresponding attack objective string
|
|
64
68
|
:rtype: str
|
|
65
69
|
"""
|
|
66
|
-
|
|
67
|
-
"isa"
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
)
|
|
70
|
+
if risk_category == RiskCategory.UngroundedAttributes:
|
|
71
|
+
return "isa"
|
|
72
|
+
else:
|
|
73
|
+
return risk_category.value
|
|
@@ -68,6 +68,7 @@ def strategy_converter_map() -> Dict[Any, Union[PromptConverter, List[PromptConv
|
|
|
68
68
|
AttackStrategy.Jailbreak: None,
|
|
69
69
|
AttackStrategy.MultiTurn: None,
|
|
70
70
|
AttackStrategy.Crescendo: None,
|
|
71
|
+
AttackStrategy.IndirectJailbreak: None,
|
|
71
72
|
}
|
|
72
73
|
|
|
73
74
|
|
|
@@ -89,14 +90,11 @@ def get_converter_for_strategy(
|
|
|
89
90
|
|
|
90
91
|
def get_chat_target(
|
|
91
92
|
target: Union[PromptChatTarget, Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
|
|
92
|
-
prompt_to_context: Optional[Dict[str, str]] = None,
|
|
93
93
|
) -> PromptChatTarget:
|
|
94
94
|
"""Convert various target types to a PromptChatTarget.
|
|
95
95
|
|
|
96
96
|
:param target: The target to convert
|
|
97
97
|
:type target: Union[PromptChatTarget, Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
|
|
98
|
-
:param prompt_to_context: Optional mapping from prompt content to context
|
|
99
|
-
:type prompt_to_context: Optional[Dict[str, str]]
|
|
100
98
|
:return: A PromptChatTarget instance
|
|
101
99
|
:rtype: PromptChatTarget
|
|
102
100
|
"""
|
|
@@ -154,7 +152,7 @@ def get_chat_target(
|
|
|
154
152
|
has_callback_signature = False
|
|
155
153
|
|
|
156
154
|
if has_callback_signature:
|
|
157
|
-
chat_target = _CallbackChatTarget(callback=target
|
|
155
|
+
chat_target = _CallbackChatTarget(callback=target)
|
|
158
156
|
else:
|
|
159
157
|
|
|
160
158
|
async def callback_target(
|
|
@@ -190,26 +188,6 @@ def get_chat_target(
|
|
|
190
188
|
messages_list.append(formatted_response) # type: ignore
|
|
191
189
|
return {"messages": messages_list, "stream": stream, "session_state": session_state, "context": {}}
|
|
192
190
|
|
|
193
|
-
chat_target = _CallbackChatTarget(callback=callback_target
|
|
191
|
+
chat_target = _CallbackChatTarget(callback=callback_target) # type: ignore
|
|
194
192
|
|
|
195
193
|
return chat_target
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
def get_orchestrators_for_attack_strategies(
|
|
199
|
-
attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]]
|
|
200
|
-
) -> List[Callable]:
|
|
201
|
-
"""
|
|
202
|
-
Gets a list of orchestrator functions to use based on the attack strategies.
|
|
203
|
-
|
|
204
|
-
:param attack_strategies: The list of attack strategies
|
|
205
|
-
:type attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]]
|
|
206
|
-
:return: A list of orchestrator functions
|
|
207
|
-
:rtype: List[Callable]
|
|
208
|
-
"""
|
|
209
|
-
call_to_orchestrators = []
|
|
210
|
-
|
|
211
|
-
# Since we're just returning one orchestrator type for now, simplify the logic
|
|
212
|
-
# This can be expanded later if different orchestrators are needed for different strategies
|
|
213
|
-
return [
|
|
214
|
-
lambda chat_target, all_prompts, converter, strategy_name, risk_category: None
|
|
215
|
-
] # This will be replaced with the actual orchestrator function in the main class
|
|
@@ -14,7 +14,7 @@ from tqdm import tqdm
|
|
|
14
14
|
|
|
15
15
|
from azure.ai.evaluation._common._experimental import experimental
|
|
16
16
|
from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
|
|
17
|
-
from azure.ai.evaluation._common.onedp._client import AIProjectClient
|
|
17
|
+
from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
|
|
18
18
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
19
19
|
from azure.ai.evaluation._http_utils import get_async_http_client
|
|
20
20
|
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
@@ -15,7 +15,7 @@ import jinja2
|
|
|
15
15
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
16
16
|
from azure.ai.evaluation._http_utils import AsyncHttpPipeline
|
|
17
17
|
from .._model_tools import LLMBase, OpenAIChatCompletionsModel, RAIClient
|
|
18
|
-
from azure.ai.evaluation._common.onedp._client import AIProjectClient
|
|
18
|
+
from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
|
|
19
19
|
from .._model_tools._template_handler import TemplateParameters
|
|
20
20
|
from .constants import ConversationRole
|
|
21
21
|
|
|
@@ -11,7 +11,7 @@ from azure.ai.evaluation.simulator._constants import SupportedLanguages
|
|
|
11
11
|
from azure.ai.evaluation.simulator._helpers._language_suffix_mapping import SUPPORTED_LANGUAGES_MAPPING
|
|
12
12
|
from ..._http_utils import AsyncHttpPipeline
|
|
13
13
|
from . import ConversationBot, ConversationTurn
|
|
14
|
-
from azure.ai.evaluation._common.onedp._client import AIProjectClient
|
|
14
|
+
from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def is_closing_message(response: Union[Dict, str], recursion_depth: int = 0) -> bool:
|
|
@@ -13,7 +13,7 @@ from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_oned
|
|
|
13
13
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
14
14
|
from azure.ai.evaluation.simulator import AdversarialScenario
|
|
15
15
|
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
16
|
-
from azure.ai.evaluation._common.onedp._client import AIProjectClient
|
|
16
|
+
from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
|
|
17
17
|
from azure.core.credentials import TokenCredential
|
|
18
18
|
|
|
19
19
|
from ._adversarial_simulator import AdversarialSimulator
|
|
@@ -15,7 +15,7 @@ from azure.ai.evaluation._common._experimental import experimental
|
|
|
15
15
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
16
16
|
from azure.ai.evaluation.simulator import AdversarialScenarioJailbreak, SupportedLanguages
|
|
17
17
|
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
18
|
-
from azure.ai.evaluation._common.onedp._client import AIProjectClient
|
|
18
|
+
from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
|
|
19
19
|
from azure.core.credentials import TokenCredential
|
|
20
20
|
from azure.ai.evaluation._constants import TokenScope
|
|
21
21
|
|
|
@@ -12,7 +12,7 @@ from azure.ai.evaluation.simulator._model_tools import ManagedIdentityAPITokenMa
|
|
|
12
12
|
from azure.ai.evaluation._common.raiclient import MachineLearningServicesClient
|
|
13
13
|
from azure.ai.evaluation._constants import TokenScope
|
|
14
14
|
from azure.ai.evaluation._common.utils import is_onedp_project
|
|
15
|
-
from azure.ai.evaluation._common.onedp import AIProjectClient
|
|
15
|
+
from azure.ai.evaluation._common.onedp import ProjectsClient as AIProjectClient
|
|
16
16
|
from azure.ai.evaluation._common import EvaluationServiceOneDPClient
|
|
17
17
|
from azure.ai.evaluation._user_agent import UserAgentSingleton
|
|
18
18
|
import jwt
|
|
@@ -113,6 +113,8 @@ class GeneratedRAIClient:
|
|
|
113
113
|
strategy: Optional[str] = None,
|
|
114
114
|
language: str = "en",
|
|
115
115
|
scan_session_id: Optional[str] = None,
|
|
116
|
+
target: Optional[str] = None,
|
|
117
|
+
client_id: Optional[str] = None,
|
|
116
118
|
) -> Dict:
|
|
117
119
|
"""Get attack objectives using the auto-generated operations.
|
|
118
120
|
|
|
@@ -128,18 +130,38 @@ class GeneratedRAIClient:
|
|
|
128
130
|
:type language: str
|
|
129
131
|
:param scan_session_id: Optional unique session ID for the scan
|
|
130
132
|
:type scan_session_id: Optional[str]
|
|
133
|
+
:param target: Optional target type (model/agent)
|
|
134
|
+
:type target: Optional[str]
|
|
135
|
+
:param client_id: Optional client ID for ACA token authorization
|
|
136
|
+
:type client_id: Optional[str]
|
|
131
137
|
:return: The attack objectives
|
|
132
138
|
:rtype: Dict
|
|
133
139
|
"""
|
|
134
140
|
try:
|
|
141
|
+
# Build headers dictionary
|
|
142
|
+
headers = {}
|
|
143
|
+
if scan_session_id:
|
|
144
|
+
headers["x-ms-client-request-id"] = scan_session_id
|
|
145
|
+
if client_id:
|
|
146
|
+
from azure.identity import DefaultAzureCredential
|
|
147
|
+
|
|
148
|
+
# Get token using the client_id for managed identity
|
|
149
|
+
managed_identity_credential = DefaultAzureCredential(
|
|
150
|
+
managed_identity_client_id=client_id, exclude_interactive_browser_credential=True
|
|
151
|
+
)
|
|
152
|
+
token = managed_identity_credential.get_token(TokenScope.DEFAULT_AZURE_MANAGEMENT).token
|
|
153
|
+
headers["aml-aca-token"] = token
|
|
154
|
+
|
|
135
155
|
# Send the request using the autogenerated client
|
|
136
156
|
response = self._client.get_attack_objectives(
|
|
137
157
|
risk_types=[risk_type],
|
|
138
158
|
risk_category=risk_category,
|
|
139
159
|
lang=language,
|
|
140
160
|
strategy=strategy,
|
|
141
|
-
|
|
161
|
+
target_type=target,
|
|
162
|
+
headers=headers,
|
|
142
163
|
)
|
|
164
|
+
|
|
143
165
|
return response
|
|
144
166
|
|
|
145
167
|
except Exception as e:
|
|
@@ -195,4 +217,5 @@ class GeneratedRAIClient:
|
|
|
195
217
|
if (exp_time - current_time) >= 300:
|
|
196
218
|
return token
|
|
197
219
|
|
|
220
|
+
# Get token
|
|
198
221
|
return credential.get_token(TokenScope.DEFAULT_AZURE_MANAGEMENT).token
|
|
@@ -152,6 +152,7 @@ class ManagedIdentityAPITokenManager(APITokenManager):
|
|
|
152
152
|
):
|
|
153
153
|
self.last_refresh_time = time.time()
|
|
154
154
|
get_token_method = self.credential.get_token(self.token_scope.value)
|
|
155
|
+
|
|
155
156
|
if inspect.isawaitable(get_token_method):
|
|
156
157
|
# If it's awaitable, await it
|
|
157
158
|
token_response: AccessToken = await get_token_method
|
|
@@ -12,7 +12,7 @@ from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_cl
|
|
|
12
12
|
from azure.ai.evaluation._user_agent import UserAgentSingleton
|
|
13
13
|
from azure.core.exceptions import HttpResponseError, ServiceResponseError
|
|
14
14
|
from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
|
|
15
|
-
from azure.ai.evaluation._common.onedp._client import AIProjectClient
|
|
15
|
+
from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
|
|
16
16
|
from azure.ai.evaluation._common.onedp.models import SimulationDTO
|
|
17
17
|
from azure.ai.evaluation._common.constants import RAIService
|
|
18
18
|
|
|
@@ -7,7 +7,7 @@ from ast import literal_eval
|
|
|
7
7
|
from typing_extensions import NotRequired
|
|
8
8
|
|
|
9
9
|
from azure.ai.evaluation._model_configurations import AzureAIProject
|
|
10
|
-
from azure.ai.evaluation._common.onedp._client import AIProjectClient
|
|
10
|
+
from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
|
|
11
11
|
from azure.ai.evaluation.simulator._adversarial_scenario import AdversarialScenario
|
|
12
12
|
|
|
13
13
|
from ._rai_client import RAIClient
|
|
@@ -12,7 +12,7 @@ from abc import ABC, abstractmethod
|
|
|
12
12
|
from collections import deque
|
|
13
13
|
from typing import Deque, Dict, List, Optional, Union
|
|
14
14
|
from urllib.parse import urlparse
|
|
15
|
-
from azure.ai.evaluation._common.onedp._client import AIProjectClient
|
|
15
|
+
from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
|
|
16
16
|
from ._rai_client import RAIClient
|
|
17
17
|
|
|
18
18
|
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: azure-ai-evaluation
|
|
3
|
-
Version: 1.12.0
|
|
3
|
+
Version: 1.13.0
|
|
4
4
|
Summary: Microsoft Azure Evaluation Library for Python
|
|
5
5
|
Home-page: https://github.com/Azure/azure-sdk-for-python
|
|
6
6
|
Author: Microsoft Corporation
|
|
@@ -22,20 +22,25 @@ Requires-Python: >=3.9
|
|
|
22
22
|
Description-Content-Type: text/markdown
|
|
23
23
|
License-File: NOTICE.txt
|
|
24
24
|
Requires-Dist: pyjwt>=2.8.0
|
|
25
|
-
Requires-Dist: azure-identity>=1.
|
|
26
|
-
Requires-Dist: azure-core>=1.
|
|
25
|
+
Requires-Dist: azure-identity>=1.19.0
|
|
26
|
+
Requires-Dist: azure-core>=1.31.0
|
|
27
27
|
Requires-Dist: nltk>=3.9.1
|
|
28
|
-
Requires-Dist: azure-storage-blob>=12.
|
|
29
|
-
Requires-Dist: httpx>=0.
|
|
30
|
-
Requires-Dist: pandas<3.0.0,>=2.1.2
|
|
28
|
+
Requires-Dist: azure-storage-blob>=12.19.0
|
|
29
|
+
Requires-Dist: httpx>=0.27.2
|
|
30
|
+
Requires-Dist: pandas<3.0.0,>=2.1.2; python_version < "3.13"
|
|
31
|
+
Requires-Dist: pandas<3.0.0,>=2.2.3; python_version == "3.13"
|
|
32
|
+
Requires-Dist: pandas<3.0.0,>=2.3.3; python_version >= "3.14"
|
|
31
33
|
Requires-Dist: openai>=1.108.0
|
|
32
34
|
Requires-Dist: ruamel.yaml<1.0.0,>=0.17.10
|
|
33
35
|
Requires-Dist: msrest>=0.6.21
|
|
34
36
|
Requires-Dist: Jinja2>=3.1.6
|
|
35
37
|
Requires-Dist: aiohttp>=3.0
|
|
36
38
|
Provides-Extra: redteam
|
|
37
|
-
Requires-Dist: pyrit==0.8.1; extra == "redteam"
|
|
38
|
-
Requires-Dist: duckdb==1.3.2; extra == "redteam"
|
|
39
|
+
Requires-Dist: pyrit==0.8.1; python_version >= "3.10" and extra == "redteam"
|
|
40
|
+
Requires-Dist: duckdb==1.3.2; python_version >= "3.10" and extra == "redteam"
|
|
41
|
+
Provides-Extra: opentelemetry
|
|
42
|
+
Requires-Dist: opentelemetry-sdk>=1.17.0; extra == "opentelemetry"
|
|
43
|
+
Requires-Dist: azure-monitor-opentelemetry-exporter>=1.0.0b17; extra == "opentelemetry"
|
|
39
44
|
Dynamic: author
|
|
40
45
|
Dynamic: author-email
|
|
41
46
|
Dynamic: classifier
|
|
@@ -413,6 +418,25 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
|
|
|
413
418
|
|
|
414
419
|
# Release History
|
|
415
420
|
|
|
421
|
+
## 1.13.0 (2025-10-30)
|
|
422
|
+
|
|
423
|
+
### Features Added
|
|
424
|
+
|
|
425
|
+
- Updated `IndirectAttack` risk category for RedTeam to `IndirectJailbreak` to better reflect its purpose. This change allows users to apply cross-domain prompt injection (XPIA) attack strategies across all risk categories, enabling more comprehensive security testing of AI systems against indirect prompt injection attacks during red teaming.
|
|
426
|
+
- Added `TaskAdherence`, `SensitiveDataLeakage`, and `ProhibitedActions` as cloud-only agent safety risk categories for red teaming.
|
|
427
|
+
- Updated all evaluators' output to be of the following schema:
|
|
428
|
+
- `gpt_{evaluator_name}`, `{evaluator_name}`: float score,
|
|
429
|
+
- `{evaluator_name}_result`: pass/fail based on threshold,
|
|
430
|
+
- `{evaluator_name}_reason`, `{evaluator_name}_threshold`
|
|
431
|
+
- `{evaluator_name}_prompt_tokens`, `{evaluator_name}_completion_tokens`, `{evaluator_name}_total_tokens`, `{evaluator_name}_finish_reason`
|
|
432
|
+
- `{evaluator_name}_model`: model used for evaluation
|
|
433
|
+
- `{evaluator_name}_sample_input`, `{evaluator_name}_sample_output`: input and output used for evaluation
|
|
434
|
+
|
|
435
|
+
This change standardizes the output format across all evaluators and follows OTel convention.
|
|
436
|
+
|
|
437
|
+
### Bugs Fixed
|
|
438
|
+
|
|
439
|
+
- `image_tag` parameter in `AzureOpenAIPythonGrader` is now optional.
|
|
416
440
|
|
|
417
441
|
## 1.12.0 (2025-10-02)
|
|
418
442
|
|
|
@@ -423,6 +447,12 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
|
|
|
423
447
|
### Bugs Fixed
|
|
424
448
|
- Support for multi-level nesting in OpenAI grader (experimental)
|
|
425
449
|
|
|
450
|
+
## 1.11.2 (2025-10-09)
|
|
451
|
+
|
|
452
|
+
### Bugs Fixed
|
|
453
|
+
|
|
454
|
+
- **kwargs in an evaluator signature receives input columns that are not otherwise named in the evaluator's signature
|
|
455
|
+
|
|
426
456
|
## 1.11.1 (2025-09-19)
|
|
427
457
|
|
|
428
458
|
### Bugs Fixed
|