azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
  3. azure/ai/evaluation/_aoai/label_grader.py +6 -10
  4. azure/ai/evaluation/_aoai/python_grader.py +7 -10
  5. azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
  6. azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +241 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -2
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
  33. azure/ai/evaluation/_evaluate/_utils.py +10 -3
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
  38. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  39. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
  40. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  41. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  42. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  43. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
  44. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  45. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  46. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  47. azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
  48. azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
  49. azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
  50. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
  52. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  53. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  55. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  56. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  57. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  58. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  59. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  60. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  61. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  62. azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
  63. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  64. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  65. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  66. azure/ai/evaluation/_exceptions.py +6 -1
  67. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  68. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  69. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  70. azure/ai/evaluation/_model_configurations.py +26 -0
  71. azure/ai/evaluation/_version.py +1 -1
  72. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  73. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  74. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  75. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  76. azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
  77. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  78. azure/ai/evaluation/red_team/_red_team.py +494 -37
  79. azure/ai/evaluation/red_team/_red_team_result.py +48 -28
  80. azure/ai/evaluation/red_team/_result_processor.py +558 -29
  81. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  82. azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
  83. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  84. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  85. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  86. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  87. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  88. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  90. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  91. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  92. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  94. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  95. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
  96. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
  97. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  98. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  99. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -39,6 +39,7 @@ ATTACK_STRATEGY_COMPLEXITY_MAP = {
39
39
  str(AttackStrategy.MODERATE.value): "moderate",
40
40
  str(AttackStrategy.DIFFICULT.value): "difficult",
41
41
  str(AttackStrategy.Jailbreak.value): "easy",
42
+ str(AttackStrategy.IndirectJailbreak.value): "easy",
42
43
  str(AttackStrategy.MultiTurn.value): "difficult",
43
44
  str(AttackStrategy.Crescendo.value): "difficult",
44
45
  }
@@ -15,17 +15,26 @@ from .._attack_strategy import AttackStrategy
15
15
  from .._red_team_result import RedTeamResult
16
16
 
17
17
 
18
- def message_to_dict(message: ChatMessage, context: str = None) -> Dict[str, str]:
18
+ def message_to_dict(
19
+ message: ChatMessage, context: str = None, tool_calls: List[Any] = None, token_usage: Dict[str, Any] = None
20
+ ) -> Dict[str, Any]:
19
21
  """Convert a ChatMessage and context to dictionary format.
20
22
 
21
23
  :param message: The chat message to convert
22
24
  :type message: ChatMessage
23
25
  :param context: Additional context to include in the dictionary
24
26
  :type context: str
27
+ :param tool_calls: List of tool calls to include in the dictionary
28
+ :type tool_calls: List[Any]
29
+ :param token_usage: Token usage information from the callback
30
+ :type token_usage: Dict[str, Any]
25
31
  :return: Dictionary representation with role and content
26
- :rtype: Dict[str, str]
32
+ :rtype: Dict[str, Any]
27
33
  """
28
- return {"role": message.role, "content": message.content, "context": context}
34
+ msg_dict = {"role": message.role, "content": message.content, "context": context, "tool_calls": tool_calls}
35
+ if token_usage:
36
+ msg_dict["token_usage"] = token_usage
37
+ return msg_dict
29
38
 
30
39
 
31
40
  def get_strategy_name(attack_strategy: Union[AttackStrategy, List[AttackStrategy]]) -> str:
@@ -164,6 +173,82 @@ def list_mean_nan_safe(data_list: List[Any]) -> float:
164
173
  return sum(filtered_list) / len(filtered_list)
165
174
 
166
175
 
176
+ def format_content_by_modality(content: str, modality: str) -> str:
177
+ """Format content based on the specified modality, returning one random format.
178
+
179
+ :param content: The content to format
180
+ :type content: str
181
+ :param modality: The modality type (html, markdown, footnote, code, text)
182
+ :type modality: str
183
+ :return: Formatted content
184
+ :rtype: str
185
+ """
186
+ import html
187
+ import random
188
+
189
+ def format_as_html(text):
190
+ html_text = html.escape(text)
191
+ return [
192
+ f"<p>{html_text}</p>",
193
+ f'<div style="display:none">{html_text}</div>',
194
+ f'<span style="display:none">{html_text}</span>',
195
+ f"<div hidden>{html_text}</div>",
196
+ f'<meta content="{html_text}">',
197
+ ]
198
+
199
+ def format_as_markdown(text):
200
+ markdown_text = text.replace("\n", " \n") # Convert newlines to Markdown line breaks
201
+ return [
202
+ f"\n**{markdown_text}**\n", # Bold
203
+ f"\n*{markdown_text}*\n", # Italic
204
+ f"\n`{markdown_text}`\n", # Inline code
205
+ f"\n```{markdown_text}```\n", # Code block
206
+ f"\n[Link]({markdown_text})\n", # Link
207
+ ]
208
+
209
+ def format_as_footnote(text):
210
+ footnote_text = text.replace("\n", " ")
211
+ return [
212
+ f"[^{footnote_text}]", # Footnote reference
213
+ f"\n[1] {footnote_text}\n", # Footnote reference with number
214
+ f"<sup>{footnote_text}</sup>", # Superscript
215
+ f'<span class="footnote">{footnote_text}</span>', # Custom footnote span
216
+ f'<div class="footnote">{footnote_text}</div>', # Custom footnote div
217
+ ]
218
+
219
+ def format_as_code(text):
220
+ code_text = text.replace("\n", " ")
221
+ return [
222
+ f"`{code_text}`", # Inline code
223
+ f"```\n{code_text}\n```", # Code block
224
+ f'"""\n{code_text}\n"""', # Code block
225
+ f"# {code_text}", # Inline comment
226
+ f'def function():\n print("{code_text}")', # Function call
227
+ ]
228
+
229
+ def format_as_text(text):
230
+ return [f"<document>{text}</document>"] # Return text in document tags
231
+
232
+ # Mapping of modality types to formatting functions
233
+ modality_formatters = {
234
+ "html": format_as_html,
235
+ "markdown": format_as_markdown,
236
+ "footnote": format_as_footnote,
237
+ "code": format_as_code,
238
+ "text": format_as_text,
239
+ }
240
+
241
+ # Get formatter based on modality type
242
+ if modality and modality.lower() in modality_formatters:
243
+ formatter = modality_formatters[modality.lower()]
244
+ formats = formatter(content)
245
+ # Return one random format from the available options
246
+ return random.choice(formats)
247
+ else:
248
+ # Return plain text if modality not recognized
249
+ return content
250
+
251
+
167
252
  def write_pyrit_outputs_to_file(
168
253
  *,
169
254
  output_path: str,
@@ -194,7 +279,13 @@ def write_pyrit_outputs_to_file(
194
279
 
195
280
  conversations = [
196
281
  [
197
- (item.to_chat_message(), prompt_to_context.get(item.original_value, "") or item.labels.get("context", ""))
282
+ (
283
+ item.to_chat_message(),
284
+ prompt_to_context.get(item.original_value, "") or item.labels.get("context", ""),
285
+ item.labels.get("tool_calls", []),
286
+ item.labels.get("risk_sub_type"),
287
+ item.labels.get("token_usage"),
288
+ )
198
289
  for item in group
199
290
  ]
200
291
  for conv_id, group in itertools.groupby(prompts_request_pieces, key=lambda x: x.conversation_id)
@@ -217,16 +308,22 @@ def write_pyrit_outputs_to_file(
217
308
  if conversation[0][0].role == "system":
218
309
  # Skip system messages in the output
219
310
  continue
220
- json_lines += (
221
- json.dumps(
222
- {
223
- "conversation": {
224
- "messages": [message_to_dict(message[0], message[1]) for message in conversation]
225
- }
226
- }
227
- )
228
- + "\n"
229
- )
311
+ conv_dict = {
312
+ "conversation": {
313
+ "messages": [
314
+ message_to_dict(
315
+ message[0], message[1], message[2], message[4] if len(message) > 4 else None
316
+ )
317
+ for message in conversation
318
+ ]
319
+ }
320
+ }
321
+ # Add risk_sub_type if present (check first message for the label)
322
+ if conversation and len(conversation) > 0 and len(conversation[0]) > 3:
323
+ risk_sub_type = conversation[0][3]
324
+ if risk_sub_type:
325
+ conv_dict["risk_sub_type"] = risk_sub_type
326
+ json_lines += json.dumps(conv_dict) + "\n"
230
327
  with Path(output_path).open("w") as f:
231
328
  f.writelines(json_lines)
232
329
  logger.debug(
@@ -248,16 +345,20 @@ def write_pyrit_outputs_to_file(
248
345
  if conversation[0][0].role == "system":
249
346
  # Skip system messages in the output
250
347
  continue
251
- json_lines += (
252
- json.dumps(
253
- {
254
- "conversation": {
255
- "messages": [message_to_dict(message[0], message[1]) for message in conversation]
256
- }
257
- }
258
- )
259
- + "\n"
260
- )
348
+ conv_dict = {
349
+ "conversation": {
350
+ "messages": [
351
+ message_to_dict(message[0], message[1], message[2], message[4] if len(message) > 4 else None)
352
+ for message in conversation
353
+ ]
354
+ }
355
+ }
356
+ # Add risk_sub_type if present (check first message for the label)
357
+ if conversation and len(conversation) > 0 and len(conversation[0]) > 3:
358
+ risk_sub_type = conversation[0][3]
359
+ if risk_sub_type:
360
+ conv_dict["risk_sub_type"] = risk_sub_type
361
+ json_lines += json.dumps(conv_dict) + "\n"
261
362
  with Path(output_path).open("w") as f:
262
363
  f.writelines(json_lines)
263
364
  logger.debug(f"Successfully wrote {len(conversations)} conversations to {output_path}")
@@ -15,9 +15,11 @@ RISK_CATEGORY_METRIC_MAP = {
15
15
  RiskCategory.SelfHarm: EvaluationMetrics.SELF_HARM,
16
16
  RiskCategory.ProtectedMaterial: EvaluationMetrics.PROTECTED_MATERIAL,
17
17
  RiskCategory.UngroundedAttributes: EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
18
- RiskCategory.IndirectAttack: EvaluationMetrics.XPIA,
19
18
  _InternalRiskCategory.ECI: _InternalEvaluationMetrics.ECI,
20
19
  RiskCategory.CodeVulnerability: EvaluationMetrics.CODE_VULNERABILITY,
20
+ RiskCategory.SensitiveDataLeakage: EvaluationMetrics.SENSITIVE_DATA_LEAKAGE,
21
+ RiskCategory.TaskAdherence: EvaluationMetrics.TASK_ADHERENCE,
22
+ RiskCategory.ProhibitedActions: EvaluationMetrics.PROHIBITED_ACTIONS,
21
23
  }
22
24
 
23
25
  RISK_CATEGORY_ANNOTATION_TASK_MAP = {
@@ -27,9 +29,11 @@ RISK_CATEGORY_ANNOTATION_TASK_MAP = {
27
29
  RiskCategory.SelfHarm: Tasks.CONTENT_HARM,
28
30
  RiskCategory.ProtectedMaterial: Tasks.PROTECTED_MATERIAL,
29
31
  RiskCategory.UngroundedAttributes: Tasks.UNGROUNDED_ATTRIBUTES,
30
- RiskCategory.IndirectAttack: Tasks.XPIA,
31
32
  _InternalRiskCategory.ECI: _InternalAnnotationTasks.ECI,
32
33
  RiskCategory.CodeVulnerability: Tasks.CODE_VULNERABILITY,
34
+ RiskCategory.SensitiveDataLeakage: Tasks.SENSITIVE_DATA_LEAKAGE,
35
+ RiskCategory.TaskAdherence: Tasks.TASK_ADHERENCE,
36
+ RiskCategory.ProhibitedActions: Tasks.PROHIBITED_ACTIONS,
33
37
  }
34
38
 
35
39
 
@@ -63,8 +67,7 @@ def get_attack_objective_from_risk_category(risk_category: Union[RiskCategory])
63
67
  :return: The corresponding attack objective string
64
68
  :rtype: str
65
69
  """
66
- return (
67
- "isa"
68
- if risk_category == RiskCategory.UngroundedAttributes
69
- else "xpia" if risk_category == RiskCategory.IndirectAttack else risk_category.value
70
- )
70
+ if risk_category == RiskCategory.UngroundedAttributes:
71
+ return "isa"
72
+ else:
73
+ return risk_category.value
@@ -68,6 +68,7 @@ def strategy_converter_map() -> Dict[Any, Union[PromptConverter, List[PromptConv
68
68
  AttackStrategy.Jailbreak: None,
69
69
  AttackStrategy.MultiTurn: None,
70
70
  AttackStrategy.Crescendo: None,
71
+ AttackStrategy.IndirectJailbreak: None,
71
72
  }
72
73
 
73
74
 
@@ -89,14 +90,11 @@ def get_converter_for_strategy(
89
90
 
90
91
  def get_chat_target(
91
92
  target: Union[PromptChatTarget, Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
92
- prompt_to_context: Optional[Dict[str, str]] = None,
93
93
  ) -> PromptChatTarget:
94
94
  """Convert various target types to a PromptChatTarget.
95
95
 
96
96
  :param target: The target to convert
97
97
  :type target: Union[PromptChatTarget, Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
98
- :param prompt_to_context: Optional mapping from prompt content to context
99
- :type prompt_to_context: Optional[Dict[str, str]]
100
98
  :return: A PromptChatTarget instance
101
99
  :rtype: PromptChatTarget
102
100
  """
@@ -154,7 +152,7 @@ def get_chat_target(
154
152
  has_callback_signature = False
155
153
 
156
154
  if has_callback_signature:
157
- chat_target = _CallbackChatTarget(callback=target, prompt_to_context=prompt_to_context)
155
+ chat_target = _CallbackChatTarget(callback=target)
158
156
  else:
159
157
 
160
158
  async def callback_target(
@@ -190,26 +188,6 @@ def get_chat_target(
190
188
  messages_list.append(formatted_response) # type: ignore
191
189
  return {"messages": messages_list, "stream": stream, "session_state": session_state, "context": {}}
192
190
 
193
- chat_target = _CallbackChatTarget(callback=callback_target, prompt_to_context=prompt_to_context) # type: ignore
191
+ chat_target = _CallbackChatTarget(callback=callback_target) # type: ignore
194
192
 
195
193
  return chat_target
196
-
197
-
198
- def get_orchestrators_for_attack_strategies(
199
- attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]]
200
- ) -> List[Callable]:
201
- """
202
- Gets a list of orchestrator functions to use based on the attack strategies.
203
-
204
- :param attack_strategies: The list of attack strategies
205
- :type attack_strategies: List[Union[AttackStrategy, List[AttackStrategy]]]
206
- :return: A list of orchestrator functions
207
- :rtype: List[Callable]
208
- """
209
- call_to_orchestrators = []
210
-
211
- # Since we're just returning one orchestrator type for now, simplify the logic
212
- # This can be expanded later if different orchestrators are needed for different strategies
213
- return [
214
- lambda chat_target, all_prompts, converter, strategy_name, risk_category: None
215
- ] # This will be replaced with the actual orchestrator function in the main class
@@ -14,7 +14,7 @@ from tqdm import tqdm
14
14
 
15
15
  from azure.ai.evaluation._common._experimental import experimental
16
16
  from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
17
- from azure.ai.evaluation._common.onedp._client import AIProjectClient
17
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
18
18
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
19
19
  from azure.ai.evaluation._http_utils import get_async_http_client
20
20
  from azure.ai.evaluation._model_configurations import AzureAIProject
@@ -15,7 +15,7 @@ import jinja2
15
15
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
16
16
  from azure.ai.evaluation._http_utils import AsyncHttpPipeline
17
17
  from .._model_tools import LLMBase, OpenAIChatCompletionsModel, RAIClient
18
- from azure.ai.evaluation._common.onedp._client import AIProjectClient
18
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
19
19
  from .._model_tools._template_handler import TemplateParameters
20
20
  from .constants import ConversationRole
21
21
 
@@ -11,7 +11,7 @@ from azure.ai.evaluation.simulator._constants import SupportedLanguages
11
11
  from azure.ai.evaluation.simulator._helpers._language_suffix_mapping import SUPPORTED_LANGUAGES_MAPPING
12
12
  from ..._http_utils import AsyncHttpPipeline
13
13
  from . import ConversationBot, ConversationTurn
14
- from azure.ai.evaluation._common.onedp._client import AIProjectClient
14
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
15
15
 
16
16
 
17
17
  def is_closing_message(response: Union[Dict, str], recursion_depth: int = 0) -> bool:
@@ -13,7 +13,7 @@ from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_oned
13
13
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
14
14
  from azure.ai.evaluation.simulator import AdversarialScenario
15
15
  from azure.ai.evaluation._model_configurations import AzureAIProject
16
- from azure.ai.evaluation._common.onedp._client import AIProjectClient
16
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
17
17
  from azure.core.credentials import TokenCredential
18
18
 
19
19
  from ._adversarial_simulator import AdversarialSimulator
@@ -15,7 +15,7 @@ from azure.ai.evaluation._common._experimental import experimental
15
15
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
16
16
  from azure.ai.evaluation.simulator import AdversarialScenarioJailbreak, SupportedLanguages
17
17
  from azure.ai.evaluation._model_configurations import AzureAIProject
18
- from azure.ai.evaluation._common.onedp._client import AIProjectClient
18
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
19
19
  from azure.core.credentials import TokenCredential
20
20
  from azure.ai.evaluation._constants import TokenScope
21
21
 
@@ -12,7 +12,7 @@ from azure.ai.evaluation.simulator._model_tools import ManagedIdentityAPITokenMa
12
12
  from azure.ai.evaluation._common.raiclient import MachineLearningServicesClient
13
13
  from azure.ai.evaluation._constants import TokenScope
14
14
  from azure.ai.evaluation._common.utils import is_onedp_project
15
- from azure.ai.evaluation._common.onedp import AIProjectClient
15
+ from azure.ai.evaluation._common.onedp import ProjectsClient as AIProjectClient
16
16
  from azure.ai.evaluation._common import EvaluationServiceOneDPClient
17
17
  from azure.ai.evaluation._user_agent import UserAgentSingleton
18
18
  import jwt
@@ -113,6 +113,8 @@ class GeneratedRAIClient:
113
113
  strategy: Optional[str] = None,
114
114
  language: str = "en",
115
115
  scan_session_id: Optional[str] = None,
116
+ target: Optional[str] = None,
117
+ client_id: Optional[str] = None,
116
118
  ) -> Dict:
117
119
  """Get attack objectives using the auto-generated operations.
118
120
 
@@ -128,18 +130,38 @@ class GeneratedRAIClient:
128
130
  :type language: str
129
131
  :param scan_session_id: Optional unique session ID for the scan
130
132
  :type scan_session_id: Optional[str]
133
+ :param target: Optional target type (model/agent)
134
+ :type target: Optional[str]
135
+ :param client_id: Optional client ID for ACA token authorization
136
+ :type client_id: Optional[str]
131
137
  :return: The attack objectives
132
138
  :rtype: Dict
133
139
  """
134
140
  try:
141
+ # Build headers dictionary
142
+ headers = {}
143
+ if scan_session_id:
144
+ headers["x-ms-client-request-id"] = scan_session_id
145
+ if client_id:
146
+ from azure.identity import DefaultAzureCredential
147
+
148
+ # Get token using the client_id for managed identity
149
+ managed_identity_credential = DefaultAzureCredential(
150
+ managed_identity_client_id=client_id, exclude_interactive_browser_credential=True
151
+ )
152
+ token = managed_identity_credential.get_token(TokenScope.DEFAULT_AZURE_MANAGEMENT).token
153
+ headers["aml-aca-token"] = token
154
+
135
155
  # Send the request using the autogenerated client
136
156
  response = self._client.get_attack_objectives(
137
157
  risk_types=[risk_type],
138
158
  risk_category=risk_category,
139
159
  lang=language,
140
160
  strategy=strategy,
141
- headers={"x-ms-client-request-id": scan_session_id},
161
+ target_type=target,
162
+ headers=headers,
142
163
  )
164
+
143
165
  return response
144
166
 
145
167
  except Exception as e:
@@ -195,4 +217,5 @@ class GeneratedRAIClient:
195
217
  if (exp_time - current_time) >= 300:
196
218
  return token
197
219
 
220
+ # Get token
198
221
  return credential.get_token(TokenScope.DEFAULT_AZURE_MANAGEMENT).token
@@ -152,6 +152,7 @@ class ManagedIdentityAPITokenManager(APITokenManager):
152
152
  ):
153
153
  self.last_refresh_time = time.time()
154
154
  get_token_method = self.credential.get_token(self.token_scope.value)
155
+
155
156
  if inspect.isawaitable(get_token_method):
156
157
  # If it's awaitable, await it
157
158
  token_response: AccessToken = await get_token_method
@@ -12,7 +12,7 @@ from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_cl
12
12
  from azure.ai.evaluation._user_agent import UserAgentSingleton
13
13
  from azure.core.exceptions import HttpResponseError, ServiceResponseError
14
14
  from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
15
- from azure.ai.evaluation._common.onedp._client import AIProjectClient
15
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
16
16
  from azure.ai.evaluation._common.onedp.models import SimulationDTO
17
17
  from azure.ai.evaluation._common.constants import RAIService
18
18
 
@@ -7,7 +7,7 @@ from ast import literal_eval
7
7
  from typing_extensions import NotRequired
8
8
 
9
9
  from azure.ai.evaluation._model_configurations import AzureAIProject
10
- from azure.ai.evaluation._common.onedp._client import AIProjectClient
10
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
11
11
  from azure.ai.evaluation.simulator._adversarial_scenario import AdversarialScenario
12
12
 
13
13
  from ._rai_client import RAIClient
@@ -12,7 +12,7 @@ from abc import ABC, abstractmethod
12
12
  from collections import deque
13
13
  from typing import Deque, Dict, List, Optional, Union
14
14
  from urllib.parse import urlparse
15
- from azure.ai.evaluation._common.onedp._client import AIProjectClient
15
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
16
16
  from ._rai_client import RAIClient
17
17
 
18
18
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: azure-ai-evaluation
3
- Version: 1.12.0
3
+ Version: 1.13.0
4
4
  Summary: Microsoft Azure Evaluation Library for Python
5
5
  Home-page: https://github.com/Azure/azure-sdk-for-python
6
6
  Author: Microsoft Corporation
@@ -22,20 +22,25 @@ Requires-Python: >=3.9
22
22
  Description-Content-Type: text/markdown
23
23
  License-File: NOTICE.txt
24
24
  Requires-Dist: pyjwt>=2.8.0
25
- Requires-Dist: azure-identity>=1.16.0
26
- Requires-Dist: azure-core>=1.30.2
25
+ Requires-Dist: azure-identity>=1.19.0
26
+ Requires-Dist: azure-core>=1.31.0
27
27
  Requires-Dist: nltk>=3.9.1
28
- Requires-Dist: azure-storage-blob>=12.10.0
29
- Requires-Dist: httpx>=0.25.1
30
- Requires-Dist: pandas<3.0.0,>=2.1.2
28
+ Requires-Dist: azure-storage-blob>=12.19.0
29
+ Requires-Dist: httpx>=0.27.2
30
+ Requires-Dist: pandas<3.0.0,>=2.1.2; python_version < "3.13"
31
+ Requires-Dist: pandas<3.0.0,>=2.2.3; python_version == "3.13"
32
+ Requires-Dist: pandas<3.0.0,>=2.3.3; python_version >= "3.14"
31
33
  Requires-Dist: openai>=1.108.0
32
34
  Requires-Dist: ruamel.yaml<1.0.0,>=0.17.10
33
35
  Requires-Dist: msrest>=0.6.21
34
36
  Requires-Dist: Jinja2>=3.1.6
35
37
  Requires-Dist: aiohttp>=3.0
36
38
  Provides-Extra: redteam
37
- Requires-Dist: pyrit==0.8.1; extra == "redteam"
38
- Requires-Dist: duckdb==1.3.2; extra == "redteam"
39
+ Requires-Dist: pyrit==0.8.1; python_version >= "3.10" and extra == "redteam"
40
+ Requires-Dist: duckdb==1.3.2; python_version >= "3.10" and extra == "redteam"
41
+ Provides-Extra: opentelemetry
42
+ Requires-Dist: opentelemetry-sdk>=1.17.0; extra == "opentelemetry"
43
+ Requires-Dist: azure-monitor-opentelemetry-exporter>=1.0.0b17; extra == "opentelemetry"
39
44
  Dynamic: author
40
45
  Dynamic: author-email
41
46
  Dynamic: classifier
@@ -413,6 +418,25 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
413
418
 
414
419
  # Release History
415
420
 
421
+ ## 1.13.0 (2025-10-30)
422
+
423
+ ### Features Added
424
+
425
+ - Updated `IndirectAttack` risk category for RedTeam to `IndirectJailbreak` to better reflect its purpose. This change allows users to apply cross-domain prompt injection (XPIA) attack strategies across all risk categories, enabling more comprehensive security testing of AI systems against indirect prompt injection attacks during red teaming.
426
+ - Added `TaskAdherence`, `SensitiveDataLeakage`, and `ProhibitedActions` as cloud-only agent safety risk categories for red teaming.
427
+ - Updated all evaluators' output to be of the following schema:
428
+ - `gpt_{evaluator_name}`, `{evaluator_name}`: float score,
429
+ - `{evaluator_name}_result`: pass/fail based on threshold,
430
+ - `{evaluator_name}_reason`, `{evaluator_name}_threshold`
431
+ - `{evaluator_name}_prompt_tokens`, `{evaluator_name}_completion_tokens`, `{evaluator_name}_total_tokens`, `{evaluator_name}_finish_reason`
432
+ - `{evaluator_name}_model`: model used for evaluation
433
+ - `{evaluator_name}_sample_input`, `{evaluator_name}_sample_output`: input and output used for evaluation
434
+
435
+ This change standardizes the output format across all evaluators and follows OTel convention.
436
+
437
+ ### Bugs Fixed
438
+
439
+ - `image_tag` parameter in `AzureOpenAIPythonGrader` is now optional.
416
440
 
417
441
  ## 1.12.0 (2025-10-02)
418
442
 
@@ -423,6 +447,12 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
423
447
  ### Bugs Fixed
424
448
  - Support for multi-level nesting in OpenAI grader (experimental)
425
449
 
450
+ ## 1.11.2 (2025-10-09)
451
+
452
+ ### Bugs Fixed
453
+
454
+ - **kwargs in an evaluator signature receives input columns that are not otherwise named in the evaluator's signature
455
+
426
456
  ## 1.11.1 (2025-09-19)
427
457
 
428
458
  ### Bugs Fixed