azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +60 -54
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +21 -21
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1286 -739
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +32 -32
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  114. azure/ai/evaluation/red_team/_utils/constants.py +2 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +26 -15
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +10 -8
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +15 -1
  132. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -131
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
@@ -49,6 +49,7 @@ JAILBREAK_EXT = "_Jailbreak"
49
49
  DATA_EXT = "_Data.jsonl"
50
50
  RESULTS_EXT = "_Results.jsonl"
51
51
 
52
+
52
53
  def _setup_logger():
53
54
  """Configure and return a logger instance for the CustomAdversarialSimulator.
54
55
 
@@ -115,7 +116,6 @@ class _SafetyEvaluation:
115
116
  self.credential = credential
116
117
  self.logger = _setup_logger()
117
118
 
118
-
119
119
  @staticmethod
120
120
  def _validate_model_config(model_config: Any):
121
121
  """
@@ -158,7 +158,9 @@ class _SafetyEvaluation:
158
158
  max_simulation_results: int = 3,
159
159
  conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
160
160
  tasks: List[str] = [],
161
- adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]] = None,
161
+ adversarial_scenario: Optional[
162
+ Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]
163
+ ] = None,
162
164
  source_text: Optional[str] = None,
163
165
  direct_attack: bool = False,
164
166
  randomization_seed: Optional[int] = None,
@@ -185,47 +187,53 @@ class _SafetyEvaluation:
185
187
  :type direct_attack: bool
186
188
  """
187
189
 
188
- ## Define callback
189
- async def callback(
190
- messages: List[Dict],
191
- stream: bool = False,
192
- session_state: Optional[str] = None,
193
- context: Optional[Dict] = None,
194
- ) -> dict:
195
- messages_list = messages["messages"] # type: ignore
196
- latest_message = messages_list[-1]
197
- application_input = latest_message["content"]
198
- context = latest_message.get("context", None)
199
- latest_context = None
200
- try:
201
- is_async = self._is_async_function(target)
202
- if self._check_target_returns_context(target):
203
- if is_async:
204
- response, latest_context = await target(query=application_input)
205
- else:
206
- response, latest_context = target(query=application_input)
207
- else:
208
- if is_async:
209
- response = await target(query=application_input)
190
+ ## Check if target is already a callback-style function
191
+ if self._check_target_is_callback(target):
192
+ # Use the target directly as it's already a callback
193
+ callback = target
194
+ else:
195
+ # Define callback wrapper for simple targets
196
+ async def callback(
197
+ messages: List[Dict],
198
+ stream: bool = False,
199
+ session_state: Optional[str] = None,
200
+ context: Optional[Dict] = None,
201
+ ) -> dict:
202
+ messages_list = messages["messages"] # type: ignore
203
+ latest_message = messages_list[-1]
204
+ application_input = latest_message["content"]
205
+ context = latest_message.get("context", None)
206
+ latest_context = None
207
+ try:
208
+ is_async = self._is_async_function(target)
209
+ if self._check_target_returns_context(target):
210
+ if is_async:
211
+ response, latest_context = await target(query=application_input)
212
+ else:
213
+ response, latest_context = target(query=application_input)
210
214
  else:
211
- response = target(query=application_input)
212
- except Exception as e:
213
- response = f"Something went wrong {e!s}"
214
-
215
- ## We format the response to follow the openAI chat protocol format
216
- formatted_response = {
217
- "content": response,
218
- "role": "assistant",
219
- "context": latest_context if latest_context else context,
220
- }
221
- ## NOTE: In the future, instead of appending to messages we should just return `formatted_response`
222
- messages["messages"].append(formatted_response) # type: ignore
223
- return {
224
- "messages": messages_list,
225
- "stream": stream,
226
- "session_state": session_state,
227
- "context": latest_context if latest_context else context,
228
- }
215
+ if is_async:
216
+ response = await target(query=application_input)
217
+ else:
218
+ response = target(query=application_input)
219
+ except Exception as e:
220
+ response = f"Something went wrong {e!s}"
221
+
222
+ ## We format the response to follow the openAI chat protocol
223
+ formatted_response = {
224
+ "content": response,
225
+ "role": "assistant",
226
+ "context": latest_context if latest_context else context,
227
+ }
228
+ ## NOTE: In the future, instead of appending to messages we
229
+ ## should just return `formatted_response`
230
+ messages["messages"].append(formatted_response) # type: ignore
231
+ return {
232
+ "messages": messages_list,
233
+ "stream": stream,
234
+ "session_state": session_state,
235
+ "context": latest_context if latest_context else context,
236
+ }
229
237
 
230
238
  ## Run simulator
231
239
  simulator = None
@@ -248,7 +256,7 @@ class _SafetyEvaluation:
248
256
  text=source_text,
249
257
  target=callback,
250
258
  randomization_seed=randomization_seed,
251
- concurrent_async_task=concurrent_async_tasks
259
+ concurrent_async_task=concurrent_async_tasks,
252
260
  )
253
261
 
254
262
  # if DirectAttack, run DirectAttackSimulator
@@ -291,14 +299,14 @@ class _SafetyEvaluation:
291
299
  )
292
300
  simulator = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
293
301
  simulator_outputs = await simulator(
294
- scenario=adversarial_scenario, #type: ignore
302
+ scenario=adversarial_scenario, # type: ignore
295
303
  max_conversation_turns=max_conversation_turns,
296
304
  max_simulation_results=max_simulation_results,
297
305
  conversation_turns=conversation_turns,
298
306
  target=callback,
299
307
  text=source_text,
300
308
  randomization_seed=randomization_seed,
301
- concurrent_async_task=concurrent_async_tasks
309
+ concurrent_async_task=concurrent_async_tasks,
302
310
  )
303
311
 
304
312
  ## If no outputs are generated, raise an exception
@@ -312,9 +320,9 @@ class _SafetyEvaluation:
312
320
  category=ErrorCategory.UNKNOWN,
313
321
  blame=ErrorBlame.USER_ERROR,
314
322
  )
315
-
323
+
316
324
  data_path_base = simulator.__class__.__name__
317
-
325
+
318
326
  ## Write outputs to file according to scenario
319
327
  if direct_attack and jailbreak_outputs:
320
328
  jailbreak_data_path = data_path_base + JAILBREAK_EXT
@@ -360,7 +368,7 @@ class _SafetyEvaluation:
360
368
  ]
361
369
  )
362
370
  simulator_data_paths[data_path_base] = data_path_base + DATA_EXT
363
-
371
+
364
372
  return simulator_data_paths
365
373
 
366
374
  def _get_scenario(
@@ -497,7 +505,7 @@ class _SafetyEvaluation:
497
505
  blame=ErrorBlame.USER_ERROR,
498
506
  )
499
507
  return evaluators_dict
500
-
508
+
501
509
  @staticmethod
502
510
  def _check_target_returns_context(target: Callable) -> bool:
503
511
  """
@@ -510,7 +518,7 @@ class _SafetyEvaluation:
510
518
  ret_type = sig.return_annotation
511
519
  if ret_type == inspect.Signature.empty:
512
520
  return False
513
-
521
+
514
522
  # Check for Coroutine/Awaitable return types for async functions
515
523
  origin = getattr(ret_type, "__origin__", None)
516
524
  if origin is not None and (origin is Coroutine or origin is Awaitable):
@@ -518,24 +526,24 @@ class _SafetyEvaluation:
518
526
  if args and len(args) > 0:
519
527
  # For async functions, check the actual return type inside the Coroutine
520
528
  ret_type = args[-1]
521
-
529
+
522
530
  if ret_type is tuple:
523
531
  return True
524
532
  return False
525
-
533
+
526
534
  @staticmethod
527
535
  def _check_target_returns_str(target: Callable) -> bool:
528
- '''
536
+ """
529
537
  Checks if the target function returns a string.
530
538
 
531
539
  :param target: The target function to check.
532
540
  :type target: Callable
533
- '''
541
+ """
534
542
  sig = inspect.signature(target)
535
543
  ret_type = sig.return_annotation
536
544
  if ret_type == inspect.Signature.empty:
537
545
  return False
538
-
546
+
539
547
  # Check for Coroutine/Awaitable return types for async functions
540
548
  origin = getattr(ret_type, "__origin__", None)
541
549
  if origin is not None and (origin is Coroutine or origin is Awaitable):
@@ -543,36 +551,36 @@ class _SafetyEvaluation:
543
551
  if args and len(args) > 0:
544
552
  # For async functions, check the actual return type inside the Coroutine
545
553
  ret_type = args[-1]
546
-
554
+
547
555
  if ret_type is str:
548
556
  return True
549
557
  return False
550
-
558
+
551
559
  @staticmethod
552
560
  def _is_async_function(target: Callable) -> bool:
553
561
  """
554
562
  Checks if the target function is an async function.
555
-
563
+
556
564
  :param target: The target function to check.
557
565
  :type target: Callable
558
566
  :return: True if the target function is async, False otherwise.
559
567
  :rtype: bool
560
568
  """
561
569
  return asyncio.iscoroutinefunction(target)
562
-
570
+
563
571
  @staticmethod
564
572
  def _check_target_is_callback(target: Callable) -> bool:
565
573
  sig = inspect.signature(target)
566
574
  param_names = list(sig.parameters.keys())
567
- return 'messages' in param_names and 'stream' in param_names and 'session_state' in param_names and 'context' in param_names
575
+ return "messages" in param_names and "session_state" in param_names and "context" in param_names
568
576
 
569
577
  def _validate_inputs(
570
- self,
571
- evaluators: List[_SafetyEvaluator],
572
- target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
573
- num_turns: int = 1,
574
- scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
575
- source_text: Optional[str] = None,
578
+ self,
579
+ evaluators: List[_SafetyEvaluator],
580
+ target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
581
+ num_turns: int = 1,
582
+ scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
583
+ source_text: Optional[str] = None,
576
584
  ):
577
585
  """
578
586
  Validates the inputs provided to the __call__ function of the SafetyEvaluation object.
@@ -586,12 +594,28 @@ class _SafetyEvaluation:
586
594
  :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
587
595
  :param source_text: The source text to use as grounding document in the evaluation.
588
596
  :type source_text: Optional[str]
589
- """
597
+ """
590
598
  if not callable(target):
591
599
  self._validate_model_config(target)
592
- elif not self._check_target_returns_str(target):
593
- self.logger.error(f"Target function {target} does not return a string.")
594
- msg = f"Target function {target} does not return a string."
600
+ elif not self._check_target_is_callback(target) and not self._check_target_returns_str(target):
601
+ msg = (
602
+ f"Invalid target function signature. The target function must be either:\n\n"
603
+ f"1. A simple function that takes a 'query' parameter and returns a string:\n"
604
+ f" def my_target(query: str) -> str:\n"
605
+ f" return f'Response to: {{query}}'\n\n"
606
+ f"2. A callback-style function with these exact parameters:\n"
607
+ f" async def my_callback(\n"
608
+ f" messages: List[Dict],\n"
609
+ f" stream: bool = False,\n"
610
+ f" session_state: Any = None,\n"
611
+ f" context: Any = None\n"
612
+ f" ) -> dict:\n"
613
+ f" # Process messages and return dict with 'messages', 'stream', 'session_state', 'context'\n"
614
+ f" return {{'messages': messages['messages'], 'stream': stream, 'session_state': session_state, 'context': context}}\n\n"
615
+ f"Your function '{target.__name__}' does not match either pattern. "
616
+ f"Please check the function signature and return type."
617
+ )
618
+ self.logger.error(msg)
595
619
  raise EvaluationException(
596
620
  message=msg,
597
621
  internal_message=msg,
@@ -610,8 +634,8 @@ class _SafetyEvaluation:
610
634
  category=ErrorCategory.MISSING_FIELD,
611
635
  blame=ErrorBlame.USER_ERROR,
612
636
  )
613
-
614
- if scenario and len(evaluators)>0 and not _SafetyEvaluator.CONTENT_SAFETY in evaluators:
637
+
638
+ if scenario and len(evaluators) > 0 and not _SafetyEvaluator.CONTENT_SAFETY in evaluators:
615
639
  self.logger.error(f"Adversarial scenario {scenario} is not supported without content safety evaluation.")
616
640
  msg = f"Adversarial scenario {scenario} is not supported without content safety evaluation."
617
641
  raise EvaluationException(
@@ -621,7 +645,7 @@ class _SafetyEvaluation:
621
645
  category=ErrorCategory.INVALID_VALUE,
622
646
  blame=ErrorBlame.USER_ERROR,
623
647
  )
624
-
648
+
625
649
  if _SafetyEvaluator.CODE_VULNERABILITY in evaluators and num_turns > 1:
626
650
  self.logger.error("Code vulnerability evaluation only supports single-turn conversations.")
627
651
  msg = "Code vulnerability evaluation only supports single-turn conversations."
@@ -643,9 +667,18 @@ class _SafetyEvaluation:
643
667
  blame=ErrorBlame.USER_ERROR,
644
668
  )
645
669
 
646
- if _SafetyEvaluator.CONTENT_SAFETY in evaluators and scenario and num_turns > 1 and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION:
647
- self.logger.error(f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn.")
648
- msg = f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
670
+ if (
671
+ _SafetyEvaluator.CONTENT_SAFETY in evaluators
672
+ and scenario
673
+ and num_turns > 1
674
+ and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION
675
+ ):
676
+ self.logger.error(
677
+ f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
678
+ )
679
+ msg = (
680
+ f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
681
+ )
649
682
  raise EvaluationException(
650
683
  message=msg,
651
684
  internal_message=msg,
@@ -681,18 +714,57 @@ class _SafetyEvaluation:
681
714
  jailbreak_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT in key]
682
715
  regular_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT not in key]
683
716
  evaluation_result: EvaluationResult = {"metrics": {}, "rows": [], "studio_url": ""}
684
- if len(jailbreak_cols)>0 and len(regular_cols)>0:
717
+ if len(jailbreak_cols) > 0 and len(regular_cols) > 0:
685
718
  hate_unfairness_defects = []
686
719
  violence_defects = []
687
720
  sexual_defects = []
688
721
  self_harm_defects = []
689
- jailbreak_rows = sorted(evaluation_result_dict[jailbreak_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
690
- regular_rows = sorted(evaluation_result_dict[regular_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
722
+ jailbreak_rows = sorted(
723
+ evaluation_result_dict[jailbreak_cols[0]]["rows"],
724
+ key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
725
+ )
726
+ regular_rows = sorted(
727
+ evaluation_result_dict[regular_cols[0]]["rows"],
728
+ key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
729
+ )
691
730
  for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
692
- hate_unfairness_defects.append(1 if (row[0]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
693
- violence_defects.append(1 if (row[0]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
694
- sexual_defects.append(1 if (row[0]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
695
- self_harm_defects.append(1 if (row[0]['outputs.content_safety.self_harm_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.self_harm_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
731
+ hate_unfairness_defects.append(
732
+ 1
733
+ if (
734
+ row[0]["outputs.content_safety.hate_unfairness_score"]
735
+ > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
736
+ and not row[1]["outputs.content_safety.hate_unfairness_score"]
737
+ > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
738
+ )
739
+ else 0
740
+ )
741
+ violence_defects.append(
742
+ 1
743
+ if (
744
+ row[0]["outputs.content_safety.violence_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
745
+ and not row[1]["outputs.content_safety.violence_score"]
746
+ > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
747
+ )
748
+ else 0
749
+ )
750
+ sexual_defects.append(
751
+ 1
752
+ if (
753
+ row[0]["outputs.content_safety.sexual_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
754
+ and not row[1]["outputs.content_safety.sexual_score"]
755
+ > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
756
+ )
757
+ else 0
758
+ )
759
+ self_harm_defects.append(
760
+ 1
761
+ if (
762
+ row[0]["outputs.content_safety.self_harm_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
763
+ and not row[1]["outputs.content_safety.self_harm_score"]
764
+ > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
765
+ )
766
+ else 0
767
+ )
696
768
  hate_unfairness_defect_rate = list_mean_nan_safe(hate_unfairness_defects)
697
769
  violence_defect_rate = list_mean_nan_safe(violence_defects)
698
770
  sexual_defect_rate = list_mean_nan_safe(sexual_defects)
@@ -708,30 +780,32 @@ class _SafetyEvaluation:
708
780
  "content_safety.self_harm_defect_rate": self_harm_defect_rate,
709
781
  }
710
782
  evaluation_result["studio_url"] = (
711
- evaluation_result_dict[jailbreak_cols[0]]["studio_url"] + "\t" + evaluation_result_dict[regular_cols[0]]["studio_url"]
783
+ evaluation_result_dict[jailbreak_cols[0]]["studio_url"]
784
+ + "\t"
785
+ + evaluation_result_dict[regular_cols[0]]["studio_url"]
712
786
  )
713
787
  return evaluation_result
714
-
788
+
715
789
  async def __call__(
716
- self,
717
- target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
718
- evaluators: List[_SafetyEvaluator] = [],
719
- evaluation_name: Optional[str] = None,
720
- num_turns : int = 1,
721
- num_rows: int = 5,
722
- scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
723
- conversation_turns : List[List[Union[str, Dict[str, Any]]]] = [],
724
- tasks: List[str] = [],
725
- data_only: bool = False,
726
- source_text: Optional[str] = None,
727
- data_path: Optional[Union[str, os.PathLike]] = None,
728
- jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
729
- output_path: Optional[Union[str, os.PathLike]] = None,
730
- data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None,
731
- randomization_seed: Optional[int] = None,
732
- concurrent_async_tasks: Optional[int] = 5,
733
- ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
734
- '''
790
+ self,
791
+ target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
792
+ evaluators: List[_SafetyEvaluator] = [],
793
+ evaluation_name: Optional[str] = None,
794
+ num_turns: int = 1,
795
+ num_rows: int = 5,
796
+ scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
797
+ conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
798
+ tasks: List[str] = [],
799
+ data_only: bool = False,
800
+ source_text: Optional[str] = None,
801
+ data_path: Optional[Union[str, os.PathLike]] = None,
802
+ jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
803
+ output_path: Optional[Union[str, os.PathLike]] = None,
804
+ data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]] = None,
805
+ randomization_seed: Optional[int] = None,
806
+ concurrent_async_tasks: Optional[int] = 5,
807
+ ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str, os.PathLike]]]:
808
+ """
735
809
  Evaluates the target function based on the provided parameters.
736
810
 
737
811
  :param target: The target function to call during the evaluation. This can be a synchronous or asynchronous function.
@@ -765,9 +839,11 @@ class _SafetyEvaluation:
765
839
  :type randomization_seed: Optional[int]
766
840
  :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
767
841
  :type concurrent_async_tasks: Optional[int]
768
- '''
842
+ """
769
843
  ## Log inputs
770
- self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}")
844
+ self.logger.info(
845
+ f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}"
846
+ )
771
847
 
772
848
  ## Validate arguments
773
849
  self._validate_inputs(
@@ -798,28 +874,34 @@ class _SafetyEvaluation:
798
874
  source_text=source_text,
799
875
  direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
800
876
  randomization_seed=randomization_seed,
877
+ concurrent_async_tasks=concurrent_async_tasks,
801
878
  )
802
879
  elif data_path:
803
880
  data_paths = {Path(data_path).stem: data_path}
804
881
  if jailbreak_data_path:
805
882
  data_paths[Path(jailbreak_data_path).stem + JAILBREAK_EXT] = jailbreak_data_path
806
883
 
807
- if data_only and data_paths: return data_paths
884
+ if data_only and data_paths:
885
+ return data_paths
808
886
 
809
887
  ## Run evaluation
810
888
  evaluation_results = {}
811
889
  if data_paths:
812
890
  for strategy, data_path in data_paths.items():
813
- self.logger.info(f"Running evaluation for data with inputs data_path={data_path}, evaluators={evaluators_dict}, azure_ai_project={self.azure_ai_project}, output_path={output_path}")
814
- if evaluation_name: output_prefix = evaluation_name + "_"
815
- else: output_prefix = ""
891
+ self.logger.info(
892
+ f"Running evaluation for data with inputs data_path={data_path}, evaluators={evaluators_dict}, azure_ai_project={self.azure_ai_project}, output_path={output_path}"
893
+ )
894
+ if evaluation_name:
895
+ output_prefix = evaluation_name + "_"
896
+ else:
897
+ output_prefix = ""
816
898
  evaluate_outputs = _evaluate.evaluate(
817
899
  data=data_path,
818
900
  evaluators=evaluators_dict,
819
901
  azure_ai_project=self.azure_ai_project,
820
902
  evaluation_name=evaluation_name,
821
903
  output_path=output_path if output_path else f"{output_prefix}{strategy}{RESULTS_EXT}",
822
- _use_pf_client=False, #TODO: Remove this once eval logic for red team agent is moved to red team agent
904
+ _use_pf_client=False, # TODO: Remove this once eval logic for red team agent is moved to red team agent
823
905
  )
824
906
  evaluation_results[strategy] = evaluate_outputs
825
907
  return evaluation_results
@@ -1,6 +1,37 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
+ from contextlib import contextmanager
5
+ from typing import Iterator
6
+
4
7
  from azure.ai.evaluation._version import VERSION
5
8
 
6
- USER_AGENT = "{}/{}".format("azure-ai-evaluation", VERSION)
9
+
10
+ class UserAgentSingleton:
11
+ __BASE_USER_AGENT: str = "{}/{}".format("azure-ai-evaluation", VERSION)
12
+
13
+ @property
14
+ def value(self):
15
+ """Get the user-agent"""
16
+ return self.__BASE_USER_AGENT
17
+
18
+ def __str__(self) -> str:
19
+ return self.value
20
+
21
+ @classmethod
22
+ @contextmanager
23
+ def add_useragent_product(cls, *product: str) -> Iterator[None]:
24
+ """Appends a "product" (e.g. `name/version`) to the base user agent
25
+
26
+ :param product: User Agent products to append to the base user agent
27
+
28
+ ..see-also::
29
+
30
+ `User-Agent section of RFC 9110, <https://www.rfc-editor.org/rfc/rfc9110#name-user-agent>`
31
+ """
32
+ old_useragent = cls.__BASE_USER_AGENT
33
+ cls.__BASE_USER_AGENT = f"{old_useragent} {' '.join(product)}"
34
+
35
+ yield
36
+
37
+ cls.__BASE_USER_AGENT = old_useragent
@@ -3,4 +3,4 @@
3
3
  # ---------------------------------------------------------
4
4
  # represents upcoming version
5
5
 
6
- VERSION = "1.8.0"
6
+ VERSION = "1.9.0"
@@ -8,7 +8,9 @@ try:
8
8
  from ._attack_objective_generator import RiskCategory
9
9
  from ._red_team_result import RedTeamResult
10
10
  except ImportError:
11
- print("[INFO] Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`.")
11
+ print(
12
+ "[INFO] Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`."
13
+ )
12
14
 
13
15
 
14
16
  __all__ = [
@@ -1,3 +1,3 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
- # ---------------------------------------------------------
3
+ # ---------------------------------------------------------