azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (142)
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
+++ b/azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
@@ -49,6 +49,7 @@ JAILBREAK_EXT = "_Jailbreak"
 DATA_EXT = "_Data.jsonl"
 RESULTS_EXT = "_Results.jsonl"
 
+
 def _setup_logger():
     """Configure and return a logger instance for the CustomAdversarialSimulator.
 
@@ -115,7 +116,6 @@ class _SafetyEvaluation:
         self.credential = credential
         self.logger = _setup_logger()
 
-
     @staticmethod
     def _validate_model_config(model_config: Any):
         """
@@ -158,7 +158,9 @@ class _SafetyEvaluation:
         max_simulation_results: int = 3,
         conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
         tasks: List[str] = [],
-        adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]] = None,
+        adversarial_scenario: Optional[
+            Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]
+        ] = None,
         source_text: Optional[str] = None,
         direct_attack: bool = False,
         randomization_seed: Optional[int] = None,
@@ -185,47 +187,53 @@ class _SafetyEvaluation:
         :type direct_attack: bool
         """
 
-        ## Define callback
-        async def callback(
-            messages: List[Dict],
-            stream: bool = False,
-            session_state: Optional[str] = None,
-            context: Optional[Dict] = None,
-        ) -> dict:
-            messages_list = messages["messages"]  # type: ignore
-            latest_message = messages_list[-1]
-            application_input = latest_message["content"]
-            context = latest_message.get("context", None)
-            latest_context = None
-            try:
-                is_async = self._is_async_function(target)
-                if self._check_target_returns_context(target):
-                    if is_async:
-                        response, latest_context = await target(query=application_input)
-                    else:
-                        response, latest_context = target(query=application_input)
-                else:
-                    if is_async:
-                        response = await target(query=application_input)
+        ## Check if target is already a callback-style function
+        if self._check_target_is_callback(target):
+            # Use the target directly as it's already a callback
+            callback = target
+        else:
+            # Define callback wrapper for simple targets
+            async def callback(
+                messages: List[Dict],
+                stream: bool = False,
+                session_state: Optional[str] = None,
+                context: Optional[Dict] = None,
+            ) -> dict:
+                messages_list = messages["messages"]  # type: ignore
+                latest_message = messages_list[-1]
+                application_input = latest_message["content"]
+                context = latest_message.get("context", None)
+                latest_context = None
+                try:
+                    is_async = self._is_async_function(target)
+                    if self._check_target_returns_context(target):
+                        if is_async:
+                            response, latest_context = await target(query=application_input)
+                        else:
+                            response, latest_context = target(query=application_input)
                     else:
-                        response = target(query=application_input)
-            except Exception as e:
-                response = f"Something went wrong {e!s}"
-
-            ## We format the response to follow the openAI chat protocol format
-            formatted_response = {
-                "content": response,
-                "role": "assistant",
-                "context": latest_context if latest_context else context,
-            }
-            ## NOTE: In the future, instead of appending to messages we should just return `formatted_response`
-            messages["messages"].append(formatted_response)  # type: ignore
-            return {
-                "messages": messages_list,
-                "stream": stream,
-                "session_state": session_state,
-                "context": latest_context if latest_context else context,
-            }
+                        if is_async:
+                            response = await target(query=application_input)
+                        else:
+                            response = target(query=application_input)
+                except Exception as e:
+                    response = f"Something went wrong {e!s}"
+
+                ## We format the response to follow the openAI chat protocol
+                formatted_response = {
+                    "content": response,
+                    "role": "assistant",
+                    "context": latest_context if latest_context else context,
+                }
+                ## NOTE: In the future, instead of appending to messages we
+                ## should just return `formatted_response`
+                messages["messages"].append(formatted_response)  # type: ignore
+                return {
+                    "messages": messages_list,
+                    "stream": stream,
+                    "session_state": session_state,
+                    "context": latest_context if latest_context else context,
                 }
 
         ## Run simulator
         simulator = None
@@ -248,7 +256,7 @@ class _SafetyEvaluation:
                 text=source_text,
                 target=callback,
                 randomization_seed=randomization_seed,
-                concurrent_async_task=concurrent_async_tasks
+                concurrent_async_task=concurrent_async_tasks,
             )
 
         # if DirectAttack, run DirectAttackSimulator
@@ -282,6 +290,7 @@ class _SafetyEvaluation:
                 target=callback,
                 text=source_text if source_text else "",
                 concurrent_async_tasks=concurrent_async_tasks,
+                randomization_seed=randomization_seed,
             )
 
         ## Run AdversarialSimulator
@@ -291,14 +300,14 @@ class _SafetyEvaluation:
             )
             simulator = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
             simulator_outputs = await simulator(
-                scenario=adversarial_scenario, #type: ignore
+                scenario=adversarial_scenario,  # type: ignore
                 max_conversation_turns=max_conversation_turns,
                 max_simulation_results=max_simulation_results,
                 conversation_turns=conversation_turns,
                 target=callback,
                 text=source_text,
                 randomization_seed=randomization_seed,
-                concurrent_async_task=concurrent_async_tasks
+                concurrent_async_task=concurrent_async_tasks,
             )
 
         ## If no outputs are generated, raise an exception
@@ -312,9 +321,9 @@ class _SafetyEvaluation:
                 category=ErrorCategory.UNKNOWN,
                 blame=ErrorBlame.USER_ERROR,
             )
-
+
         data_path_base = simulator.__class__.__name__
-
+
         ## Write outputs to file according to scenario
         if direct_attack and jailbreak_outputs:
             jailbreak_data_path = data_path_base + JAILBREAK_EXT
@@ -360,7 +369,7 @@ class _SafetyEvaluation:
                 ]
             )
             simulator_data_paths[data_path_base] = data_path_base + DATA_EXT
-
+
         return simulator_data_paths
 
     def _get_scenario(
@@ -497,7 +506,7 @@ class _SafetyEvaluation:
                 blame=ErrorBlame.USER_ERROR,
             )
         return evaluators_dict
-
+
     @staticmethod
     def _check_target_returns_context(target: Callable) -> bool:
         """
@@ -510,7 +519,7 @@ class _SafetyEvaluation:
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
-
+
         # Check for Coroutine/Awaitable return types for async functions
         origin = getattr(ret_type, "__origin__", None)
         if origin is not None and (origin is Coroutine or origin is Awaitable):
@@ -518,24 +527,24 @@ class _SafetyEvaluation:
             if args and len(args) > 0:
                 # For async functions, check the actual return type inside the Coroutine
                 ret_type = args[-1]
-
+
         if ret_type is tuple:
             return True
         return False
-
+
     @staticmethod
     def _check_target_returns_str(target: Callable) -> bool:
-        '''
+        """
         Checks if the target function returns a string.
 
         :param target: The target function to check.
         :type target: Callable
-        '''
+        """
         sig = inspect.signature(target)
         ret_type = sig.return_annotation
         if ret_type == inspect.Signature.empty:
             return False
-
+
         # Check for Coroutine/Awaitable return types for async functions
         origin = getattr(ret_type, "__origin__", None)
         if origin is not None and (origin is Coroutine or origin is Awaitable):
@@ -543,36 +552,36 @@ class _SafetyEvaluation:
             if args and len(args) > 0:
                 # For async functions, check the actual return type inside the Coroutine
                 ret_type = args[-1]
-
+
         if ret_type is str:
             return True
         return False
-
+
     @staticmethod
     def _is_async_function(target: Callable) -> bool:
         """
         Checks if the target function is an async function.
-
+
         :param target: The target function to check.
         :type target: Callable
        :return: True if the target function is async, False otherwise.
         :rtype: bool
         """
         return asyncio.iscoroutinefunction(target)
-
+
     @staticmethod
     def _check_target_is_callback(target: Callable) -> bool:
         sig = inspect.signature(target)
         param_names = list(sig.parameters.keys())
-        return 'messages' in param_names and 'stream' in param_names and 'session_state' in param_names and 'context' in param_names
+        return "messages" in param_names and "session_state" in param_names and "context" in param_names
 
     def _validate_inputs(
-            self,
-            evaluators: List[_SafetyEvaluator],
-            target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-            num_turns: int = 1,
-            scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
-            source_text: Optional[str] = None,
+        self,
+        evaluators: List[_SafetyEvaluator],
+        target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        num_turns: int = 1,
+        scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+        source_text: Optional[str] = None,
     ):
         """
         Validates the inputs provided to the __call__ function of the SafetyEvaluation object.
@@ -586,12 +595,28 @@ class _SafetyEvaluation:
         :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
         :param source_text: The source text to use as grounding document in the evaluation.
         :type source_text: Optional[str]
-        """
+        """
         if not callable(target):
             self._validate_model_config(target)
-        elif not self._check_target_returns_str(target):
-            self.logger.error(f"Target function {target} does not return a string.")
-            msg = f"Target function {target} does not return a string."
+        elif not self._check_target_is_callback(target) and not self._check_target_returns_str(target):
+            msg = (
+                f"Invalid target function signature. The target function must be either:\n\n"
+                f"1. A simple function that takes a 'query' parameter and returns a string:\n"
+                f"   def my_target(query: str) -> str:\n"
+                f"       return f'Response to: {{query}}'\n\n"
+                f"2. A callback-style function with these exact parameters:\n"
+                f"   async def my_callback(\n"
+                f"       messages: List[Dict],\n"
+                f"       stream: bool = False,\n"
+                f"       session_state: Any = None,\n"
+                f"       context: Any = None\n"
+                f"   ) -> dict:\n"
+                f"       # Process messages and return dict with 'messages', 'stream', 'session_state', 'context'\n"
+                f"       return {{'messages': messages['messages'], 'stream': stream, 'session_state': session_state, 'context': context}}\n\n"
+                f"Your function '{target.__name__}' does not match either pattern. "
+                f"Please check the function signature and return type."
+            )
+            self.logger.error(msg)
             raise EvaluationException(
                 message=msg,
                 internal_message=msg,
@@ -610,8 +635,8 @@ class _SafetyEvaluation:
                 category=ErrorCategory.MISSING_FIELD,
                 blame=ErrorBlame.USER_ERROR,
             )
-
-        if scenario and len(evaluators)>0 and not _SafetyEvaluator.CONTENT_SAFETY in evaluators:
+
+        if scenario and len(evaluators) > 0 and not _SafetyEvaluator.CONTENT_SAFETY in evaluators:
             self.logger.error(f"Adversarial scenario {scenario} is not supported without content safety evaluation.")
             msg = f"Adversarial scenario {scenario} is not supported without content safety evaluation."
             raise EvaluationException(
@@ -621,7 +646,7 @@ class _SafetyEvaluation:
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
-
+
         if _SafetyEvaluator.CODE_VULNERABILITY in evaluators and num_turns > 1:
             self.logger.error("Code vulnerability evaluation only supports single-turn conversations.")
             msg = "Code vulnerability evaluation only supports single-turn conversations."
@@ -643,9 +668,18 @@ class _SafetyEvaluation:
                 blame=ErrorBlame.USER_ERROR,
             )
 
-        if _SafetyEvaluator.CONTENT_SAFETY in evaluators and scenario and num_turns > 1 and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION:
-            self.logger.error(f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn.")
-            msg = f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+        if (
+            _SafetyEvaluator.CONTENT_SAFETY in evaluators
+            and scenario
+            and num_turns > 1
+            and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION
+        ):
+            self.logger.error(
+                f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+            )
+            msg = (
+                f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+            )
             raise EvaluationException(
                 message=msg,
                 internal_message=msg,
@@ -681,18 +715,57 @@ class _SafetyEvaluation:
         jailbreak_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT in key]
         regular_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT not in key]
         evaluation_result: EvaluationResult = {"metrics": {}, "rows": [], "studio_url": ""}
-        if len(jailbreak_cols)>0 and len(regular_cols)>0:
+        if len(jailbreak_cols) > 0 and len(regular_cols) > 0:
             hate_unfairness_defects = []
             violence_defects = []
             sexual_defects = []
             self_harm_defects = []
-            jailbreak_rows = sorted(evaluation_result_dict[jailbreak_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
-            regular_rows = sorted(evaluation_result_dict[regular_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+            jailbreak_rows = sorted(
+                evaluation_result_dict[jailbreak_cols[0]]["rows"],
+                key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
+            )
+            regular_rows = sorted(
+                evaluation_result_dict[regular_cols[0]]["rows"],
+                key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
+            )
             for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
-                hate_unfairness_defects.append(1 if (row[0]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
-                violence_defects.append(1 if (row[0]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
-                sexual_defects.append(1 if (row[0]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
-                self_harm_defects.append(1 if (row[0]['outputs.content_safety.self_harm_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.self_harm_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
+                hate_unfairness_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.hate_unfairness_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.hate_unfairness_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
+                violence_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.violence_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.violence_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
+                sexual_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.sexual_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.sexual_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
+                self_harm_defects.append(
+                    1
+                    if (
+                        row[0]["outputs.content_safety.self_harm_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                        and not row[1]["outputs.content_safety.self_harm_score"]
+                        > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                    )
+                    else 0
+                )
             hate_unfairness_defect_rate = list_mean_nan_safe(hate_unfairness_defects)
             violence_defect_rate = list_mean_nan_safe(violence_defects)
             sexual_defect_rate = list_mean_nan_safe(sexual_defects)
@@ -708,30 +781,32 @@ class _SafetyEvaluation:
                 "content_safety.self_harm_defect_rate": self_harm_defect_rate,
             }
             evaluation_result["studio_url"] = (
-                evaluation_result_dict[jailbreak_cols[0]]["studio_url"] + "\t" + evaluation_result_dict[regular_cols[0]]["studio_url"]
+                evaluation_result_dict[jailbreak_cols[0]]["studio_url"]
+                + "\t"
+                + evaluation_result_dict[regular_cols[0]]["studio_url"]
             )
         return evaluation_result
-
+
     async def __call__(
-            self,
-            target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
-            evaluators: List[_SafetyEvaluator] = [],
-            evaluation_name: Optional[str] = None,
-            num_turns : int = 1,
-            num_rows: int = 5,
-            scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
-            conversation_turns : List[List[Union[str, Dict[str, Any]]]] = [],
-            tasks: List[str] = [],
-            data_only: bool = False,
-            source_text: Optional[str] = None,
-            data_path: Optional[Union[str, os.PathLike]] = None,
-            jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
-            output_path: Optional[Union[str, os.PathLike]] = None,
-            data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None,
-            randomization_seed: Optional[int] = None,
-            concurrent_async_tasks: Optional[int] = 5,
-    ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
-        '''
+        self,
+        target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+        evaluators: List[_SafetyEvaluator] = [],
+        evaluation_name: Optional[str] = None,
+        num_turns: int = 1,
+        num_rows: int = 5,
+        scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+        conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
+        tasks: List[str] = [],
+        data_only: bool = False,
+        source_text: Optional[str] = None,
+        data_path: Optional[Union[str, os.PathLike]] = None,
+        jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
+        output_path: Optional[Union[str, os.PathLike]] = None,
+        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]] = None,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
+    ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str, os.PathLike]]]:
+        """
         Evaluates the target function based on the provided parameters.
 
        :param target: The target function to call during the evaluation. This can be a synchronous or asynchronous function.
@@ -765,9 +840,11 @@ class _SafetyEvaluation:
         :type randomization_seed: Optional[int]
         :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
         :type concurrent_async_tasks: Optional[int]
-        '''
+        """
         ## Log inputs
-        self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}")
+        self.logger.info(
+            f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}"
+        )
 
         ## Validate arguments
         self._validate_inputs(
@@ -798,28 +875,35 @@ class _SafetyEvaluation:
                 source_text=source_text,
                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
                 randomization_seed=randomization_seed,
+                concurrent_async_tasks=concurrent_async_tasks,
             )
         elif data_path:
             data_paths = {Path(data_path).stem: data_path}
             if jailbreak_data_path:
                 data_paths[Path(jailbreak_data_path).stem + JAILBREAK_EXT] = jailbreak_data_path
 
-        if data_only and data_paths: return data_paths
+        if data_only and data_paths:
+            return data_paths
 
         ## Run evaluation
         evaluation_results = {}
         if data_paths:
             for strategy, data_path in data_paths.items():
-                self.logger.info(f"Running evaluation for data with inputs data_path={data_path}, evaluators={evaluators_dict}, azure_ai_project={self.azure_ai_project}, output_path={output_path}")
-                if evaluation_name: output_prefix = evaluation_name + "_"
-                else: output_prefix = ""
+                self.logger.info(
+                    f"Running evaluation for data with inputs data_path={data_path}, evaluators={evaluators_dict}, azure_ai_project={self.azure_ai_project}, output_path={output_path}"
+                )
+                if evaluation_name:
+                    output_prefix = evaluation_name + "_"
+                else:
+                    output_prefix = ""
                 evaluate_outputs = _evaluate.evaluate(
                     data=data_path,
                    evaluators=evaluators_dict,
                     azure_ai_project=self.azure_ai_project,
                     evaluation_name=evaluation_name,
                     output_path=output_path if output_path else f"{output_prefix}{strategy}{RESULTS_EXT}",
-                    _use_pf_client=False, #TODO: Remove this once eval logic for red team agent is moved to red team agent
+                    _use_pf_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
+                    _use_run_submitter_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
                 )
                 evaluation_results[strategy] = evaluate_outputs
         return evaluation_results
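
The net effect of the callback-detection and validation changes above is that a target may now be supplied in either of the two shapes spelled out in the new error message. A minimal sketch of both (the `my_target`/`my_callback` names come from that error message; the response strings are illustrative):

    # Shape 1: a plain function taking `query` and returning a string;
    # _SafetyEvaluation wraps it in a chat-protocol callback internally.
    def my_target(query: str) -> str:
        return f"Response to: {query}"

    # Shape 2: a callback-style coroutine, used as-is because its signature has
    # the 'messages', 'session_state' and 'context' parameters that
    # _check_target_is_callback now looks for ('stream' is no longer required).
    async def my_callback(messages, stream=False, session_state=None, context=None) -> dict:
        reply = {"content": "Hi!", "role": "assistant", "context": context}
        messages["messages"].append(reply)
        return {
            "messages": messages["messages"],
            "stream": stream,
            "session_state": session_state,
            "context": context,
        }
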
--- a/azure/ai/evaluation/_user_agent.py
+++ b/azure/ai/evaluation/_user_agent.py
@@ -1,6 +1,37 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from contextlib import contextmanager
+from typing import Iterator
+
 from azure.ai.evaluation._version import VERSION
 
-USER_AGENT = "{}/{}".format("azure-ai-evaluation", VERSION)
+
+class UserAgentSingleton:
+    __BASE_USER_AGENT: str = "{}/{}".format("azure-ai-evaluation", VERSION)
+
+    @property
+    def value(self):
+        """Get the user-agent"""
+        return self.__BASE_USER_AGENT
+
+    def __str__(self) -> str:
+        return self.value
+
+    @classmethod
+    @contextmanager
+    def add_useragent_product(cls, *product: str) -> Iterator[None]:
+        """Appends a "product" (e.g. `name/version`) to the base user agent
+
+        :param product: User Agent products to append to the base user agent
+
+        ..see-also::
+
+            `User-Agent section of RFC 9110, <https://www.rfc-editor.org/rfc/rfc9110#name-user-agent>`
+        """
+        old_useragent = cls.__BASE_USER_AGENT
+        cls.__BASE_USER_AGENT = f"{old_useragent} {' '.join(product)}"
+
+        yield
+
+        cls.__BASE_USER_AGENT = old_useragent
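
The module-level USER_AGENT constant is replaced by UserAgentSingleton, whose add_useragent_product classmethod is a context manager that temporarily appends extra products to the shared user-agent string and restores it on exit. A usage sketch against the code above (the "my-tool/0.1" product string is made up):

    from azure.ai.evaluation._user_agent import UserAgentSingleton

    print(UserAgentSingleton())      # azure-ai-evaluation/1.10.0
    with UserAgentSingleton.add_useragent_product("my-tool/0.1"):
        print(UserAgentSingleton())  # azure-ai-evaluation/1.10.0 my-tool/0.1
    print(UserAgentSingleton())      # base value restored after the context exits
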
--- a/azure/ai/evaluation/_version.py
+++ b/azure/ai/evaluation/_version.py
@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version
 
-VERSION = "1.8.0"
+VERSION = "1.10.0"
--- a/azure/ai/evaluation/red_team/__init__.py
+++ b/azure/ai/evaluation/red_team/__init__.py
@@ -8,7 +8,9 @@ try:
     from ._attack_objective_generator import RiskCategory
     from ._red_team_result import RedTeamResult
 except ImportError:
-    print("[INFO] Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`.")
+    raise ImportError(
+        "Could not import Pyrit. Please install the dependency with `pip install azure-ai-evaluation[redteam]`."
+    )
 
 
 __all__ = [
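
A missing Pyrit dependency now raises at import time instead of printing an informational notice, so `import azure.ai.evaluation.red_team` fails loudly when the extra is absent. Callers that treat red teaming as optional would need their own guard; a sketch (the fall-back-to-None pattern is illustrative, the imported names are from the diff above):

    try:
        from azure.ai.evaluation.red_team import RedTeamResult, RiskCategory
    except ImportError:
        # Optional extra not installed: pip install azure-ai-evaluation[redteam]
        RedTeamResult = RiskCategory = None
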
@@ -1,3 +1,3 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
-# ---------------------------------------------------------
+# ---------------------------------------------------------

(The removed and added lines are textually identical; the change is whitespace-only, e.g. a trailing-newline fix.)