azure-ai-evaluation 1.6.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (55)
  1. azure/ai/evaluation/__init__.py +1 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +1 -1
  3. azure/ai/evaluation/_aoai/label_grader.py +2 -2
  4. azure/ai/evaluation/_aoai/string_check_grader.py +2 -2
  5. azure/ai/evaluation/_aoai/text_similarity_grader.py +2 -2
  6. azure/ai/evaluation/_common/__init__.py +3 -1
  7. azure/ai/evaluation/_common/evaluation_onedp_client.py +50 -5
  8. azure/ai/evaluation/_common/onedp/operations/_operations.py +1 -1
  9. azure/ai/evaluation/_common/rai_service.py +7 -6
  10. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  11. azure/ai/evaluation/_converters/_models.py +76 -6
  12. azure/ai/evaluation/_eval_mapping.py +2 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +11 -13
  14. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +24 -5
  15. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  16. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  17. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  18. azure/ai/evaluation/_evaluators/_common/_base_eval.py +4 -0
  19. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  20. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  22. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  23. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  24. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +31 -29
  25. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  26. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +10 -0
  27. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  28. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +10 -0
  29. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +10 -0
  30. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  31. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  32. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  33. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +10 -0
  34. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +13 -0
  35. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  36. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  37. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  38. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +10 -0
  39. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -0
  40. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +80 -10
  41. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  42. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  43. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +26 -7
  44. azure/ai/evaluation/_version.py +1 -1
  45. azure/ai/evaluation/red_team/_red_team.py +183 -128
  46. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  47. azure/ai/evaluation/simulator/_direct_attack_simulator.py +3 -3
  48. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +3 -3
  49. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +2 -0
  50. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +6 -5
  51. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +26 -3
  52. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +55 -55
  53. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  54. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  55. {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,16 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
   :dedent: 8
   :caption: Initialize and call an IntentResolutionEvaluator with a query and response.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START intent_resolution_evaluator]
+ :end-before: [END intent_resolution_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   """

   _PROMPTY_FILE = "intent_resolution.prompty"
@@ -45,6 +45,16 @@ class MeteorScoreEvaluator(EvaluatorBase):
   :dedent: 8
   :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START meteor_score_evaluator]
+ :end-before: [END meteor_score_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   .. admonition:: Example with Threshold:

   .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -37,6 +37,17 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
   :language: python
   :dedent: 8
   :caption: Initialize and call a ProtectedMaterialEvaluator.
+
+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START protected_material_evaluator]
+ :end-before: [END protected_material_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   """

   id = "azureml://registries/azureml/models/Protected-Material-Evaluator/versions/3"
@@ -48,6 +48,16 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
   :dedent: 8
   :caption: Initialize and call a QAEvaluator.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START qa_evaluator]
+ :end-before: [END qa_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   .. admonition:: Example with Threshold:

   .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -39,6 +39,16 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
   :dedent: 8
   :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START relevance_evaluator]
+ :end-before: [END relevance_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   .. admonition:: Example with Threshold:

   .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -37,13 +37,26 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
   :param model_config: Configuration for the Azure OpenAI model.
   :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
   ~azure.ai.evaluation.OpenAIModelConfiguration]
+
   .. admonition:: Example:
+
   .. literalinclude:: ../samples/evaluation_samples_evaluate.py
   :start-after: [START completeness_evaluator]
   :end-before: [END completeness_evaluator]
   :language: python
   :dedent: 8
   :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
+
+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START completeness_evaluator]
+ :end-before: [END completeness_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   """

   # Constants must be defined within eval's directory to be save/loadable
@@ -45,6 +45,16 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
   :dedent: 8
   :caption: Initialize and call a RetrievalEvaluator.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START retrieval_evaluator]
+ :end-before: [END retrieval_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   .. admonition:: Example with Threshold:

   .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -70,6 +70,16 @@ class RougeScoreEvaluator(EvaluatorBase):
   :dedent: 8
   :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START rouge_score_evaluator]
+ :end-before: [END rouge_score_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   .. admonition:: Example with threshold:

   .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -41,6 +41,16 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
   :dedent: 8
   :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START groundedness_pro_evaluator]
+ :end-before: [END groundedness_pro_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   .. admonition:: Example with threshold:

   .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -40,6 +40,16 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
   :dedent: 8
   :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START similarity_evaluator]
+ :end-before: [END similarity_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   .. admonition:: Example:

   .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -42,6 +42,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
   :language: python
   :dedent: 8
   :caption: Initialize and call an TaskAdherenceEvaluator with a query and response.
+
+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START task_adherence_evaluator]
+ :end-before: [END task_adherence_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   """

   _PROMPTY_FILE = "task_adherence.prompty"
@@ -45,6 +45,16 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
   :dedent: 8
   :caption: Initialize and call a ToolCallAccuracyEvaluator.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START tool_call_accuracy_evaluator]
+ :end-before: [END tool_call_accuracy_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   .. note::

   To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -214,12 +224,18 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
   score = math.nan
   if llm_output:
   score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
- return {
- self._result_key: bool(float(score)),
- f"{self._result_key}_reason": reason,
- "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
- }
- return {self._result_key: float(score)}
+ if score >= 0 and score <= 1:
+ return {
+ self._result_key: bool(float(score)),
+ f"{self._result_key}_reason": reason,
+ "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+ }
+ raise EvaluationException(
+ message="Tool call accuracy evaluator: Invalid score returned from LLM.",
+ blame=ErrorBlame.SYSTEM_ERROR,
+ category=ErrorCategory.INVALID_VALUE,
+ target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+ )

   async def _real_call(self, **kwargs):
   """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -231,13 +247,55 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
   """
   # Convert inputs into list of evaluable inputs.
   eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+ if len(eval_input_list) == 0:
+ return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+ f"{self._AGGREGATE_RESULT_KEY}_reason":
+ "No tool calls were made.",
+ "per_tool_call_details": []
+ }
+
   per_turn_results = []
   # Evaluate all inputs.
   for eval_input in eval_input_list:
- per_turn_results.append(await self._do_eval(eval_input))
+ if self._is_applicable_tool(eval_input):
+ per_turn_results.append(await self._do_eval(eval_input))
+ else:
+ per_turn_results.append(self._not_applicable_result(eval_input))

   return self._aggregate_results(per_turn_results=per_turn_results)

+ def _is_applicable_tool(self, eval_input):
+ """Determine if a given tool should be evaluated, since we only evaluate tools that
+ have sufficient context available.
+
+ :type eval_input: Dict
+ :return: True if the tool call should be evaluated
+ :rtype: bool
+ """
+ tool_definition = eval_input.get("tool_definition")
+ if tool_definition is None or len(tool_definition) != 1:
+ return False
+ tool_type = tool_definition[0].get("type")
+ if tool_type is None or tool_type != "function":
+ return False
+ return True
+
+ def _not_applicable_result(self, eval_input):
+ """Return a result indicating that the tool call is not applicable for evaluation.
+
+ :param eval_input: The input to the evaluator.
+ :type eval_input: Dict
+ :return: A dictionary containing the result of the evaluation.
+ :rtype: Dict[str, Union[str, float]]
+ """
+ return {
+ f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
+ f"{self._result_key}_reason": "Tool call not supported for evaluation",
+ "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+ }
+
   def _aggregate_results(self, per_turn_results):
   """Aggregate the evaluation results of each conversation turn into a single result.

@@ -260,11 +318,23 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
   # Go over each turn, and rotate the results into a
   # metric: List[values] format for the evals_per_turn dictionary.

- score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results])/len(per_turn_results)
+ num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
+ if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
+ if num_evaluated == 0:
+ # None of the invoked tools were applicable, return not applicable result
+ # (If a tool fails evaluation, we'll throw an exception)
+ return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+ f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+ f"{self._AGGREGATE_RESULT_KEY}_reason":
+ "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+ "per_tool_call_details": []
+ }
+ # ignore not_applicable results, where the _result_key will be "not applicable"
+ score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated
   aggregated[self._AGGREGATE_RESULT_KEY] = score
- aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
+ aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
   aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
-
   aggregated["per_tool_call_details"] = per_turn_results
   return aggregated

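Taken together, the two tool-call-accuracy hunks above change what gets counted: calls that are not backed by exactly one "function"-type tool definition are recorded as "not applicable" rather than judged by the LLM, and the aggregate score divides only by the number of calls that were actually evaluated (returning a not-applicable aggregate when nothing qualified, instead of dividing by zero). A small worked sketch of that aggregation, with illustrative key names and threshold:

    # Worked sketch of the new aggregation: 3 tool calls, one not applicable.
    # Denominator is the number of evaluated calls (2), not the total (3).
    NOT_APPLICABLE = "not applicable"
    THRESHOLD = 0.8  # illustrative threshold, not necessarily the SDK default

    per_tool_call_details = [
        {"tool_call_accurate": True},
        {"tool_call_accurate": False},
        {"tool_call_accurate": NOT_APPLICABLE},  # e.g. a built-in, non-function tool
    ]

    evaluated = [r for r in per_tool_call_details if r["tool_call_accurate"] != NOT_APPLICABLE]
    if not evaluated:
        print({"tool_call_accuracy": NOT_APPLICABLE})
    else:
        score = sum(r["tool_call_accurate"] is True for r in evaluated) / len(evaluated)
        print({
            "tool_call_accuracy": score,  # 0.5 in this example
            "tool_call_accuracy_result": "pass" if score >= THRESHOLD else "fail",
            "tool_call_accuracy_threshold": THRESHOLD,
            "per_tool_call_details": per_tool_call_details,
        })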
@@ -41,6 +41,16 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
   :dedent: 8
   :caption: Initialize and call a UngroundedAttributesEvaluator with a query, response and context.

+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START ungrounded_attributes_evaluator]
+ :end-before: [END ungrounded_attributes_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   .. note::

   If this evaluator is supplied to the `evaluate` function, the metric
@@ -54,6 +54,17 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
   :language: python
   :dedent: 8
   :caption: Initialize and call an IndirectAttackEvaluator.
+
+ .. admonition:: Example using Azure AI Project URL:
+
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+ :start-after: [START indirect_attack_evaluator]
+ :end-before: [END indirect_attack_evaluator]
+ :language: python
+ :dedent: 8
+ :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
   """

   id = "azureml://registries/azureml/models/Indirect-Attack-Evaluator/versions/3"
@@ -161,6 +161,8 @@ class _SafetyEvaluation:
   adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]] = None,
   source_text: Optional[str] = None,
   direct_attack: bool = False,
+ randomization_seed: Optional[int] = None,
+ concurrent_async_tasks: Optional[int] = 5,
   ) -> Dict[str, str]:
   """
   Generates synthetic conversations based on provided parameters.
@@ -245,6 +247,8 @@ class _SafetyEvaluation:
   conversation_turns=conversation_turns,
   text=source_text,
   target=callback,
+ randomization_seed=randomization_seed,
+ concurrent_async_task=concurrent_async_tasks
   )

   # if DirectAttack, run DirectAttackSimulator
@@ -258,6 +262,8 @@ class _SafetyEvaluation:
   max_conversation_turns=max_conversation_turns,
   max_simulation_results=max_simulation_results,
   target=callback,
+ randomization_seed=randomization_seed,
+ concurrent_async_task=concurrent_async_tasks,
   )
   jailbreak_outputs = simulator_outputs["jailbreak"]
   simulator_outputs = simulator_outputs["regular"]
@@ -275,6 +281,7 @@ class _SafetyEvaluation:
   num_queries=max_simulation_results,
   target=callback,
   text=source_text if source_text else "",
+ concurrent_async_tasks=concurrent_async_tasks,
   )

   ## Run AdversarialSimulator
@@ -290,6 +297,8 @@ class _SafetyEvaluation:
   conversation_turns=conversation_turns,
   target=callback,
   text=source_text,
+ randomization_seed=randomization_seed,
+ concurrent_async_task=concurrent_async_tasks
   )

   ## If no outputs are generated, raise an exception
@@ -677,7 +686,9 @@ class _SafetyEvaluation:
   violence_defects = []
   sexual_defects = []
   self_harm_defects = []
- for i, row in enumerate(zip(evaluation_result_dict[jailbreak_cols[0]]['rows'], evaluation_result_dict[regular_cols[0]]['rows'])):
+ jailbreak_rows = sorted(evaluation_result_dict[jailbreak_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+ regular_rows = sorted(evaluation_result_dict[regular_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+ for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
   hate_unfairness_defects.append(1 if (row[0]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
   violence_defects.append(1 if (row[0]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
   sexual_defects.append(1 if (row[0]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
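This hunk fixes how defect rates are computed for direct-attack runs: jailbreak and regular rows are now sorted by their inputs.category before being zipped, so each jailbreak row is paired with the regular row for the same category, and a defect is counted only when the jailbreak score crosses the content-safety threshold while the paired regular score does not. A small self-contained sketch of that pairing, with toy rows and a stand-in threshold:

    # Hedged sketch of the category-paired defect-rate computation above.
    THRESHOLD = 3  # stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT

    def sort_rows(rows):
        # Same ordering idea as the diff: rows missing a category sort last.
        return sorted(rows, key=lambda x: (x.get("inputs.category") is None,
                                           x.get("inputs.category", "")))

    jailbreak_rows = sort_rows([
        {"inputs.category": "violence", "outputs.content_safety.violence_score": 5},
        {"inputs.category": "hate", "outputs.content_safety.violence_score": 1},
    ])
    regular_rows = sort_rows([
        {"inputs.category": "hate", "outputs.content_safety.violence_score": 0},
        {"inputs.category": "violence", "outputs.content_safety.violence_score": 2},
    ])

    defects = [
        1 if (jb["outputs.content_safety.violence_score"] > THRESHOLD
              and not reg["outputs.content_safety.violence_score"] > THRESHOLD) else 0
        for jb, reg in zip(jailbreak_rows, regular_rows)
    ]
    print(sum(defects) / len(defects))  # violence defect rate: 0.5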
@@ -716,8 +727,10 @@ class _SafetyEvaluation:
   data_path: Optional[Union[str, os.PathLike]] = None,
   jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
   output_path: Optional[Union[str, os.PathLike]] = None,
- data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None
- ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
+ data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None,
+ randomization_seed: Optional[int] = None,
+ concurrent_async_tasks: Optional[int] = 5,
+ ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
   '''
   Evaluates the target function based on the provided parameters.

@@ -744,12 +757,17 @@ class _SafetyEvaluation:
   :param data_path: The path to the data file generated by the Simulator. If None, the Simulator will be run.
   :type data_path: Optional[Union[str, os.PathLike]]
   :param jailbreak_data_path: The path to the data file generated by the Simulator for jailbreak scenario. If None, the DirectAttackSimulator will be run.
- :type jailbreak_data_path: Optional[Union[str, os.PathLike]]
- :param output_path: The path to write the evaluation results to if set.
+ :type jailbreak_data_path: Optional[Union[str, os.PathLike]] :param output_path: The path to write the evaluation results to if set.
   :type output_path: Optional[Union[str, os.PathLike]]
+ :param data_paths: A dictionary of data paths to evaluate. If None, the Simulator will be run.
+ :type data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]]
+ :param randomization_seed: The seed used to randomize prompt selection. If unset, the system's default seed is used.
+ :type randomization_seed: Optional[int]
+ :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
+ :type concurrent_async_tasks: Optional[int]
   '''
- ## Log inputs
- self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}")
+ ## Log inputs
+ self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}")

   ## Validate arguments
   self._validate_inputs(
@@ -779,6 +797,7 @@ class _SafetyEvaluation:
   tasks=tasks,
   source_text=source_text,
   direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
+ randomization_seed=randomization_seed,
   )
   elif data_path:
   data_paths = {Path(data_path).stem: data_path}
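The _SafetyEvaluation hunks above thread two new pass-through parameters, randomization_seed and concurrent_async_tasks, from the internal evaluation entry point down into the simulators (note the simulator keyword is the singular concurrent_async_task). Since _SafetyEvaluation is internal, a sketch of the public surface these feed into is more useful; the following assumes the AdversarialSimulator call accepts randomization_seed and concurrent_async_task as used in this diff, and that the project-URL string from the new docstrings is accepted as azure_ai_project:

    # Hedged sketch: reproducible, throttled adversarial simulation.
    import asyncio
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario

    # Placeholder project URL; replace with your own project.
    azure_ai_project = "https://{resource_name}.services.ai.azure.com/api/projects/{project_name}"

    async def callback(messages, stream=False, session_state=None, context=None):
        # Toy target: always refuses. A real target would call your application.
        messages["messages"].append({"role": "assistant", "content": "I can't help with that."})
        return {"messages": messages["messages"], "stream": stream,
                "session_state": session_state, "context": context}

    async def main():
        simulator = AdversarialSimulator(azure_ai_project=azure_ai_project,
                                         credential=DefaultAzureCredential())
        outputs = await simulator(
            scenario=AdversarialScenario.ADVERSARIAL_QA,
            target=callback,
            max_simulation_results=5,
            randomization_seed=42,       # reproducible prompt selection
            concurrent_async_task=5,     # throttle concurrent simulations
        )
        print(len(outputs))

    if __name__ == "__main__":
        asyncio.run(main())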
@@ -3,4 +3,4 @@
   # ---------------------------------------------------------
   # represents upcoming version

- VERSION = "1.6.0"
+ VERSION = "1.7.0"