azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their public registry.
Files changed (136)
  1. azure/ai/evaluation/__init__.py +13 -2
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/score_model_grader.py +90 -0
  6. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  8. azure/ai/evaluation/_azure/_envs.py +9 -10
  9. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  10. azure/ai/evaluation/_common/constants.py +11 -2
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  13. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  14. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  15. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  16. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  17. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  18. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  20. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  21. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  22. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  23. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  24. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  25. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  26. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  27. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  28. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  29. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  30. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  31. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  32. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  33. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  34. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  35. azure/ai/evaluation/_common/rai_service.py +86 -50
  36. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  37. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  38. azure/ai/evaluation/_common/utils.py +124 -3
  39. azure/ai/evaluation/_constants.py +2 -1
  40. azure/ai/evaluation/_converters/__init__.py +1 -1
  41. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  42. azure/ai/evaluation/_converters/_models.py +46 -0
  43. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  44. azure/ai/evaluation/_eval_mapping.py +2 -2
  45. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +4 -4
  46. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  47. azure/ai/evaluation/_evaluate/_evaluate.py +60 -54
  48. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +130 -89
  49. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  50. azure/ai/evaluation/_evaluate/_utils.py +24 -15
  51. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +3 -3
  52. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +12 -11
  53. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +5 -5
  54. azure/ai/evaluation/_evaluators/_common/_base_eval.py +15 -5
  55. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  56. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +6 -1
  57. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +13 -13
  58. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +7 -7
  59. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +7 -7
  60. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +7 -7
  61. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +6 -6
  62. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  63. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +34 -64
  64. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -3
  65. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +4 -4
  66. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -2
  67. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +3 -3
  68. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -7
  69. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +30 -25
  70. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  71. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +2 -3
  72. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +6 -6
  73. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -4
  74. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +8 -13
  75. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -25
  76. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +4 -4
  77. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +21 -21
  78. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +5 -5
  79. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -3
  80. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -14
  81. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +43 -34
  82. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +3 -3
  83. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +12 -11
  84. azure/ai/evaluation/_evaluators/_xpia/xpia.py +6 -6
  85. azure/ai/evaluation/_exceptions.py +10 -0
  86. azure/ai/evaluation/_http_utils.py +3 -3
  87. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +3 -3
  88. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  89. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +5 -10
  90. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  91. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  92. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  93. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  94. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  95. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +193 -111
  96. azure/ai/evaluation/_user_agent.py +32 -1
  97. azure/ai/evaluation/_version.py +1 -1
  98. azure/ai/evaluation/red_team/__init__.py +3 -1
  99. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  100. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  101. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  102. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  103. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  104. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  105. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  106. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  107. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  108. azure/ai/evaluation/red_team/_red_team.py +1286 -739
  109. azure/ai/evaluation/red_team/_red_team_result.py +43 -38
  110. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  111. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +32 -32
  112. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  113. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  114. azure/ai/evaluation/red_team/_utils/constants.py +2 -12
  115. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  116. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  117. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  118. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  119. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  120. azure/ai/evaluation/simulator/_adversarial_simulator.py +26 -15
  121. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  122. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  123. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +5 -5
  124. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  125. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  126. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +10 -8
  127. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  128. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  129. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  130. azure/ai/evaluation/simulator/_simulator.py +9 -8
  131. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/METADATA +15 -1
  132. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/RECORD +135 -131
  133. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  134. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/NOTICE.txt +0 -0
  135. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/WHEEL +0 -0
  136. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.9.0.dist-info}/top_level.txt +0 -0
@@ -20,6 +20,7 @@ tool_provider = None
 azure_ai_project = None
 target_function = None
 
+
 def _get_tool_provider() -> RedTeamToolProvider:
     """Get or create the RedTeamToolProvider instance."""
     global credential, tool_provider, azure_ai_project
@@ -28,20 +29,24 @@ def _get_tool_provider() -> RedTeamToolProvider:
         tool_provider = RedTeamToolProvider(
             azure_ai_project=azure_ai_project,
             credential=credential,
-            application_scenario="A customer service chatbot for a retail website" # Optional context
+            application_scenario="A customer service chatbot for a retail website",  # Optional context
         )
     return tool_provider
 
+
 # Dictionary to store fetched prompts for reference
 fetched_prompts = {}
 
-def red_team_fetch_harmful_prompt(risk_category: str, strategy: str = "baseline", convert_with_strategy: Optional[str] = None) -> str:
+
+def red_team_fetch_harmful_prompt(
+    risk_category: str, strategy: str = "baseline", convert_with_strategy: Optional[str] = None
+) -> str:
     """
     Fetch a harmful prompt for a specific risk category to test content filters.
-
+
     This function retrieves a prompt that could potentially trigger content filters
     and can be used for red teaming AI systems.
-
+
     :param risk_category (str): The risk category to get a harmful prompt for (e.g., 'violence', 'hate_unfairness', 'sexual', 'self_harm').
     :param strategy (str, optional): The attack strategy to use (e.g., 'baseline', 'jailbreak'). Defaults to 'baseline'.
     :param convert_with_strategy (Optional[str], optional): Optional strategy to convert the prompt (e.g., 'morse_converter', 'binary_converter'). Defaults to None.
@@ -49,102 +54,97 @@ def red_team_fetch_harmful_prompt(risk_category: str, strategy: str = "baseline"
     :rtype: str
     """
     provider = _get_tool_provider()
-
+
     # Run the async method in a new event loop
-    result = asyncio.run(provider.fetch_harmful_prompt(
-        risk_category_text=risk_category,
-        strategy=strategy,
-        convert_with_strategy=convert_with_strategy
-    ))
-
+    result = asyncio.run(
+        provider.fetch_harmful_prompt(
+            risk_category_text=risk_category, strategy=strategy, convert_with_strategy=convert_with_strategy
+        )
+    )
+
     # Store the prompt for later conversion if successful
     if result["status"] == "success" and "prompt_id" in result:
         prompt_id = result["prompt_id"]
         if "prompt" in result:
             fetched_prompts[prompt_id] = result["prompt"]
-
+
     return json.dumps(result)
 
+
 def red_team_convert_prompt(prompt_or_id: str, strategy: str) -> str:
     """
     Convert a prompt or a previously fetched prompt ID using a specified strategy.
-
+
     This function can be used to transform prompts in ways that might bypass content filters
     for testing AI system defenses.
-
+
     :param prompt_or_id (str): Either a prompt text or a prompt ID from a previous fetch_harmful_prompt call.
     :param strategy (str): The strategy to use for conversion (e.g., 'morse_converter', 'binary_converter', 'base64_converter').
     :return: A JSON string containing the original and converted prompt.
     :rtype: str
     """
     provider = _get_tool_provider()
-
+
     # Check if input is a prompt ID we have stored
     if prompt_or_id in fetched_prompts:
         # Update the provider's cache
         provider._fetched_prompts[prompt_or_id] = fetched_prompts[prompt_or_id]
-
+
     # Run the async method in a new event loop
-    result = asyncio.run(provider.convert_prompt(
-        prompt_or_id=prompt_or_id,
-        strategy=strategy
-    ))
-
+    result = asyncio.run(provider.convert_prompt(prompt_or_id=prompt_or_id, strategy=strategy))
+
     return json.dumps(result)
 
+
 def red_team_unified(category: str, strategy: Optional[str] = None) -> str:
     """
     Get a harmful prompt for a specific risk category and optionally convert it.
-
+
     This function combines fetching and converting harmful prompts in one call,
     making it easier to test AI system defenses.
-
+
     :param category (str): The risk category to get a harmful prompt for (e.g., 'violence', 'hate_unfairness', 'sexual', 'self_harm').
     :param strategy (Optional[str], optional): Optional strategy to convert the prompt (e.g., 'morse_converter', 'binary_converter'). Defaults to None.
     :return: A JSON string containing the harmful prompt and related metadata.
     :rtype: str
     """
     provider = _get_tool_provider()
-
+
     # Run the async method in a new event loop
-    result = asyncio.run(provider.red_team(
-        category=category,
-        strategy=strategy
-    ))
-
+    result = asyncio.run(provider.red_team(category=category, strategy=strategy))
+
     # Store the prompt for later conversion if it's a success and we didn't convert it
     if result["status"] == "success":
         if "prompt_id" in result and "prompt" in result:
             prompt_id = result["prompt_id"]
             fetched_prompts[prompt_id] = result["prompt"]
-
+
     return json.dumps(result)
 
+
 def red_team_get_available_strategies() -> str:
     """
     Get a list of all available prompt conversion strategies.
-
+
     This function lists all the strategies that can be used to convert prompts
     for red teaming purposes.
-
+
     :return: A JSON string containing a list of available conversion strategies.
     :rtype: str
     """
     provider = _get_tool_provider()
     strategies = provider.get_available_strategies()
-
-    return json.dumps({
-        "status": "success",
-        "available_strategies": strategies
-    })
+
+    return json.dumps({"status": "success", "available_strategies": strategies})
+
 
 def red_team_explain_purpose() -> str:
     """
     Explain the purpose and responsible use of red teaming tools.
-
+
     This function provides information about the ethical use of red teaming tools
     and their purpose in improving AI system safety.
-
+
     :return: A JSON string containing information about responsible use of red teaming tools.
     :rtype: str
     """
@@ -153,53 +153,49 @@ def red_team_explain_purpose() -> str:
         "responsible_use": [
             "Only use these tools on systems you have permission to test.",
             "Use the results to improve system safety and robustness.",
-            "Do not use these tools to cause harm or bypass legitimate safety measures in production systems."
+            "Do not use these tools to cause harm or bypass legitimate safety measures in production systems.",
         ],
         "risk_categories": {
             "violence": "Content related to physical harm, weapons, or dangerous activities",
             "hate_unfairness": "Content related to discrimination, bias, or prejudice",
             "sexual": "Explicit sexual content",
-            "self_harm": "Content related to suicide, self-injury, or harmful behaviors"
+            "self_harm": "Content related to suicide, self-injury, or harmful behaviors",
         },
-        "conversion_strategies": "Various encoding and obfuscation methods to test if harmful content can bypass detection"
+        "conversion_strategies": "Various encoding and obfuscation methods to test if harmful content can bypass detection",
     }
-
+
     return json.dumps(explanation)
 
+
 def red_team_send_to_target(prompt: str) -> str:
     """
     Send a prompt to the target function (e.g., call_ollama) and return the response.
-
-    This function allows sending prompts to an external model or service for 
+
+    This function allows sending prompts to an external model or service for
     evaluation.
-
+
     :param prompt (str): The prompt text to send to the target function.
     :return: A JSON string containing the response from the target function.
     :rtype: str
     """
     global target_function
-
+
     if target_function is None:
-        return json.dumps({
-            "status": "error",
-            "message": "Target function not initialized. Make sure to pass a target_func to initialize_tool_provider."
-        })
-
+        return json.dumps(
+            {
+                "status": "error",
+                "message": "Target function not initialized. Make sure to pass a target_func to initialize_tool_provider.",
+            }
+        )
+
     try:
         # Call the target function with the prompt
         response = target_function(prompt)
-
-        return json.dumps({
-            "status": "success",
-            "prompt": prompt,
-            "response": response
-        })
+
+        return json.dumps({"status": "success", "prompt": prompt, "response": response})
     except Exception as e:
-        return json.dumps({
-            "status": "error",
-            "message": f"Error calling target function: {str(e)}",
-            "prompt": prompt
-        })
+        return json.dumps({"status": "error", "message": f"Error calling target function: {str(e)}", "prompt": prompt})
+
 
 # Example User Input for Each Function
 # 1. Red Team Fetch Harmful Prompt
@@ -228,17 +224,18 @@ user_functions: Set[Callable[..., Any]] = {
     red_team_unified,
     red_team_get_available_strategies,
     red_team_explain_purpose,
-    red_team_send_to_target
+    red_team_send_to_target,
 }
 
+
 def initialize_tool_provider(
-        projects_connection_string: str,
-        target_func: Optional[Callable[[str], str]] = None,
-    ) -> Set[Callable[..., Any]]:
+    projects_connection_string: str,
+    target_func: Optional[Callable[[str], str]] = None,
+) -> Set[Callable[..., Any]]:
     """
     Initialize the RedTeamToolProvider with the Azure AI project and credential.
     This function is called when the module is imported.
-
+
     :param projects_connection_string: The Azure AI project connection string.
     :param target_func: A function that takes a string prompt and returns a string response.
     :return: A set of callable functions that can be used as tools.
@@ -246,14 +243,14 @@ def initialize_tool_provider(
     # projects_connection_string is in the format: connection_string;subscription_id;resource_group;project_name
     # parse it to a dictionary called azure_ai_project
    global azure_ai_project, credential, tool_provider, target_function
-
+
     # Store the target function for later use
     if target_func is not None:
-        globals()['target_function'] = target_func
+        globals()["target_function"] = target_func
     azure_ai_project = {
         "subscription_id": projects_connection_string.split(";")[1],
         "resource_group_name": projects_connection_string.split(";")[2],
-        "project_name": projects_connection_string.split(";")[3]
+        "project_name": projects_connection_string.split(";")[3],
     }
     if not credential:
         credential = DefaultAzureCredential()
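
For orientation, the hunks above appear to belong to azure/ai/evaluation/red_team/_agent/_agent_functions.py (entry 100 in the file list); the changes are formatting-only. The following is a minimal usage sketch based solely on the signatures and docstrings visible in this diff: the module path, the placeholder connection-string values, and the echo_target helper are assumptions and are not verified against the released wheel.

import json

from azure.ai.evaluation.red_team._agent._agent_functions import (
    initialize_tool_provider,
    red_team_convert_prompt,
    red_team_fetch_harmful_prompt,
)


def echo_target(prompt: str) -> str:
    # Stand-in target; replace with a real call to the system under test.
    return f"echo: {prompt}"


# Connection string format, per the module's own comment:
# connection_string;subscription_id;resource_group;project_name
tools = initialize_tool_provider(
    "<connection_string>;<subscription_id>;<resource_group>;<project_name>",
    target_func=echo_target,
)

# Each tool returns a JSON string; fetch a prompt, then convert it by its ID.
result = json.loads(red_team_fetch_harmful_prompt("violence", strategy="baseline"))
if result.get("status") == "success" and "prompt_id" in result:
    converted = red_team_convert_prompt(prompt_or_id=result["prompt_id"], strategy="base64_converter")

The target_func passed to initialize_tool_provider is stored in the module-level target_function global, which is what red_team_send_to_target calls, so the echo function above is only a stand-in for a real model endpoint.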