azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_mlflow_integration.py

@@ -12,7 +12,7 @@ import os
 import tempfile
 import uuid
 from datetime import datetime
-from typing import Dict, Optional, cast
+from typing import Any, Dict, List, Optional, Set, cast
 from pathlib import Path

 # Azure AI Evaluation imports
@@ -27,7 +27,14 @@ from azure.ai.evaluation._common import RedTeamUpload, ResultType
 from azure.ai.evaluation._model_configurations import AzureAIProject

 # Local imports
-from ._red_team_result import RedTeamResult
+from ._red_team_result import (
+    RedTeamResult,
+    RedTeamRun,
+    ResultCount,
+    PerTestingCriteriaResult,
+    DataSource,
+    OutputItemsList,
+)
 from ._utils.logging_utils import log_error


@@ -50,6 +57,32 @@ class MLflowIntegration:
         self.scan_output_dir = scan_output_dir
         self.ai_studio_url = None
         self.trace_destination = None
+        self._run_id_override: Optional[str] = None
+        self._eval_id_override: Optional[str] = None
+        self._created_at_override: Optional[int] = None
+
+    def set_run_identity_overrides(
+        self,
+        *,
+        run_id: Optional[str] = None,
+        eval_id: Optional[str] = None,
+        created_at: Optional[Any] = None,
+    ) -> None:
+        """Allow callers to supply pre-existing identifiers for the run payload."""
+
+        self._run_id_override = str(run_id).strip() if run_id else None
+        self._eval_id_override = str(eval_id).strip() if eval_id else None
+
+        if created_at is None or created_at == "":
+            self._created_at_override = None
+        else:
+            if isinstance(created_at, datetime):
+                self._created_at_override = int(created_at.timestamp())
+            else:
+                try:
+                    self._created_at_override = int(created_at)
+                except (TypeError, ValueError):
+                    self._created_at_override = None

     def start_redteam_mlflow_run(
         self,
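As a reading aid for the hunk above, the `created_at` normalization in `set_run_identity_overrides` can be sketched as a standalone helper; `normalize_created_at` is an illustrative name, not part of the package:

```python
from datetime import datetime, timezone
from typing import Any, Optional


def normalize_created_at(created_at: Optional[Any]) -> Optional[int]:
    """Mirror of the override logic shown in the diff: accept a datetime,
    an int/str epoch value, or nothing, and return epoch seconds or None."""
    if created_at is None or created_at == "":
        return None
    if isinstance(created_at, datetime):
        return int(created_at.timestamp())
    try:
        return int(created_at)
    except (TypeError, ValueError):
        return None


# How the override resolves for a few representative inputs:
assert normalize_created_at(None) is None
assert normalize_created_at("1700000000") == 1700000000
assert normalize_created_at(datetime(2024, 1, 1, tzinfo=timezone.utc)) == 1704067200
assert normalize_created_at("not-a-number") is None
```

Naive datetimes fall back to the local timezone via `datetime.timestamp()`, which is why the example pins UTC explicitly.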
@@ -136,6 +169,7 @@ class MLflowIntegration:
         eval_run: EvalRun,
         red_team_info: Dict,
         _skip_evals: bool = False,
+        aoai_summary: Optional["RedTeamRun"] = None,
     ) -> Optional[str]:
         """Log the Red Team Agent results to MLFlow.

@@ -147,43 +181,42 @@
         :type red_team_info: Dict
         :param _skip_evals: Whether to log only data without evaluation results
         :type _skip_evals: bool
+        :param aoai_summary: Pre-built AOAI-compatible summary (optional, will be built if not provided)
+        :type aoai_summary: Optional[RedTeamRun]
         :return: The URL to the run in Azure AI Studio, if available
         :rtype: Optional[str]
         """
         self.logger.debug(f"Logging results to MLFlow, _skip_evals={_skip_evals}")
         artifact_name = "instance_results.json"
+        results_name = "results.json"
         eval_info_name = "redteam_info.json"
         properties = {}

         with tempfile.TemporaryDirectory() as tmpdir:
             if self.scan_output_dir:
+                # Save new format as results.json
+                results_path = os.path.join(self.scan_output_dir, results_name)
+                self.logger.debug(f"Saving results to scan output directory: {results_path}")
+                with open(results_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    # Use provided aoai_summary
+                    if aoai_summary is None:
+                        self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                        raise ValueError("aoai_summary parameter is required but was not provided")
+
+                    payload = dict(aoai_summary)  # Make a copy
+                    json.dump(payload, f)
+
+                # Save legacy format as instance_results.json
                 artifact_path = os.path.join(self.scan_output_dir, artifact_name)
                 self.logger.debug(f"Saving artifact to scan output directory: {artifact_path}")
                 with open(artifact_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-                    if _skip_evals:
-                        # In _skip_evals mode, we write the conversations in conversation/messages format
-                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
-                    elif redteam_result.scan_result:
-                        # Create a copy to avoid modifying the original scan result
-                        result_with_conversations = (
-                            redteam_result.scan_result.copy() if isinstance(redteam_result.scan_result, dict) else {}
-                        )
-
-                        # Preserve all original fields needed for scorecard generation
-                        result_with_conversations["scorecard"] = result_with_conversations.get("scorecard", {})
-                        result_with_conversations["parameters"] = result_with_conversations.get("parameters", {})
-
-                        # Add conversations field with all conversation data including user messages
-                        result_with_conversations["conversations"] = redteam_result.attack_details or []
-
-                        # Keep original attack_details field to preserve compatibility with existing code
-                        if (
-                            "attack_details" not in result_with_conversations
-                            and redteam_result.attack_details is not None
-                        ):
-                            result_with_conversations["attack_details"] = redteam_result.attack_details
-
-                        json.dump(result_with_conversations, f)
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)

                 eval_info_path = os.path.join(self.scan_output_dir, eval_info_name)
                 self.logger.debug(f"Saving evaluation info to scan output directory: {eval_info_path}")
@@ -210,16 +243,35 @@
                 self.logger.debug(f"Saved scorecard to: {scorecard_path}")

                 # Create a dedicated artifacts directory with proper structure for MLFlow
-                # First, create the main artifact file that MLFlow expects
+                # First, create the main artifact file that MLFlow expects (new format)
+                with open(
+                    os.path.join(tmpdir, results_name),
+                    "w",
+                    encoding=DefaultOpenEncoding.WRITE,
+                ) as f:
+                    # Use provided aoai_summary (required)
+                    if aoai_summary is None:
+                        self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                        raise ValueError("aoai_summary parameter is required but was not provided")
+
+                    payload = dict(aoai_summary)  # Make a copy
+                    # Remove conversations for MLFlow artifact
+                    payload.pop("conversations", None)
+                    json.dump(payload, f)
+
+                # Also create legacy instance_results.json for compatibility
                 with open(
                     os.path.join(tmpdir, artifact_name),
                     "w",
                     encoding=DefaultOpenEncoding.WRITE,
                 ) as f:
-                    if _skip_evals:
-                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
-                    elif redteam_result.scan_result:
-                        json.dump(redteam_result.scan_result, f)
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)

                 # Copy all relevant files to the temp directory
                 import shutil
@@ -246,12 +298,34 @@
                 properties.update({"scan_output_dir": str(self.scan_output_dir)})
             else:
                 # Use temporary directory as before if no scan output directory exists
+                results_file = Path(tmpdir) / results_name
+                with open(results_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    # Use provided aoai_summary (required)
+                    if aoai_summary is None:
+                        self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                        raise ValueError("aoai_summary parameter is required but was not provided")
+
+                    payload = dict(aoai_summary)  # Make a copy
+                    # Include conversations only if _skip_evals is True
+                    if _skip_evals and "conversations" not in payload:
+                        payload["conversations"] = (
+                            redteam_result.attack_details or redteam_result.scan_result.get("attack_details") or []
+                        )
+                    elif not _skip_evals:
+                        payload.pop("conversations", None)
+                    json.dump(payload, f)
+                self.logger.debug(f"Logged artifact: {results_name}")
+
+                # Also create legacy instance_results.json
                 artifact_file = Path(tmpdir) / artifact_name
                 with open(artifact_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-                    if _skip_evals:
-                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
-                    elif redteam_result.scan_result:
-                        json.dump(redteam_result.scan_result, f)
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
                 self.logger.debug(f"Logged artifact: {artifact_name}")

                 properties.update(
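Taken together, the branches above follow one rule: the AOAI-compatible summary is serialized to results.json (conversations are kept only in _skip_evals mode when there is no scan output directory), while the legacy payload goes to instance_results.json. Note that although the docstring still calls aoai_summary optional, every branch raises ValueError when it is None. A minimal standalone sketch of that layout, using hypothetical placeholder inputs and a hypothetical helper name:

```python
import json
import os
import tempfile
from typing import Any, Dict, List, Optional


def write_redteam_artifacts(
    out_dir: str,
    aoai_summary: Dict[str, Any],
    legacy_payload: Dict[str, Any],
    attack_details: Optional[List[Dict[str, Any]]],
    skip_evals: bool,
) -> None:
    """Sketch of the dual-artifact rule: new format to results.json,
    legacy scan_result shape to instance_results.json."""
    payload = dict(aoai_summary)  # copy so the caller's summary is untouched
    if skip_evals and "conversations" not in payload:
        payload["conversations"] = attack_details or []
    elif not skip_evals:
        payload.pop("conversations", None)

    with open(os.path.join(out_dir, "results.json"), "w", encoding="utf-8") as f:
        json.dump(payload, f)  # AOAI-compatible summary
    with open(os.path.join(out_dir, "instance_results.json"), "w", encoding="utf-8") as f:
        json.dump(legacy_payload, f)  # legacy payload


with tempfile.TemporaryDirectory() as tmpdir:
    write_redteam_artifacts(tmpdir, {"status": "completed"}, {"scorecard": {}}, [], skip_evals=False)
    print(sorted(os.listdir(tmpdir)))  # ['instance_results.json', 'results.json']
```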
@@ -278,7 +352,7 @@
         try:
             create_evaluation_result_response = (
                 self.generated_rai_client._evaluation_onedp_client.create_evaluation_result(
-                    name=uuid.uuid4(),
+                    name=str(uuid.uuid4()),
                     path=tmpdir,
                     metrics=metrics,
                     result_type=ResultType.REDTEAM,
@@ -320,3 +394,37 @@

         self.logger.info("Successfully logged results to AI Foundry")
         return None
+
+    def _build_instance_results_payload(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: Optional[Any] = None,
+        red_team_info: Optional[Dict] = None,
+        scan_name: Optional[str] = None,
+    ) -> Dict:
+        """Assemble the legacy structure for instance_results.json (scan_result format)."""
+
+        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
+
+        # Return the scan_result directly for legacy compatibility
+        # This maintains the old format that was expected previously
+        # Filter out AOAI_Compatible properties - those belong in results.json only
+        legacy_payload = (
+            {
+                k: v
+                for k, v in scan_result.items()
+                if k not in ["AOAI_Compatible_Summary", "AOAI_Compatible_Row_Results"]
+            }
+            if scan_result
+            else {}
+        )
+
+        # Ensure we have the basic required fields
+        if "scorecard" not in legacy_payload:
+            legacy_payload["scorecard"] = {}
+        if "parameters" not in legacy_payload:
+            legacy_payload["parameters"] = {}
+        if "attack_details" not in legacy_payload:
+            legacy_payload["attack_details"] = redteam_result.attack_details or []
+
+        return legacy_payload
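For intuition, the key filter in `_build_instance_results_payload` behaves like this on a hypothetical scan_result (key names taken from the diff, values are placeholders):

```python
# Hypothetical scan_result; only the key names matter for the filter.
scan_result = {
    "scorecard": {},
    "parameters": {},
    "attack_details": [],
    "AOAI_Compatible_Summary": {},
    "AOAI_Compatible_Row_Results": [],
}

# Keys reserved for results.json are dropped from the legacy payload.
legacy_payload = {
    k: v
    for k, v in scan_result.items()
    if k not in ["AOAI_Compatible_Summary", "AOAI_Compatible_Row_Results"]
}
print(sorted(legacy_payload))  # ['attack_details', 'parameters', 'scorecard']
```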