azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (85)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  5. azure/ai/evaluation/_common/rai_service.py +3 -3
  6. azure/ai/evaluation/_common/utils.py +74 -17
  7. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  8. azure/ai/evaluation/_converters/_models.py +75 -26
  9. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  10. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  11. azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
  12. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
  13. azure/ai/evaluation/_evaluate/_utils.py +5 -2
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  15. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  21. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  22. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  23. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  24. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  25. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  29. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  30. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
  31. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  37. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
  38. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
  39. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  40. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
  42. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
  43. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  44. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
  45. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
  46. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  48. azure/ai/evaluation/_exceptions.py +1 -0
  49. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  50. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  51. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  52. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  53. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  54. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  55. azure/ai/evaluation/_version.py +1 -1
  56. azure/ai/evaluation/red_team/__init__.py +4 -3
  57. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  58. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  59. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  60. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  61. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  62. azure/ai/evaluation/red_team/_red_team.py +655 -2665
  63. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  64. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  65. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  66. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
  67. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  68. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  69. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  70. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  71. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  72. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  73. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  74. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  75. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  76. azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  78. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
  80. azure/ai/evaluation/simulator/_simulator.py +12 -0
  81. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
  82. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
  83. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  84. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  85. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_mlflow_integration.py (new file)

@@ -0,0 +1,322 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""
+MLflow integration module for Red Team Agent.
+
+This module handles MLflow run creation, logging, and tracking for red team evaluations.
+"""
+
+import json
+import os
+import tempfile
+import uuid
+from datetime import datetime
+from typing import Dict, Optional, cast
+from pathlib import Path
+
+# Azure AI Evaluation imports
+from azure.ai.evaluation._evaluate._eval_run import EvalRun
+from azure.ai.evaluation._evaluate._utils import _trace_destination_from_project_scope, _get_ai_studio_url
+from azure.ai.evaluation._evaluate._utils import extract_workspace_triad_from_trace_provider
+from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._azure._clients import LiteMLClient
+from azure.ai.evaluation._constants import EvaluationRunProperties, DefaultOpenEncoding
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common import RedTeamUpload, ResultType
+from azure.ai.evaluation._model_configurations import AzureAIProject
+
+# Local imports
+from ._red_team_result import RedTeamResult
+from ._utils.logging_utils import log_error
+
+
+class MLflowIntegration:
+    """Handles MLflow integration for red team evaluations."""
+
+    def __init__(self, logger, azure_ai_project, generated_rai_client, one_dp_project, scan_output_dir=None):
+        """Initialize the MLflow integration.
+
+        :param logger: Logger instance for logging
+        :param azure_ai_project: Azure AI project configuration
+        :param generated_rai_client: RAI client for service interactions
+        :param one_dp_project: Whether this is a OneDP project
+        :param scan_output_dir: Directory for scan outputs
+        """
+        self.logger = logger
+        self.azure_ai_project = azure_ai_project
+        self.generated_rai_client = generated_rai_client
+        self._one_dp_project = one_dp_project
+        self.scan_output_dir = scan_output_dir
+        self.ai_studio_url = None
+        self.trace_destination = None
+
+    def start_redteam_mlflow_run(
+        self,
+        azure_ai_project: Optional[AzureAIProject] = None,
+        run_name: Optional[str] = None,
+    ) -> EvalRun:
+        """Start an MLFlow run for the Red Team Agent evaluation.
+
+        :param azure_ai_project: Azure AI project details for logging
+        :type azure_ai_project: Optional[AzureAIProject]
+        :param run_name: Optional name for the MLFlow run
+        :type run_name: Optional[str]
+        :return: The MLFlow run object
+        :rtype: EvalRun
+        :raises EvaluationException: If no azure_ai_project is provided or trace destination cannot be determined
+        """
+        if not azure_ai_project:
+            log_error(self.logger, "No azure_ai_project provided, cannot upload run")
+            raise EvaluationException(
+                message="No azure_ai_project provided",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.RED_TEAM,
+            )
+
+        if self._one_dp_project:
+            response = self.generated_rai_client._evaluation_onedp_client.start_red_team_run(
+                red_team=RedTeamUpload(
+                    display_name=run_name or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
+                )
+            )
+
+            self.ai_studio_url = response.properties.get("AiStudioEvaluationUri")
+            return response
+
+        else:
+            trace_destination = _trace_destination_from_project_scope(azure_ai_project)
+            if not trace_destination:
+                self.logger.warning("Could not determine trace destination from project scope")
+                raise EvaluationException(
+                    message="Could not determine trace destination",
+                    blame=ErrorBlame.SYSTEM_ERROR,
+                    category=ErrorCategory.UNKNOWN,
+                    target=ErrorTarget.RED_TEAM,
+                )
+
+            ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+
+            management_client = LiteMLClient(
+                subscription_id=ws_triad.subscription_id,
+                resource_group=ws_triad.resource_group_name,
+                logger=self.logger,
+                credential=azure_ai_project.get("credential"),
+            )
+
+            tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri
+
+            run_display_name = run_name or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+            self.logger.debug(f"Starting MLFlow run with name: {run_display_name}")
+            eval_run = EvalRun(
+                run_name=run_display_name,
+                tracking_uri=cast(str, tracking_uri),
+                subscription_id=ws_triad.subscription_id,
+                group_name=ws_triad.resource_group_name,
+                workspace_name=ws_triad.workspace_name,
+                management_client=management_client,
+            )
+            eval_run._start_run()
+            self.logger.debug(f"MLFlow run started successfully with ID: {eval_run.info.run_id}")
+
+            self.trace_destination = trace_destination
+            self.logger.debug(f"MLFlow run created successfully with ID: {eval_run}")
+
+            self.ai_studio_url = _get_ai_studio_url(
+                trace_destination=self.trace_destination,
+                evaluation_id=eval_run.info.run_id,
+            )
+
+            return eval_run
+
+    async def log_redteam_results_to_mlflow(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: EvalRun,
+        red_team_info: Dict,
+        _skip_evals: bool = False,
+    ) -> Optional[str]:
+        """Log the Red Team Agent results to MLFlow.
+
+        :param redteam_result: The output from the red team agent evaluation
+        :type redteam_result: RedTeamResult
+        :param eval_run: The MLFlow run object
+        :type eval_run: EvalRun
+        :param red_team_info: Red team tracking information
+        :type red_team_info: Dict
+        :param _skip_evals: Whether to log only data without evaluation results
+        :type _skip_evals: bool
+        :return: The URL to the run in Azure AI Studio, if available
+        :rtype: Optional[str]
+        """
+        self.logger.debug(f"Logging results to MLFlow, _skip_evals={_skip_evals}")
+        artifact_name = "instance_results.json"
+        eval_info_name = "redteam_info.json"
+        properties = {}
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            if self.scan_output_dir:
+                artifact_path = os.path.join(self.scan_output_dir, artifact_name)
+                self.logger.debug(f"Saving artifact to scan output directory: {artifact_path}")
+                with open(artifact_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    if _skip_evals:
+                        # In _skip_evals mode, we write the conversations in conversation/messages format
+                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
+                    elif redteam_result.scan_result:
+                        # Create a copy to avoid modifying the original scan result
+                        result_with_conversations = (
+                            redteam_result.scan_result.copy() if isinstance(redteam_result.scan_result, dict) else {}
+                        )
+
+                        # Preserve all original fields needed for scorecard generation
+                        result_with_conversations["scorecard"] = result_with_conversations.get("scorecard", {})
+                        result_with_conversations["parameters"] = result_with_conversations.get("parameters", {})
+
+                        # Add conversations field with all conversation data including user messages
+                        result_with_conversations["conversations"] = redteam_result.attack_details or []
+
+                        # Keep original attack_details field to preserve compatibility with existing code
+                        if (
+                            "attack_details" not in result_with_conversations
+                            and redteam_result.attack_details is not None
+                        ):
+                            result_with_conversations["attack_details"] = redteam_result.attack_details
+
+                        json.dump(result_with_conversations, f)
+
+                eval_info_path = os.path.join(self.scan_output_dir, eval_info_name)
+                self.logger.debug(f"Saving evaluation info to scan output directory: {eval_info_path}")
+                with open(eval_info_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    # Remove evaluation_result from red_team_info before logging
+                    red_team_info_logged = {}
+                    for strategy, harms_dict in red_team_info.items():
+                        red_team_info_logged[strategy] = {}
+                        for harm, info_dict in harms_dict.items():
+                            # Create a copy to avoid modifying the original
+                            info_dict_copy = info_dict.copy()
+                            info_dict_copy.pop("evaluation_result", None)
+                            red_team_info_logged[strategy][harm] = info_dict_copy
+                    f.write(json.dumps(red_team_info_logged, indent=2))
+                self.logger.debug(f"Successfully wrote redteam_info.json to: {eval_info_path}")
+
+                # Also save a human-readable scorecard if available
+                if not _skip_evals and redteam_result.scan_result:
+                    from ._utils.formatting_utils import format_scorecard
+
+                    scorecard_path = os.path.join(self.scan_output_dir, "scorecard.txt")
+                    with open(scorecard_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                        f.write(format_scorecard(redteam_result.scan_result))
+                    self.logger.debug(f"Saved scorecard to: {scorecard_path}")
+
+                # Create a dedicated artifacts directory with proper structure for MLFlow
+                # First, create the main artifact file that MLFlow expects
+                with open(
+                    os.path.join(tmpdir, artifact_name),
+                    "w",
+                    encoding=DefaultOpenEncoding.WRITE,
+                ) as f:
+                    if _skip_evals:
+                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
+                    elif redteam_result.scan_result:
+                        json.dump(redteam_result.scan_result, f)
+
+                # Copy all relevant files to the temp directory
+                import shutil
+
+                for file in os.listdir(self.scan_output_dir):
+                    file_path = os.path.join(self.scan_output_dir, file)
+
+                    # Skip directories and log files if not in debug mode
+                    if os.path.isdir(file_path):
+                        continue
+                    if file.endswith(".log") and not os.environ.get("DEBUG"):
+                        continue
+                    if file.endswith(".gitignore"):
+                        continue
+                    if file == artifact_name:
+                        continue
+
+                    try:
+                        shutil.copy(file_path, os.path.join(tmpdir, file))
+                        self.logger.debug(f"Copied file to artifact directory: {file}")
+                    except Exception as e:
+                        self.logger.warning(f"Failed to copy file {file} to artifact directory: {str(e)}")
+
+                properties.update({"scan_output_dir": str(self.scan_output_dir)})
+            else:
+                # Use temporary directory as before if no scan output directory exists
+                artifact_file = Path(tmpdir) / artifact_name
+                with open(artifact_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    if _skip_evals:
+                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
+                    elif redteam_result.scan_result:
+                        json.dump(redteam_result.scan_result, f)
+                self.logger.debug(f"Logged artifact: {artifact_name}")
+
+            properties.update(
+                {
+                    "redteaming": "asr",
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+                }
+            )
+
+            metrics = {}
+            if redteam_result.scan_result:
+                scorecard = redteam_result.scan_result["scorecard"]
+                joint_attack_summary = scorecard["joint_risk_attack_summary"]
+
+                if joint_attack_summary:
+                    for risk_category_summary in joint_attack_summary:
+                        risk_category = risk_category_summary.get("risk_category").lower()
+                        for key, value in risk_category_summary.items():
+                            if key != "risk_category":
+                                metrics.update({f"{risk_category}_{key}": cast(float, value)})
+                                self.logger.debug(f"Logged metric: {risk_category}_{key} = {value}")
+
+            if self._one_dp_project:
+                try:
+                    create_evaluation_result_response = (
+                        self.generated_rai_client._evaluation_onedp_client.create_evaluation_result(
+                            name=uuid.uuid4(),
+                            path=tmpdir,
+                            metrics=metrics,
+                            result_type=ResultType.REDTEAM,
+                        )
+                    )
+
+                    update_run_response = self.generated_rai_client._evaluation_onedp_client.update_red_team_run(
+                        name=eval_run.id,
+                        red_team=RedTeamUpload(
+                            id=eval_run.id,
+                            display_name=eval_run.display_name
+                            or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
+                            status="Completed",
+                            outputs={
+                                "evaluationResultId": create_evaluation_result_response.id,
+                            },
+                            properties=properties,
+                        ),
+                    )
+                    self.logger.debug(f"Updated UploadRun: {update_run_response.id}")
+                except Exception as e:
+                    self.logger.warning(f"Failed to upload red team results to AI Foundry: {str(e)}")
+            else:
+                # Log the entire directory to MLFlow
+                try:
+                    eval_run.log_artifact(tmpdir, artifact_name)
+                    if self.scan_output_dir:
+                        eval_run.log_artifact(tmpdir, eval_info_name)
+                    self.logger.debug(f"Successfully logged artifacts directory to AI Foundry")
+                except Exception as e:
+                    self.logger.warning(f"Failed to log artifacts to AI Foundry: {str(e)}")

+                for k, v in metrics.items():
+                    eval_run.log_metric(k, v)
+                    self.logger.debug(f"Logged metric: {k} = {v}")
+
+                eval_run.write_properties_to_run_history(properties)
+                eval_run._end_run("FINISHED")
+
+        self.logger.info("Successfully logged results to AI Foundry")
+        return None
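
For orientation, the sketch below shows how this new helper appears to be driven, based only on the signatures in the diff above. It is illustrative and not part of the package diff: MLflowIntegration is an internal class that the RedTeam agent normally constructs for itself, and the logger, project scope, and RAI client values here are placeholders.

# Illustrative sketch only; not part of the diff above.
# Assumes the private module path shown in the file list; real callers live
# inside the RedTeam agent, which supplies its own RAI client instance.
import logging

from azure.ai.evaluation.red_team._mlflow_integration import MLflowIntegration

logger = logging.getLogger("redteam-example")

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder values
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
    "credential": None,  # e.g. a DefaultAzureCredential instance in real use
}

mlflow_integration = MLflowIntegration(
    logger=logger,
    azure_ai_project=azure_ai_project,
    generated_rai_client=None,  # placeholder; the agent passes its RAI client here
    one_dp_project=False,
    scan_output_dir=None,
)

# start_redteam_mlflow_run resolves the workspace from the project scope and
# returns an EvalRun; log_redteam_results_to_mlflow is a coroutine and would be
# awaited with the scan's RedTeamResult once the scan finishes, e.g.:
#
#     eval_run = mlflow_integration.start_redteam_mlflow_run(
#         azure_ai_project=azure_ai_project, run_name="redteam-agent-example"
#     )
#     await mlflow_integration.log_redteam_results_to_mlflow(
#         redteam_result=redteam_result,
#         eval_run=eval_run,
#         red_team_info=red_team_info,
#         _skip_evals=False,
#     )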