azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published in their public registries.
Note: this release of azure-ai-evaluation was flagged as potentially problematic by the registry scanner.
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +77 -33
- azure/ai/evaluation/_evaluate/_utils.py +4 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +697 -3067
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +32 -2
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +49 -41
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
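
The listing shows 1.11.0 splitting the red_team package into dedicated helper modules (_evaluation_processor, _mlflow_integration, _orchestrator_manager, _result_processor, and several new _utils files); the refactor removes roughly 3,070 lines from _red_team.py while adding about 700. As a quick sanity check after upgrading, the installed version can be confirmed at runtime. This is a minimal sketch using the standard importlib.metadata API; the 1.11.0 pin mirrors the wheel names above, and azure.ai.evaluation._version.VERSION is the constant the diff itself imports:

    # pip install "azure-ai-evaluation==1.11.0"
    from importlib.metadata import version

    from azure.ai.evaluation._version import VERSION  # version constant bumped in this release

    print(version("azure-ai-evaluation"))  # distribution metadata, e.g. "1.11.0"
    print(VERSION)                         # should agree with the installed wheel

The largest new file, _mlflow_integration.py, is shown in full below.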
azure/ai/evaluation/red_team/_mlflow_integration.py (new file)

@@ -0,0 +1,322 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""
+MLflow integration module for Red Team Agent.
+
+This module handles MLflow run creation, logging, and tracking for red team evaluations.
+"""
+
+import json
+import os
+import tempfile
+import uuid
+from datetime import datetime
+from typing import Dict, Optional, cast
+from pathlib import Path
+
+# Azure AI Evaluation imports
+from azure.ai.evaluation._evaluate._eval_run import EvalRun
+from azure.ai.evaluation._evaluate._utils import _trace_destination_from_project_scope, _get_ai_studio_url
+from azure.ai.evaluation._evaluate._utils import extract_workspace_triad_from_trace_provider
+from azure.ai.evaluation._version import VERSION
+from azure.ai.evaluation._azure._clients import LiteMLClient
+from azure.ai.evaluation._constants import EvaluationRunProperties, DefaultOpenEncoding
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common import RedTeamUpload, ResultType
+from azure.ai.evaluation._model_configurations import AzureAIProject
+
+# Local imports
+from ._red_team_result import RedTeamResult
+from ._utils.logging_utils import log_error
+
+
+class MLflowIntegration:
+    """Handles MLflow integration for red team evaluations."""
+
+    def __init__(self, logger, azure_ai_project, generated_rai_client, one_dp_project, scan_output_dir=None):
+        """Initialize the MLflow integration.
+
+        :param logger: Logger instance for logging
+        :param azure_ai_project: Azure AI project configuration
+        :param generated_rai_client: RAI client for service interactions
+        :param one_dp_project: Whether this is a OneDP project
+        :param scan_output_dir: Directory for scan outputs
+        """
+        self.logger = logger
+        self.azure_ai_project = azure_ai_project
+        self.generated_rai_client = generated_rai_client
+        self._one_dp_project = one_dp_project
+        self.scan_output_dir = scan_output_dir
+        self.ai_studio_url = None
+        self.trace_destination = None
+
+    def start_redteam_mlflow_run(
+        self,
+        azure_ai_project: Optional[AzureAIProject] = None,
+        run_name: Optional[str] = None,
+    ) -> EvalRun:
+        """Start an MLFlow run for the Red Team Agent evaluation.
+
+        :param azure_ai_project: Azure AI project details for logging
+        :type azure_ai_project: Optional[AzureAIProject]
+        :param run_name: Optional name for the MLFlow run
+        :type run_name: Optional[str]
+        :return: The MLFlow run object
+        :rtype: EvalRun
+        :raises EvaluationException: If no azure_ai_project is provided or trace destination cannot be determined
+        """
+        if not azure_ai_project:
+            log_error(self.logger, "No azure_ai_project provided, cannot upload run")
+            raise EvaluationException(
+                message="No azure_ai_project provided",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.RED_TEAM,
+            )
+
+        if self._one_dp_project:
+            response = self.generated_rai_client._evaluation_onedp_client.start_red_team_run(
+                red_team=RedTeamUpload(
+                    display_name=run_name or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
+                )
+            )
+
+            self.ai_studio_url = response.properties.get("AiStudioEvaluationUri")
+            return response
+
+        else:
+            trace_destination = _trace_destination_from_project_scope(azure_ai_project)
+            if not trace_destination:
+                self.logger.warning("Could not determine trace destination from project scope")
+                raise EvaluationException(
+                    message="Could not determine trace destination",
+                    blame=ErrorBlame.SYSTEM_ERROR,
+                    category=ErrorCategory.UNKNOWN,
+                    target=ErrorTarget.RED_TEAM,
+                )
+
+            ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+
+            management_client = LiteMLClient(
+                subscription_id=ws_triad.subscription_id,
+                resource_group=ws_triad.resource_group_name,
+                logger=self.logger,
+                credential=azure_ai_project.get("credential"),
+            )
+
+            tracking_uri = management_client.workspace_get_info(ws_triad.workspace_name).ml_flow_tracking_uri
+
+            run_display_name = run_name or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+            self.logger.debug(f"Starting MLFlow run with name: {run_display_name}")
+            eval_run = EvalRun(
+                run_name=run_display_name,
+                tracking_uri=cast(str, tracking_uri),
+                subscription_id=ws_triad.subscription_id,
+                group_name=ws_triad.resource_group_name,
+                workspace_name=ws_triad.workspace_name,
+                management_client=management_client,
+            )
+            eval_run._start_run()
+            self.logger.debug(f"MLFlow run started successfully with ID: {eval_run.info.run_id}")
+
+            self.trace_destination = trace_destination
+            self.logger.debug(f"MLFlow run created successfully with ID: {eval_run}")
+
+            self.ai_studio_url = _get_ai_studio_url(
+                trace_destination=self.trace_destination,
+                evaluation_id=eval_run.info.run_id,
+            )
+
+            return eval_run
+
+    async def log_redteam_results_to_mlflow(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: EvalRun,
+        red_team_info: Dict,
+        _skip_evals: bool = False,
+    ) -> Optional[str]:
+        """Log the Red Team Agent results to MLFlow.
+
+        :param redteam_result: The output from the red team agent evaluation
+        :type redteam_result: RedTeamResult
+        :param eval_run: The MLFlow run object
+        :type eval_run: EvalRun
+        :param red_team_info: Red team tracking information
+        :type red_team_info: Dict
+        :param _skip_evals: Whether to log only data without evaluation results
+        :type _skip_evals: bool
+        :return: The URL to the run in Azure AI Studio, if available
+        :rtype: Optional[str]
+        """
+        self.logger.debug(f"Logging results to MLFlow, _skip_evals={_skip_evals}")
+        artifact_name = "instance_results.json"
+        eval_info_name = "redteam_info.json"
+        properties = {}
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            if self.scan_output_dir:
+                artifact_path = os.path.join(self.scan_output_dir, artifact_name)
+                self.logger.debug(f"Saving artifact to scan output directory: {artifact_path}")
+                with open(artifact_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    if _skip_evals:
+                        # In _skip_evals mode, we write the conversations in conversation/messages format
+                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
+                    elif redteam_result.scan_result:
+                        # Create a copy to avoid modifying the original scan result
+                        result_with_conversations = (
+                            redteam_result.scan_result.copy() if isinstance(redteam_result.scan_result, dict) else {}
+                        )
+
+                        # Preserve all original fields needed for scorecard generation
+                        result_with_conversations["scorecard"] = result_with_conversations.get("scorecard", {})
+                        result_with_conversations["parameters"] = result_with_conversations.get("parameters", {})
+
+                        # Add conversations field with all conversation data including user messages
+                        result_with_conversations["conversations"] = redteam_result.attack_details or []
+
+                        # Keep original attack_details field to preserve compatibility with existing code
+                        if (
+                            "attack_details" not in result_with_conversations
+                            and redteam_result.attack_details is not None
+                        ):
+                            result_with_conversations["attack_details"] = redteam_result.attack_details
+
+                        json.dump(result_with_conversations, f)
+
+                eval_info_path = os.path.join(self.scan_output_dir, eval_info_name)
+                self.logger.debug(f"Saving evaluation info to scan output directory: {eval_info_path}")
+                with open(eval_info_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    # Remove evaluation_result from red_team_info before logging
+                    red_team_info_logged = {}
+                    for strategy, harms_dict in red_team_info.items():
+                        red_team_info_logged[strategy] = {}
+                        for harm, info_dict in harms_dict.items():
+                            # Create a copy to avoid modifying the original
+                            info_dict_copy = info_dict.copy()
+                            info_dict_copy.pop("evaluation_result", None)
+                            red_team_info_logged[strategy][harm] = info_dict_copy
+                    f.write(json.dumps(red_team_info_logged, indent=2))
+                self.logger.debug(f"Successfully wrote redteam_info.json to: {eval_info_path}")
+
+                # Also save a human-readable scorecard if available
+                if not _skip_evals and redteam_result.scan_result:
+                    from ._utils.formatting_utils import format_scorecard
+
+                    scorecard_path = os.path.join(self.scan_output_dir, "scorecard.txt")
+                    with open(scorecard_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                        f.write(format_scorecard(redteam_result.scan_result))
+                    self.logger.debug(f"Saved scorecard to: {scorecard_path}")
+
+                # Create a dedicated artifacts directory with proper structure for MLFlow
+                # First, create the main artifact file that MLFlow expects
+                with open(
+                    os.path.join(tmpdir, artifact_name),
+                    "w",
+                    encoding=DefaultOpenEncoding.WRITE,
+                ) as f:
+                    if _skip_evals:
+                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
+                    elif redteam_result.scan_result:
+                        json.dump(redteam_result.scan_result, f)
+
+                # Copy all relevant files to the temp directory
+                import shutil
+
+                for file in os.listdir(self.scan_output_dir):
+                    file_path = os.path.join(self.scan_output_dir, file)
+
+                    # Skip directories and log files if not in debug mode
+                    if os.path.isdir(file_path):
+                        continue
+                    if file.endswith(".log") and not os.environ.get("DEBUG"):
+                        continue
+                    if file.endswith(".gitignore"):
+                        continue
+                    if file == artifact_name:
+                        continue
+
+                    try:
+                        shutil.copy(file_path, os.path.join(tmpdir, file))
+                        self.logger.debug(f"Copied file to artifact directory: {file}")
+                    except Exception as e:
+                        self.logger.warning(f"Failed to copy file {file} to artifact directory: {str(e)}")
+
+                properties.update({"scan_output_dir": str(self.scan_output_dir)})
+            else:
+                # Use temporary directory as before if no scan output directory exists
+                artifact_file = Path(tmpdir) / artifact_name
+                with open(artifact_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    if _skip_evals:
+                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
+                    elif redteam_result.scan_result:
+                        json.dump(redteam_result.scan_result, f)
+                self.logger.debug(f"Logged artifact: {artifact_name}")
+
+            properties.update(
+                {
+                    "redteaming": "asr",
+                    EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
+                }
+            )
+
+            metrics = {}
+            if redteam_result.scan_result:
+                scorecard = redteam_result.scan_result["scorecard"]
+                joint_attack_summary = scorecard["joint_risk_attack_summary"]
+
+                if joint_attack_summary:
+                    for risk_category_summary in joint_attack_summary:
+                        risk_category = risk_category_summary.get("risk_category").lower()
+                        for key, value in risk_category_summary.items():
+                            if key != "risk_category":
+                                metrics.update({f"{risk_category}_{key}": cast(float, value)})
+                                self.logger.debug(f"Logged metric: {risk_category}_{key} = {value}")
+
+            if self._one_dp_project:
+                try:
+                    create_evaluation_result_response = (
+                        self.generated_rai_client._evaluation_onedp_client.create_evaluation_result(
+                            name=uuid.uuid4(),
+                            path=tmpdir,
+                            metrics=metrics,
+                            result_type=ResultType.REDTEAM,
+                        )
+                    )
+
+                    update_run_response = self.generated_rai_client._evaluation_onedp_client.update_red_team_run(
+                        name=eval_run.id,
+                        red_team=RedTeamUpload(
+                            id=eval_run.id,
+                            display_name=eval_run.display_name
+                            or f"redteam-agent-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
+                            status="Completed",
+                            outputs={
+                                "evaluationResultId": create_evaluation_result_response.id,
+                            },
+                            properties=properties,
+                        ),
+                    )
+                    self.logger.debug(f"Updated UploadRun: {update_run_response.id}")
+                except Exception as e:
+                    self.logger.warning(f"Failed to upload red team results to AI Foundry: {str(e)}")
+            else:
+                # Log the entire directory to MLFlow
+                try:
+                    eval_run.log_artifact(tmpdir, artifact_name)
+                    if self.scan_output_dir:
+                        eval_run.log_artifact(tmpdir, eval_info_name)
+                    self.logger.debug(f"Successfully logged artifacts directory to AI Foundry")
+                except Exception as e:
+                    self.logger.warning(f"Failed to log artifacts to AI Foundry: {str(e)}")
+
+                for k, v in metrics.items():
+                    eval_run.log_metric(k, v)
+                    self.logger.debug(f"Logged metric: {k} = {v}")
+
+                eval_run.write_properties_to_run_history(properties)
+                eval_run._end_run("FINISHED")
+
+        self.logger.info("Successfully logged results to AI Foundry")
+        return None