azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_mlflow_integration.py

@@ -12,7 +12,7 @@ import os
 import tempfile
 import uuid
 from datetime import datetime
-from typing import Dict, Optional, cast
+from typing import Any, Dict, List, Optional, Set, cast
 from pathlib import Path
 
 # Azure AI Evaluation imports
@@ -27,7 +27,14 @@ from azure.ai.evaluation._common import RedTeamUpload, ResultType
 from azure.ai.evaluation._model_configurations import AzureAIProject
 
 # Local imports
-from ._red_team_result import
+from ._red_team_result import (
+    RedTeamResult,
+    RedTeamRun,
+    ResultCount,
+    PerTestingCriteriaResult,
+    DataSource,
+    OutputItemsList,
+)
 from ._utils.logging_utils import log_error
 
 
@@ -50,6 +57,32 @@ class MLflowIntegration:
         self.scan_output_dir = scan_output_dir
         self.ai_studio_url = None
         self.trace_destination = None
+        self._run_id_override: Optional[str] = None
+        self._eval_id_override: Optional[str] = None
+        self._created_at_override: Optional[int] = None
+
+    def set_run_identity_overrides(
+        self,
+        *,
+        run_id: Optional[str] = None,
+        eval_id: Optional[str] = None,
+        created_at: Optional[Any] = None,
+    ) -> None:
+        """Allow callers to supply pre-existing identifiers for the run payload."""
+
+        self._run_id_override = str(run_id).strip() if run_id else None
+        self._eval_id_override = str(eval_id).strip() if eval_id else None
+
+        if created_at is None or created_at == "":
+            self._created_at_override = None
+        else:
+            if isinstance(created_at, datetime):
+                self._created_at_override = int(created_at.timestamp())
+            else:
+                try:
+                    self._created_at_override = int(created_at)
+                except (TypeError, ValueError):
+                    self._created_at_override = None
 
     def start_redteam_mlflow_run(
         self,
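The hunk above adds a `set_run_identity_overrides` hook so callers can pin the `run_id`, `eval_id`, and `created_at` used in the run payload. The `created_at` coercion accepts a datetime, an int, or a numeric string; anything else falls back to None. A standalone restatement of that coercion (the helper name here is illustrative, not part of the package):

from datetime import datetime, timezone

def coerce_created_at(created_at):
    # Mirrors the branches in set_run_identity_overrides above: datetimes
    # become int epoch seconds, int()-coercible values become ints, and
    # None, "", or un-coercible values become None.
    if created_at is None or created_at == "":
        return None
    if isinstance(created_at, datetime):
        return int(created_at.timestamp())
    try:
        return int(created_at)
    except (TypeError, ValueError):
        return None

print(coerce_created_at(datetime.now(timezone.utc)))  # epoch seconds, e.g. 1735689600
print(coerce_created_at("1735689600"))                # 1735689600
print(coerce_created_at("not-a-timestamp"))           # None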
@@ -136,6 +169,7 @@ class MLflowIntegration:
         eval_run: EvalRun,
         red_team_info: Dict,
         _skip_evals: bool = False,
+        aoai_summary: Optional["RedTeamRun"] = None,
     ) -> Optional[str]:
         """Log the Red Team Agent results to MLFlow.
 
@@ -147,43 +181,42 @@
         :type red_team_info: Dict
         :param _skip_evals: Whether to log only data without evaluation results
         :type _skip_evals: bool
+        :param aoai_summary: Pre-built AOAI-compatible summary (optional, will be built if not provided)
+        :type aoai_summary: Optional[RedTeamRun]
         :return: The URL to the run in Azure AI Studio, if available
         :rtype: Optional[str]
         """
         self.logger.debug(f"Logging results to MLFlow, _skip_evals={_skip_evals}")
         artifact_name = "instance_results.json"
+        results_name = "results.json"
         eval_info_name = "redteam_info.json"
         properties = {}
 
         with tempfile.TemporaryDirectory() as tmpdir:
             if self.scan_output_dir:
+                # Save new format as results.json
+                results_path = os.path.join(self.scan_output_dir, results_name)
+                self.logger.debug(f"Saving results to scan output directory: {results_path}")
+                with open(results_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    # Use provided aoai_summary
+                    if aoai_summary is None:
+                        self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                        raise ValueError("aoai_summary parameter is required but was not provided")
+
+                    payload = dict(aoai_summary)  # Make a copy
+                    json.dump(payload, f)
+
+                # Save legacy format as instance_results.json
                 artifact_path = os.path.join(self.scan_output_dir, artifact_name)
                 self.logger.debug(f"Saving artifact to scan output directory: {artifact_path}")
                 with open(artifact_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-
-
-
-
-
-
-
-                    )
-
-                    # Preserve all original fields needed for scorecard generation
-                    result_with_conversations["scorecard"] = result_with_conversations.get("scorecard", {})
-                    result_with_conversations["parameters"] = result_with_conversations.get("parameters", {})
-
-                    # Add conversations field with all conversation data including user messages
-                    result_with_conversations["conversations"] = redteam_result.attack_details or []
-
-                    # Keep original attack_details field to preserve compatibility with existing code
-                    if (
-                        "attack_details" not in result_with_conversations
-                        and redteam_result.attack_details is not None
-                    ):
-                        result_with_conversations["attack_details"] = redteam_result.attack_details
-
-                    json.dump(result_with_conversations, f)
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
 
                 eval_info_path = os.path.join(self.scan_output_dir, eval_info_name)
                 self.logger.debug(f"Saving evaluation info to scan output directory: {eval_info_path}")
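With this change the scan output directory holds both results.json (the new AOAI-compatible RedTeamRun summary) and instance_results.json (the legacy scan_result payload built by _build_instance_results_payload, shown in the final hunk). Note that although the new aoai_summary parameter defaults to None, every write path in this hunk and the two that follow raises if it is missing, so it is effectively required. A minimal sketch of the guard-and-copy pattern, with an illustrative payload (the keys shown are assumptions, not taken from this diff):

from typing import Any, Dict, Optional

def require_aoai_summary(aoai_summary: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    # Same guard the diff adds before each json.dump of results.json.
    if aoai_summary is None:
        raise ValueError("aoai_summary parameter is required but was not provided")
    return dict(aoai_summary)  # shallow copy, as in the diff

payload = require_aoai_summary({"object": "eval.run", "status": "completed"})
print(payload)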
@@ -210,16 +243,35 @@
                 self.logger.debug(f"Saved scorecard to: {scorecard_path}")
 
                 # Create a dedicated artifacts directory with proper structure for MLFlow
-                # First, create the main artifact file that MLFlow expects
+                # First, create the main artifact file that MLFlow expects (new format)
+                with open(
+                    os.path.join(tmpdir, results_name),
+                    "w",
+                    encoding=DefaultOpenEncoding.WRITE,
+                ) as f:
+                    # Use provided aoai_summary (required)
+                    if aoai_summary is None:
+                        self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                        raise ValueError("aoai_summary parameter is required but was not provided")
+
+                    payload = dict(aoai_summary)  # Make a copy
+                    # Remove conversations for MLFlow artifact
+                    payload.pop("conversations", None)
+                    json.dump(payload, f)
+
+                # Also create legacy instance_results.json for compatibility
                 with open(
                     os.path.join(tmpdir, artifact_name),
                     "w",
                     encoding=DefaultOpenEncoding.WRITE,
                 ) as f:
-
-
-
-
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
 
                 # Copy all relevant files to the temp directory
                 import shutil
@@ -246,12 +298,34 @@
                 properties.update({"scan_output_dir": str(self.scan_output_dir)})
             else:
                 # Use temporary directory as before if no scan output directory exists
+                results_file = Path(tmpdir) / results_name
+                with open(results_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    # Use provided aoai_summary (required)
+                    if aoai_summary is None:
+                        self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                        raise ValueError("aoai_summary parameter is required but was not provided")
+
+                    payload = dict(aoai_summary)  # Make a copy
+                    # Include conversations only if _skip_evals is True
+                    if _skip_evals and "conversations" not in payload:
+                        payload["conversations"] = (
+                            redteam_result.attack_details or redteam_result.scan_result.get("attack_details") or []
+                        )
+                    elif not _skip_evals:
+                        payload.pop("conversations", None)
+                    json.dump(payload, f)
+                self.logger.debug(f"Logged artifact: {results_name}")
+
+                # Also create legacy instance_results.json
                 artifact_file = Path(tmpdir) / artifact_name
                 with open(artifact_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-
-
-
-
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
                 self.logger.debug(f"Logged artifact: {artifact_name}")
 
                 properties.update(
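In the no-scan-output-directory branch above, conversations ride along in results.json only when _skip_evals is True; evaluated runs have them stripped. A simplified restatement of that gating (the real code also falls back to scan_result's attack_details, omitted here for brevity):

from typing import Any, Dict, List, Optional

def shape_results_payload(
    aoai_summary: Dict[str, Any],
    attack_details: Optional[List[Dict[str, Any]]],
    skip_evals: bool,
) -> Dict[str, Any]:
    payload = dict(aoai_summary)
    if skip_evals and "conversations" not in payload:
        # Data-only runs keep the raw conversations alongside the summary.
        payload["conversations"] = attack_details or []
    elif not skip_evals:
        # Evaluated runs drop conversations from results.json.
        payload.pop("conversations", None)
    return payload

print(shape_results_payload({}, [{"role": "user"}], skip_evals=True))
# {'conversations': [{'role': 'user'}]}
print(shape_results_payload({"conversations": ["x"]}, None, skip_evals=False))
# {}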
@@ -278,7 +352,7 @@
             try:
                 create_evaluation_result_response = (
                     self.generated_rai_client._evaluation_onedp_client.create_evaluation_result(
-                        name=uuid.uuid4(),
+                        name=str(uuid.uuid4()),
                         path=tmpdir,
                         metrics=metrics,
                         result_type=ResultType.REDTEAM,
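The one-line hunk above wraps uuid.uuid4() in str(). A plausible motivation (the exact failure inside create_evaluation_result is not shown in this diff) is that uuid.UUID objects are not strings and are not JSON-serializable:

import json
import uuid

name = uuid.uuid4()
print(type(name))  # <class 'uuid.UUID'>

try:
    json.dumps({"name": name})
except TypeError as exc:
    print(exc)  # Object of type UUID is not JSON serializable

# The fix passes the canonical 36-character string form instead.
print(json.dumps({"name": str(uuid.uuid4())}))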
@@ -320,3 +394,37 @@
 
         self.logger.info("Successfully logged results to AI Foundry")
         return None
+
+    def _build_instance_results_payload(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: Optional[Any] = None,
+        red_team_info: Optional[Dict] = None,
+        scan_name: Optional[str] = None,
+    ) -> Dict:
+        """Assemble the legacy structure for instance_results.json (scan_result format)."""
+
+        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
+
+        # Return the scan_result directly for legacy compatibility
+        # This maintains the old format that was expected previously
+        # Filter out AOAI_Compatible properties - those belong in results.json only
+        legacy_payload = (
+            {
+                k: v
+                for k, v in scan_result.items()
+                if k not in ["AOAI_Compatible_Summary", "AOAI_Compatible_Row_Results"]
+            }
+            if scan_result
+            else {}
+        )
+
+        # Ensure we have the basic required fields
+        if "scorecard" not in legacy_payload:
+            legacy_payload["scorecard"] = {}
+        if "parameters" not in legacy_payload:
+            legacy_payload["parameters"] = {}
+        if "attack_details" not in legacy_payload:
+            legacy_payload["attack_details"] = redteam_result.attack_details or []
+
+        return legacy_payload