azure-ai-evaluation 1.11.0__py3-none-any.whl → 1.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
- azure/ai/evaluation/_aoai/label_grader.py +8 -3
- azure/ai/evaluation/_aoai/python_grader.py +8 -3
- azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
- azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
- azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
- azure/ai/evaluation/_eval_mapping.py +2 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +279 -50
- azure/ai/evaluation/_evaluate/_utils.py +7 -3
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
- azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
- azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
- azure/ai/evaluation/red_team/_red_team.py +9 -0
- azure/ai/evaluation/red_team/_red_team_result.py +230 -1
- azure/ai/evaluation/red_team/_result_processor.py +416 -23
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +19 -3
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.0.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_mlflow_integration.py

@@ -12,7 +12,7 @@ import os
 import tempfile
 import uuid
 from datetime import datetime
-from typing import Dict, Optional, cast
+from typing import Any, Dict, List, Optional, Set, cast
 from pathlib import Path
 
 # Azure AI Evaluation imports
@@ -27,7 +27,14 @@ from azure.ai.evaluation._common import RedTeamUpload, ResultType
 from azure.ai.evaluation._model_configurations import AzureAIProject
 
 # Local imports
-from ._red_team_result import
+from ._red_team_result import (
+    RedTeamResult,
+    RedTeamRun,
+    ResultCount,
+    PerTestingCriteriaResult,
+    DataSource,
+    OutputItemsList,
+)
 from ._utils.logging_utils import log_error
 
 
@@ -50,6 +57,32 @@ class MLflowIntegration:
         self.scan_output_dir = scan_output_dir
         self.ai_studio_url = None
         self.trace_destination = None
+        self._run_id_override: Optional[str] = None
+        self._eval_id_override: Optional[str] = None
+        self._created_at_override: Optional[int] = None
+
+    def set_run_identity_overrides(
+        self,
+        *,
+        run_id: Optional[str] = None,
+        eval_id: Optional[str] = None,
+        created_at: Optional[Any] = None,
+    ) -> None:
+        """Allow callers to supply pre-existing identifiers for the run payload."""
+
+        self._run_id_override = str(run_id).strip() if run_id else None
+        self._eval_id_override = str(eval_id).strip() if eval_id else None
+
+        if created_at is None or created_at == "":
+            self._created_at_override = None
+        else:
+            if isinstance(created_at, datetime):
+                self._created_at_override = int(created_at.timestamp())
+            else:
+                try:
+                    self._created_at_override = int(created_at)
+                except (TypeError, ValueError):
+                    self._created_at_override = None
 
     def start_redteam_mlflow_run(
         self,
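The override setter above only stores normalized values; the payload builders added later in this diff fall back to MLflow run metadata or fresh UUIDs when no override is present. For reference, a minimal standalone sketch of the created_at normalization it applies (normalize_created_at is an illustrative helper, not part of the package): datetimes become integer epoch seconds, ints and numeric strings are coerced with int(), and anything else is dropped.

    from datetime import datetime, timezone
    from typing import Any, Optional

    def normalize_created_at(created_at: Any) -> Optional[int]:
        # Mirrors the branch in set_run_identity_overrides: empty values clear the
        # override, datetimes are converted to epoch seconds, other values go
        # through int() and fall back to None on failure.
        if created_at is None or created_at == "":
            return None
        if isinstance(created_at, datetime):
            return int(created_at.timestamp())
        try:
            return int(created_at)
        except (TypeError, ValueError):
            return None

    assert normalize_created_at(datetime(2024, 5, 1, tzinfo=timezone.utc)) == 1714521600
    assert normalize_created_at("1714521600") == 1714521600
    assert normalize_created_at("not-a-timestamp") is None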
@@ -152,38 +185,36 @@ class MLflowIntegration:
         """
         self.logger.debug(f"Logging results to MLFlow, _skip_evals={_skip_evals}")
         artifact_name = "instance_results.json"
+        results_name = "results.json"
         eval_info_name = "redteam_info.json"
         properties = {}
 
         with tempfile.TemporaryDirectory() as tmpdir:
             if self.scan_output_dir:
+                # Save new format as results.json
+                results_path = os.path.join(self.scan_output_dir, results_name)
+                self.logger.debug(f"Saving results to scan output directory: {results_path}")
+                with open(results_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    payload = self._build_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        include_conversations=True,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(payload, f)
+
+                # Save legacy format as instance_results.json
                 artifact_path = os.path.join(self.scan_output_dir, artifact_name)
                 self.logger.debug(f"Saving artifact to scan output directory: {artifact_path}")
                 with open(artifact_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-
-
-
-
-
-
-
-                    )
-
-                    # Preserve all original fields needed for scorecard generation
-                    result_with_conversations["scorecard"] = result_with_conversations.get("scorecard", {})
-                    result_with_conversations["parameters"] = result_with_conversations.get("parameters", {})
-
-                    # Add conversations field with all conversation data including user messages
-                    result_with_conversations["conversations"] = redteam_result.attack_details or []
-
-                    # Keep original attack_details field to preserve compatibility with existing code
-                    if (
-                        "attack_details" not in result_with_conversations
-                        and redteam_result.attack_details is not None
-                    ):
-                        result_with_conversations["attack_details"] = redteam_result.attack_details
-
-                    json.dump(result_with_conversations, f)
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
 
                 eval_info_path = os.path.join(self.scan_output_dir, eval_info_name)
                 self.logger.debug(f"Saving evaluation info to scan output directory: {eval_info_path}")
@@ -210,16 +241,34 @@ class MLflowIntegration:
                 self.logger.debug(f"Saved scorecard to: {scorecard_path}")
 
                 # Create a dedicated artifacts directory with proper structure for MLFlow
-                # First, create the main artifact file that MLFlow expects
+                # First, create the main artifact file that MLFlow expects (new format)
+                with open(
+                    os.path.join(tmpdir, results_name),
+                    "w",
+                    encoding=DefaultOpenEncoding.WRITE,
+                ) as f:
+                    payload = self._build_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        include_conversations=False,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(payload, f)
+
+                # Also create legacy instance_results.json for compatibility
                 with open(
                     os.path.join(tmpdir, artifact_name),
                     "w",
                     encoding=DefaultOpenEncoding.WRITE,
                 ) as f:
-
-
-
-
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
 
                 # Copy all relevant files to the temp directory
                 import shutil
@@ -246,12 +295,28 @@ class MLflowIntegration:
                 properties.update({"scan_output_dir": str(self.scan_output_dir)})
             else:
                 # Use temporary directory as before if no scan output directory exists
+                results_file = Path(tmpdir) / results_name
+                with open(results_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    payload = self._build_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        include_conversations=_skip_evals,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(payload, f)
+                self.logger.debug(f"Logged artifact: {results_name}")
+
+                # Also create legacy instance_results.json
                 artifact_file = Path(tmpdir) / artifact_name
                 with open(artifact_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-
-
-
-
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
                 self.logger.debug(f"Logged artifact: {artifact_name}")
 
             properties.update(
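With this change each run is written twice: results.json carries the new eval.run document and instance_results.json keeps the legacy scan_result shape, both in the scan output directory (when one is configured) and in the MLflow artifact directory. A minimal sketch of reading the two artifacts back from a scan output directory (the directory name is illustrative, not taken from this diff):

    import json
    from pathlib import Path

    scan_dir = Path(".scan_my_redteam_run")  # placeholder path

    new_format = json.loads((scan_dir / "results.json").read_text())
    legacy = json.loads((scan_dir / "instance_results.json").read_text())

    print(new_format["object"])        # "eval.run"
    print(new_format["result_count"])  # {"total": ..., "passed": ..., "failed": ..., "errored": ...}
    print(sorted(legacy))              # legacy scan_result keys such as "scorecard", "parameters", "output_items"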
@@ -320,3 +385,357 @@ class MLflowIntegration:
 
         self.logger.info("Successfully logged results to AI Foundry")
         return None
+
+    @staticmethod
+    def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
+        """Aggregate run-level pass/fail counts from individual output items."""
+
+        total = len(output_items)
+        passed = failed = errored = 0
+
+        for item in output_items:
+            item_status: Optional[bool] = None
+            for result in item.get("results", []):
+                result_properties = result.get("properties", {}) if isinstance(result, dict) else {}
+                attack_success = result_properties.get("attack_success")
+                if attack_success is True:
+                    item_status = False
+                    break
+                if attack_success is False:
+                    item_status = True
+                elif item_status is None and result.get("passed") is not None:
+                    item_status = bool(result.get("passed"))
+
+            if item_status is True:
+                passed += 1
+            elif item_status is False:
+                failed += 1
+            else:
+                errored += 1
+
+        return {
+            "total": total,
+            "passed": passed,
+            "failed": failed,
+            "errored": errored,
+        }
+
+    @staticmethod
+    def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Build aggregated pass/fail counts per testing criteria (risk category)."""
+
+        criteria: Dict[str, Dict[str, int]] = {}
+
+        for item in output_items:
+            for result in item.get("results", []):
+                if not isinstance(result, dict):
+                    continue
+                name = result.get("name")
+                if not name:
+                    continue
+                passed_value = result.get("passed")
+                if passed_value is None:
+                    continue
+
+                bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
+                if passed_value:
+                    bucket["passed"] += 1
+                else:
+                    bucket["failed"] += 1
+
+        return [
+            {
+                "testing_criteria": criteria_name,
+                "passed": counts["passed"],
+                "failed": counts["failed"],
+            }
+            for criteria_name, counts in sorted(criteria.items())
+        ]
+
+    @staticmethod
+    def _build_data_source_section(parameters: Dict[str, Any], red_team_info: Optional[Dict]) -> Dict[str, Any]:
+        """Build the data_source portion of the run payload for red-team scans."""
+
+        attack_strategies: List[str] = []
+        if isinstance(red_team_info, dict):
+            attack_strategies = sorted(str(strategy) for strategy in red_team_info.keys())
+
+        item_generation_params: Dict[str, Any] = {"type": "red_team"}
+        if attack_strategies:
+            item_generation_params["attack_strategies"] = attack_strategies
+
+        # Attempt to infer turns from parameters if available
+        num_turns = parameters.get("max_turns") if isinstance(parameters, dict) else None
+        if isinstance(num_turns, int) and num_turns > 0:
+            item_generation_params["num_turns"] = num_turns
+
+        data_source: Dict[str, Any] = {"type": "azure_ai_red_team", "target": {}}
+        if item_generation_params:
+            data_source["item_generation_params"] = item_generation_params
+
+        return data_source
+
+    def _determine_run_status(
+        self,
+        scan_result: Dict[str, Any],
+        red_team_info: Optional[Dict],
+        output_items: List[Dict[str, Any]],
+    ) -> str:
+        """Determine the run-level status based on red team info status values."""
+
+        # Check if any tasks are still incomplete/failed
+        if isinstance(red_team_info, dict):
+            for risk_data in red_team_info.values():
+                if not isinstance(risk_data, dict):
+                    continue
+                for details in risk_data.values():
+                    if not isinstance(details, dict):
+                        continue
+                    status = details.get("status", "").lower()
+                    if status in ("incomplete", "failed", "timeout"):
+                        return "failed"
+                    elif status in ("running", "pending"):
+                        return "in_progress"
+
+        return "completed"
+
+    def _build_results_payload(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: Optional[Any] = None,
+        red_team_info: Optional[Dict] = None,
+        include_conversations: bool = False,
+        scan_name: Optional[str] = None,
+    ) -> RedTeamRun:
+        """Assemble the new structure for results.json with eval.run format."""
+
+        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
+        output_items = cast(List[Dict[str, Any]], scan_result.get("output_items") or [])
+        scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
+        parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
+
+        run_id = self._run_id_override
+        eval_id = self._eval_id_override
+        run_name: Optional[str] = None
+        created_at = self._created_at_override
+
+        if eval_run is not None:
+            run_info = getattr(eval_run, "info", None)
+
+            if run_id is None:
+                candidate_run_id = (
+                    getattr(run_info, "run_id", None)
+                    or getattr(eval_run, "run_id", None)
+                    or getattr(eval_run, "id", None)
+                )
+                if candidate_run_id is not None:
+                    run_id = str(candidate_run_id)
+
+            if eval_id is None:
+                candidate_eval_id = (
+                    getattr(run_info, "experiment_id", None)
+                    or getattr(eval_run, "experiment_id", None)
+                    or getattr(eval_run, "eval_id", None)
+                )
+                if candidate_eval_id is not None:
+                    eval_id = str(candidate_eval_id)
+
+            if run_name is None:
+                candidate_run_name = (
+                    getattr(run_info, "run_name", None)
+                    or getattr(eval_run, "run_name", None)
+                    or getattr(eval_run, "display_name", None)
+                    or getattr(eval_run, "name", None)
+                )
+                if candidate_run_name is not None:
+                    run_name = str(candidate_run_name)
+
+            if created_at is None:
+                raw_created = (
+                    getattr(run_info, "created_time", None)
+                    or getattr(eval_run, "created_at", None)
+                    or getattr(eval_run, "created_time", None)
+                )
+                if isinstance(raw_created, datetime):
+                    created_at = int(raw_created.timestamp())
+                elif isinstance(raw_created, (int, float)):
+                    created_at = int(raw_created)
+                elif isinstance(raw_created, str):
+                    try:
+                        created_at = int(float(raw_created))
+                    except ValueError:
+                        created_at = None
+
+        if run_id is None:
+            run_id = str(uuid.uuid4())
+        if eval_id is None:
+            eval_id = str(uuid.uuid4())
+        if created_at is None:
+            created_at = int(datetime.now().timestamp())
+        if run_name is None:
+            run_name = scan_name or f"redteam-run-{run_id[:8]}"
+
+        result_count = self._compute_result_count(output_items)
+        per_testing_results = self._compute_per_testing_criteria(output_items)
+        data_source = self._build_data_source_section(parameters, red_team_info)
+        status = self._determine_run_status(scan_result, red_team_info, output_items)
+
+        list_wrapper: OutputItemsList = {
+            "object": "list",
+            "data": output_items,
+        }
+
+        run_payload: RedTeamRun = {
+            "object": "eval.run",
+            "id": run_id,
+            "eval_id": eval_id,
+            "created_at": created_at,
+            "status": status,
+            "name": run_name,
+            "report_url": scan_result.get("studio_url") or self.ai_studio_url,
+            "data_source": data_source,
+            "metadata": {},
+            "result_count": result_count,
+            "per_model_usage": [],
+            "per_testing_criteria_results": per_testing_results,
+            "output_items": list_wrapper,
+        }
+
+        if include_conversations:
+            run_payload["conversations"] = redteam_result.attack_details or scan_result.get("attack_details") or []
+
+        return run_payload
+
+    def _build_results_payload(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: Optional[Any] = None,
+        red_team_info: Optional[Dict] = None,
+        include_conversations: bool = False,
+        scan_name: Optional[str] = None,
+    ) -> RedTeamRun:
+        """Assemble the new structure for results.json with eval.run format."""
+
+        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
+        output_items = cast(List[Dict[str, Any]], scan_result.get("output_items") or [])
+        scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
+        parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
+
+        run_id = self._run_id_override
+        eval_id = self._eval_id_override
+        run_name: Optional[str] = None
+        created_at = self._created_at_override
+
+        if eval_run is not None:
+            run_info = getattr(eval_run, "info", None)
+
+            if run_id is None:
+                candidate_run_id = (
+                    getattr(run_info, "run_id", None)
+                    or getattr(eval_run, "run_id", None)
+                    or getattr(eval_run, "id", None)
+                )
+                if candidate_run_id is not None:
+                    run_id = str(candidate_run_id)
+
+            if eval_id is None:
+                candidate_eval_id = (
+                    getattr(run_info, "experiment_id", None)
+                    or getattr(eval_run, "experiment_id", None)
+                    or getattr(eval_run, "eval_id", None)
+                )
+                if candidate_eval_id is not None:
+                    eval_id = str(candidate_eval_id)
+
+            if run_name is None:
+                candidate_run_name = (
+                    getattr(run_info, "run_name", None)
+                    or getattr(eval_run, "run_name", None)
+                    or getattr(eval_run, "display_name", None)
+                    or getattr(eval_run, "name", None)
+                )
+                if candidate_run_name is not None:
+                    run_name = str(candidate_run_name)
+
+            if created_at is None:
+                raw_created = (
+                    getattr(run_info, "created_time", None)
+                    or getattr(eval_run, "created_at", None)
+                    or getattr(eval_run, "created_time", None)
+                )
+                if isinstance(raw_created, datetime):
+                    created_at = int(raw_created.timestamp())
+                elif isinstance(raw_created, (int, float)):
+                    created_at = int(raw_created)
+                elif isinstance(raw_created, str):
+                    try:
+                        created_at = int(float(raw_created))
+                    except ValueError:
+                        created_at = None
+
+        if run_id is None:
+            run_id = str(uuid.uuid4())
+        if eval_id is None:
+            eval_id = str(uuid.uuid4())
+        if created_at is None:
+            created_at = int(datetime.now().timestamp())
+        if run_name is None:
+            run_name = scan_name or f"redteam-run-{run_id[:8]}"
+
+        result_count = self._compute_result_count(output_items)
+        per_testing_results = self._compute_per_testing_criteria(output_items)
+        data_source = self._build_data_source_section(parameters, red_team_info)
+        status = self._determine_run_status(scan_result, red_team_info, output_items)
+
+        list_wrapper: OutputItemsList = {
+            "object": "list",
+            "data": output_items,
+        }
+
+        run_payload: RedTeamRun = {
+            "object": "eval.run",
+            "id": run_id,
+            "eval_id": eval_id,
+            "created_at": created_at,
+            "status": status,
+            "name": run_name,
+            "report_url": scan_result.get("studio_url") or self.ai_studio_url,
+            "data_source": data_source,
+            "metadata": {},
+            "result_count": result_count,
+            "per_model_usage": [],
+            "per_testing_criteria_results": per_testing_results,
+            "output_items": list_wrapper,
+        }
+
+        if include_conversations:
+            run_payload["conversations"] = redteam_result.attack_details or scan_result.get("attack_details") or []
+
+        return run_payload
+
+    def _build_instance_results_payload(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: Optional[Any] = None,
+        red_team_info: Optional[Dict] = None,
+        scan_name: Optional[str] = None,
+    ) -> Dict:
+        """Assemble the legacy structure for instance_results.json (scan_result format)."""
+
+        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
+
+        # Return the scan_result directly for legacy compatibility
+        # This maintains the old format that was expected previously
+        legacy_payload = scan_result.copy() if scan_result else {}
+
+        # Ensure we have the basic required fields
+        if "scorecard" not in legacy_payload:
+            legacy_payload["scorecard"] = {}
+        if "parameters" not in legacy_payload:
+            legacy_payload["parameters"] = {}
+        if "output_items" not in legacy_payload:
+            legacy_payload["output_items"] = []
+        if "attack_details" not in legacy_payload:
+            legacy_payload["attack_details"] = redteam_result.attack_details or []
+
+        return legacy_payload
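Taken together, the helpers above produce the eval.run document that lands in results.json. A hedged illustration of what a small payload could look like, with made-up identifier values and a single output item (field names follow the run_payload literal in _build_results_payload; note that attack_success=True on a result marks the whole item as failed in result_count):

    # Illustrative payload only; the concrete values are invented.
    example_results_json = {
        "object": "eval.run",
        "id": "3f2a9c4e-0000-0000-0000-000000000000",      # override, MLflow run id, or a fresh uuid4
        "eval_id": "7d41b8aa-0000-0000-0000-000000000000",  # override, experiment id, or a fresh uuid4
        "created_at": 1714521600,                           # integer epoch seconds
        "status": "completed",                              # from _determine_run_status()
        "name": "redteam-run-3f2a9c4e",
        "report_url": "https://ai.azure.com/placeholder",   # studio_url from the scan result, else ai_studio_url
        "data_source": {
            "type": "azure_ai_red_team",
            "target": {},
            "item_generation_params": {"type": "red_team", "attack_strategies": ["base64", "flip"]},
        },
        "metadata": {},
        "result_count": {"total": 1, "passed": 0, "failed": 1, "errored": 0},
        "per_model_usage": [],
        "per_testing_criteria_results": [{"testing_criteria": "violence", "passed": 0, "failed": 1}],
        "output_items": {
            "object": "list",
            "data": [
                {
                    "results": [
                        {
                            "name": "violence",
                            "passed": False,
                            # attack_success=True means the attack landed, so the item counts as failed
                            "properties": {"attack_success": True},
                        }
                    ]
                }
            ],
        },
    }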
azure/ai/evaluation/red_team/_red_team.py

@@ -783,6 +783,9 @@ class RedTeam:
         :rtype: RedTeamResult
         """
         user_agent: Optional[str] = kwargs.get("user_agent", "(type=redteam; subtype=RedTeam)")
+        run_id_override = kwargs.get("run_id") or kwargs.get("runId")
+        eval_id_override = kwargs.get("eval_id") or kwargs.get("evalId")
+        created_at_override = kwargs.get("created_at") or kwargs.get("createdAt")
         with UserAgentSingleton().add_useragent_product(user_agent):
             # Initialize scan
             self._initialize_scan(scan_name, application_scenario)
@@ -802,6 +805,12 @@ class RedTeam:
             self.mlflow_integration.logger = self.logger
             self.result_processor.logger = self.logger
 
+            self.mlflow_integration.set_run_identity_overrides(
+                run_id=run_id_override,
+                eval_id=eval_id_override,
+                created_at=created_at_override,
+            )
+
             # Validate attack objective generator
             if not self.attack_objective_generator:
                 raise EvaluationException(
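The kwargs above let a caller thread pre-existing identifiers through to the MLflow payload. A hedged end-to-end sketch, assuming the surrounding method is RedTeam's async scan entry point (its name is not visible in this hunk) and using placeholder project, credential, and target values:

    import asyncio
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation.red_team import RedTeam

    def simple_target(query: str) -> str:
        # Placeholder target callback; a real scan would call the application under test.
        return "I cannot help with that."

    red_team = RedTeam(
        azure_ai_project="https://<account>.services.ai.azure.com/api/projects/<project>",  # placeholder
        credential=DefaultAzureCredential(),
    )

    result = asyncio.run(
        red_team.scan(
            target=simple_target,
            scan_name="my-redteam-scan",
            run_id="existing-run-id",      # also accepted as runId=...
            eval_id="existing-eval-id",    # also accepted as evalId=...
            created_at=1714521600,         # also accepted as createdAt=..., or a datetime
        )
    )
    print(type(result).__name__)  # RedTeamResult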
|