azure-ai-evaluation 1.11.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries, and is provided for informational purposes only.

This release has been flagged as potentially problematic.

Files changed (35)
  1. azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
  2. azure/ai/evaluation/_aoai/label_grader.py +8 -3
  3. azure/ai/evaluation/_aoai/python_grader.py +8 -3
  4. azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
  5. azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
  7. azure/ai/evaluation/_eval_mapping.py +2 -0
  8. azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
  9. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +252 -48
  10. azure/ai/evaluation/_evaluate/_utils.py +7 -3
  11. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  12. azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
  13. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  14. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
  15. azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
  16. azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
  17. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
  18. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  19. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  20. azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
  21. azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
  22. azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
  23. azure/ai/evaluation/_exceptions.py +1 -0
  24. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
  25. azure/ai/evaluation/_version.py +1 -1
  26. azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
  27. azure/ai/evaluation/red_team/_red_team.py +9 -0
  28. azure/ai/evaluation/red_team/_red_team_result.py +230 -1
  29. azure/ai/evaluation/red_team/_result_processor.py +416 -23
  30. azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
  31. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +13 -3
  32. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
  33. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
  34. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
  35. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,7 @@ import os
 import tempfile
 import uuid
 from datetime import datetime
-from typing import Dict, Optional, cast
+from typing import Any, Dict, List, Optional, Set, cast
 from pathlib import Path
 
 # Azure AI Evaluation imports
@@ -27,7 +27,14 @@ from azure.ai.evaluation._common import RedTeamUpload, ResultType
 from azure.ai.evaluation._model_configurations import AzureAIProject
 
 # Local imports
-from ._red_team_result import RedTeamResult
+from ._red_team_result import (
+    RedTeamResult,
+    RedTeamRun,
+    ResultCount,
+    PerTestingCriteriaResult,
+    DataSource,
+    OutputItemsList,
+)
 from ._utils.logging_utils import log_error
 
 
@@ -50,6 +57,32 @@ class MLflowIntegration:
         self.scan_output_dir = scan_output_dir
         self.ai_studio_url = None
         self.trace_destination = None
+        self._run_id_override: Optional[str] = None
+        self._eval_id_override: Optional[str] = None
+        self._created_at_override: Optional[int] = None
+
+    def set_run_identity_overrides(
+        self,
+        *,
+        run_id: Optional[str] = None,
+        eval_id: Optional[str] = None,
+        created_at: Optional[Any] = None,
+    ) -> None:
+        """Allow callers to supply pre-existing identifiers for the run payload."""
+
+        self._run_id_override = str(run_id).strip() if run_id else None
+        self._eval_id_override = str(eval_id).strip() if eval_id else None
+
+        if created_at is None or created_at == "":
+            self._created_at_override = None
+        else:
+            if isinstance(created_at, datetime):
+                self._created_at_override = int(created_at.timestamp())
+            else:
+                try:
+                    self._created_at_override = int(created_at)
+                except (TypeError, ValueError):
+                    self._created_at_override = None
 
     def start_redteam_mlflow_run(
         self,
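A minimal usage sketch of the new override hook; only set_run_identity_overrides and its keyword arguments come from the hunk above, while the surrounding setup is illustrative:

    from datetime import datetime, timezone

    # integration: an MLflowIntegration instance obtained elsewhere (construction details omitted).
    # created_at accepts a datetime or anything int()-coercible; unparseable values fall back to None.
    integration.set_run_identity_overrides(
        run_id="run-1234",
        eval_id="eval-5678",
        created_at=datetime.now(timezone.utc),
    )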
@@ -152,38 +185,36 @@ class MLflowIntegration:
         """
         self.logger.debug(f"Logging results to MLFlow, _skip_evals={_skip_evals}")
         artifact_name = "instance_results.json"
+        results_name = "results.json"
         eval_info_name = "redteam_info.json"
         properties = {}
 
         with tempfile.TemporaryDirectory() as tmpdir:
             if self.scan_output_dir:
+                # Save new format as results.json
+                results_path = os.path.join(self.scan_output_dir, results_name)
+                self.logger.debug(f"Saving results to scan output directory: {results_path}")
+                with open(results_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    payload = self._build_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        include_conversations=True,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(payload, f)
+
+                # Save legacy format as instance_results.json
                 artifact_path = os.path.join(self.scan_output_dir, artifact_name)
                 self.logger.debug(f"Saving artifact to scan output directory: {artifact_path}")
                 with open(artifact_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-                    if _skip_evals:
-                        # In _skip_evals mode, we write the conversations in conversation/messages format
-                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
-                    elif redteam_result.scan_result:
-                        # Create a copy to avoid modifying the original scan result
-                        result_with_conversations = (
-                            redteam_result.scan_result.copy() if isinstance(redteam_result.scan_result, dict) else {}
-                        )
-
-                        # Preserve all original fields needed for scorecard generation
-                        result_with_conversations["scorecard"] = result_with_conversations.get("scorecard", {})
-                        result_with_conversations["parameters"] = result_with_conversations.get("parameters", {})
-
-                        # Add conversations field with all conversation data including user messages
-                        result_with_conversations["conversations"] = redteam_result.attack_details or []
-
-                        # Keep original attack_details field to preserve compatibility with existing code
-                        if (
-                            "attack_details" not in result_with_conversations
-                            and redteam_result.attack_details is not None
-                        ):
-                            result_with_conversations["attack_details"] = redteam_result.attack_details
-
-                        json.dump(result_with_conversations, f)
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
 
                 eval_info_path = os.path.join(self.scan_output_dir, eval_info_name)
                 self.logger.debug(f"Saving evaluation info to scan output directory: {eval_info_path}")
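Based on the payload builders added later in this diff, a completed scan directory now carries both artifacts. A hedged sketch of reading them back (paths are illustrative):

    import json

    # Illustrative paths under the scan output directory.
    with open("scan_output/results.json", encoding="utf-8") as f:
        run = json.load(f)        # new eval.run payload
    with open("scan_output/instance_results.json", encoding="utf-8") as f:
        legacy = json.load(f)     # legacy scan_result payload

    assert run["object"] == "eval.run"
    print(run["result_count"])    # e.g. {'total': ..., 'passed': ..., 'failed': ..., 'errored': ...}
    print(sorted(legacy))         # includes "scorecard", "parameters", "output_items", "attack_details"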
@@ -210,16 +241,34 @@ class MLflowIntegration:
                 self.logger.debug(f"Saved scorecard to: {scorecard_path}")
 
                 # Create a dedicated artifacts directory with proper structure for MLFlow
-                # First, create the main artifact file that MLFlow expects
+                # First, create the main artifact file that MLFlow expects (new format)
+                with open(
+                    os.path.join(tmpdir, results_name),
+                    "w",
+                    encoding=DefaultOpenEncoding.WRITE,
+                ) as f:
+                    payload = self._build_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        include_conversations=False,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(payload, f)
+
+                # Also create legacy instance_results.json for compatibility
                 with open(
                     os.path.join(tmpdir, artifact_name),
                     "w",
                     encoding=DefaultOpenEncoding.WRITE,
                 ) as f:
-                    if _skip_evals:
-                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
-                    elif redteam_result.scan_result:
-                        json.dump(redteam_result.scan_result, f)
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
 
                 # Copy all relevant files to the temp directory
                 import shutil
@@ -246,12 +295,28 @@ class MLflowIntegration:
                 properties.update({"scan_output_dir": str(self.scan_output_dir)})
             else:
                 # Use temporary directory as before if no scan output directory exists
+                results_file = Path(tmpdir) / results_name
+                with open(results_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
+                    payload = self._build_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        include_conversations=_skip_evals,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(payload, f)
+                self.logger.debug(f"Logged artifact: {results_name}")
+
+                # Also create legacy instance_results.json
                 artifact_file = Path(tmpdir) / artifact_name
                 with open(artifact_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-                    if _skip_evals:
-                        f.write(json.dumps({"conversations": redteam_result.attack_details or []}))
-                    elif redteam_result.scan_result:
-                        json.dump(redteam_result.scan_result, f)
+                    legacy_payload = self._build_instance_results_payload(
+                        redteam_result=redteam_result,
+                        eval_run=eval_run,
+                        red_team_info=red_team_info,
+                        scan_name=getattr(eval_run, "display_name", None),
+                    )
+                    json.dump(legacy_payload, f)
                 self.logger.debug(f"Logged artifact: {artifact_name}")
 
             properties.update(
@@ -320,3 +385,357 @@ class MLflowIntegration:
 
         self.logger.info("Successfully logged results to AI Foundry")
         return None
+
+    @staticmethod
+    def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
+        """Aggregate run-level pass/fail counts from individual output items."""
+
+        total = len(output_items)
+        passed = failed = errored = 0
+
+        for item in output_items:
+            item_status: Optional[bool] = None
+            for result in item.get("results", []):
+                result_properties = result.get("properties", {}) if isinstance(result, dict) else {}
+                attack_success = result_properties.get("attack_success")
+                if attack_success is True:
+                    item_status = False
+                    break
+                if attack_success is False:
+                    item_status = True
+                elif item_status is None and result.get("passed") is not None:
+                    item_status = bool(result.get("passed"))
+
+            if item_status is True:
+                passed += 1
+            elif item_status is False:
+                failed += 1
+            else:
+                errored += 1
+
+        return {
+            "total": total,
+            "passed": passed,
+            "failed": failed,
+            "errored": errored,
+        }
+
+    @staticmethod
+    def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Build aggregated pass/fail counts per testing criteria (risk category)."""
+
+        criteria: Dict[str, Dict[str, int]] = {}
+
+        for item in output_items:
+            for result in item.get("results", []):
+                if not isinstance(result, dict):
+                    continue
+                name = result.get("name")
+                if not name:
+                    continue
+                passed_value = result.get("passed")
+                if passed_value is None:
+                    continue
+
+                bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
+                if passed_value:
+                    bucket["passed"] += 1
+                else:
+                    bucket["failed"] += 1
+
+        return [
+            {
+                "testing_criteria": criteria_name,
+                "passed": counts["passed"],
+                "failed": counts["failed"],
+            }
+            for criteria_name, counts in sorted(criteria.items())
+        ]
+
+    @staticmethod
+    def _build_data_source_section(parameters: Dict[str, Any], red_team_info: Optional[Dict]) -> Dict[str, Any]:
+        """Build the data_source portion of the run payload for red-team scans."""
+
+        attack_strategies: List[str] = []
+        if isinstance(red_team_info, dict):
+            attack_strategies = sorted(str(strategy) for strategy in red_team_info.keys())
+
+        item_generation_params: Dict[str, Any] = {"type": "red_team"}
+        if attack_strategies:
+            item_generation_params["attack_strategies"] = attack_strategies
+
+        # Attempt to infer turns from parameters if available
+        num_turns = parameters.get("max_turns") if isinstance(parameters, dict) else None
+        if isinstance(num_turns, int) and num_turns > 0:
+            item_generation_params["num_turns"] = num_turns
+
+        data_source: Dict[str, Any] = {"type": "azure_ai_red_team", "target": {}}
+        if item_generation_params:
+            data_source["item_generation_params"] = item_generation_params
+
+        return data_source
+
+    def _determine_run_status(
+        self,
+        scan_result: Dict[str, Any],
+        red_team_info: Optional[Dict],
+        output_items: List[Dict[str, Any]],
+    ) -> str:
+        """Determine the run-level status based on red team info status values."""
+
+        # Check if any tasks are still incomplete/failed
+        if isinstance(red_team_info, dict):
+            for risk_data in red_team_info.values():
+                if not isinstance(risk_data, dict):
+                    continue
+                for details in risk_data.values():
+                    if not isinstance(details, dict):
+                        continue
+                    status = details.get("status", "").lower()
+                    if status in ("incomplete", "failed", "timeout"):
+                        return "failed"
+                    elif status in ("running", "pending"):
+                        return "in_progress"
+
+        return "completed"
+
+    def _build_results_payload(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: Optional[Any] = None,
+        red_team_info: Optional[Dict] = None,
+        include_conversations: bool = False,
+        scan_name: Optional[str] = None,
+    ) -> RedTeamRun:
+        """Assemble the new structure for results.json with eval.run format."""
+
+        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
+        output_items = cast(List[Dict[str, Any]], scan_result.get("output_items") or [])
+        scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
+        parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
+
+        run_id = self._run_id_override
+        eval_id = self._eval_id_override
+        run_name: Optional[str] = None
+        created_at = self._created_at_override
+
+        if eval_run is not None:
+            run_info = getattr(eval_run, "info", None)
+
+            if run_id is None:
+                candidate_run_id = (
+                    getattr(run_info, "run_id", None)
+                    or getattr(eval_run, "run_id", None)
+                    or getattr(eval_run, "id", None)
+                )
+                if candidate_run_id is not None:
+                    run_id = str(candidate_run_id)
+
+            if eval_id is None:
+                candidate_eval_id = (
+                    getattr(run_info, "experiment_id", None)
+                    or getattr(eval_run, "experiment_id", None)
+                    or getattr(eval_run, "eval_id", None)
+                )
+                if candidate_eval_id is not None:
+                    eval_id = str(candidate_eval_id)
+
+            if run_name is None:
+                candidate_run_name = (
+                    getattr(run_info, "run_name", None)
+                    or getattr(eval_run, "run_name", None)
+                    or getattr(eval_run, "display_name", None)
+                    or getattr(eval_run, "name", None)
+                )
+                if candidate_run_name is not None:
+                    run_name = str(candidate_run_name)
+
+            if created_at is None:
+                raw_created = (
+                    getattr(run_info, "created_time", None)
+                    or getattr(eval_run, "created_at", None)
+                    or getattr(eval_run, "created_time", None)
+                )
+                if isinstance(raw_created, datetime):
+                    created_at = int(raw_created.timestamp())
+                elif isinstance(raw_created, (int, float)):
+                    created_at = int(raw_created)
+                elif isinstance(raw_created, str):
+                    try:
+                        created_at = int(float(raw_created))
+                    except ValueError:
+                        created_at = None
+
+        if run_id is None:
+            run_id = str(uuid.uuid4())
+        if eval_id is None:
+            eval_id = str(uuid.uuid4())
+        if created_at is None:
+            created_at = int(datetime.now().timestamp())
+        if run_name is None:
+            run_name = scan_name or f"redteam-run-{run_id[:8]}"
+
+        result_count = self._compute_result_count(output_items)
+        per_testing_results = self._compute_per_testing_criteria(output_items)
+        data_source = self._build_data_source_section(parameters, red_team_info)
+        status = self._determine_run_status(scan_result, red_team_info, output_items)
+
+        list_wrapper: OutputItemsList = {
+            "object": "list",
+            "data": output_items,
+        }
+
+        run_payload: RedTeamRun = {
+            "object": "eval.run",
+            "id": run_id,
+            "eval_id": eval_id,
+            "created_at": created_at,
+            "status": status,
+            "name": run_name,
+            "report_url": scan_result.get("studio_url") or self.ai_studio_url,
+            "data_source": data_source,
+            "metadata": {},
+            "result_count": result_count,
+            "per_model_usage": [],
+            "per_testing_criteria_results": per_testing_results,
+            "output_items": list_wrapper,
+        }
+
+        if include_conversations:
+            run_payload["conversations"] = redteam_result.attack_details or scan_result.get("attack_details") or []
+
+        return run_payload
+
+    def _build_results_payload(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: Optional[Any] = None,
+        red_team_info: Optional[Dict] = None,
+        include_conversations: bool = False,
+        scan_name: Optional[str] = None,
+    ) -> RedTeamRun:
+        """Assemble the new structure for results.json with eval.run format."""
+
+        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
+        output_items = cast(List[Dict[str, Any]], scan_result.get("output_items") or [])
+        scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
+        parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
+
+        run_id = self._run_id_override
+        eval_id = self._eval_id_override
+        run_name: Optional[str] = None
+        created_at = self._created_at_override
+
+        if eval_run is not None:
+            run_info = getattr(eval_run, "info", None)
+
+            if run_id is None:
+                candidate_run_id = (
+                    getattr(run_info, "run_id", None)
+                    or getattr(eval_run, "run_id", None)
+                    or getattr(eval_run, "id", None)
+                )
+                if candidate_run_id is not None:
+                    run_id = str(candidate_run_id)
+
+            if eval_id is None:
+                candidate_eval_id = (
+                    getattr(run_info, "experiment_id", None)
+                    or getattr(eval_run, "experiment_id", None)
+                    or getattr(eval_run, "eval_id", None)
+                )
+                if candidate_eval_id is not None:
+                    eval_id = str(candidate_eval_id)
+
+            if run_name is None:
+                candidate_run_name = (
+                    getattr(run_info, "run_name", None)
+                    or getattr(eval_run, "run_name", None)
+                    or getattr(eval_run, "display_name", None)
+                    or getattr(eval_run, "name", None)
+                )
+                if candidate_run_name is not None:
+                    run_name = str(candidate_run_name)
+
+            if created_at is None:
+                raw_created = (
+                    getattr(run_info, "created_time", None)
+                    or getattr(eval_run, "created_at", None)
+                    or getattr(eval_run, "created_time", None)
+                )
+                if isinstance(raw_created, datetime):
+                    created_at = int(raw_created.timestamp())
+                elif isinstance(raw_created, (int, float)):
+                    created_at = int(raw_created)
+                elif isinstance(raw_created, str):
+                    try:
+                        created_at = int(float(raw_created))
+                    except ValueError:
+                        created_at = None
+
+        if run_id is None:
+            run_id = str(uuid.uuid4())
+        if eval_id is None:
+            eval_id = str(uuid.uuid4())
+        if created_at is None:
+            created_at = int(datetime.now().timestamp())
+        if run_name is None:
+            run_name = scan_name or f"redteam-run-{run_id[:8]}"
+
+        result_count = self._compute_result_count(output_items)
+        per_testing_results = self._compute_per_testing_criteria(output_items)
+        data_source = self._build_data_source_section(parameters, red_team_info)
+        status = self._determine_run_status(scan_result, red_team_info, output_items)
+
+        list_wrapper: OutputItemsList = {
+            "object": "list",
+            "data": output_items,
+        }
+
+        run_payload: RedTeamRun = {
+            "object": "eval.run",
+            "id": run_id,
+            "eval_id": eval_id,
+            "created_at": created_at,
+            "status": status,
+            "name": run_name,
+            "report_url": scan_result.get("studio_url") or self.ai_studio_url,
+            "data_source": data_source,
+            "metadata": {},
+            "result_count": result_count,
+            "per_model_usage": [],
+            "per_testing_criteria_results": per_testing_results,
+            "output_items": list_wrapper,
+        }
+
+        if include_conversations:
+            run_payload["conversations"] = redteam_result.attack_details or scan_result.get("attack_details") or []
+
+        return run_payload
+
+    def _build_instance_results_payload(
+        self,
+        redteam_result: RedTeamResult,
+        eval_run: Optional[Any] = None,
+        red_team_info: Optional[Dict] = None,
+        scan_name: Optional[str] = None,
+    ) -> Dict:
+        """Assemble the legacy structure for instance_results.json (scan_result format)."""
+
+        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
+
+        # Return the scan_result directly for legacy compatibility
+        # This maintains the old format that was expected previously
+        legacy_payload = scan_result.copy() if scan_result else {}
+
+        # Ensure we have the basic required fields
+        if "scorecard" not in legacy_payload:
+            legacy_payload["scorecard"] = {}
+        if "parameters" not in legacy_payload:
+            legacy_payload["parameters"] = {}
+        if "output_items" not in legacy_payload:
+            legacy_payload["output_items"] = []
+        if "attack_details" not in legacy_payload:
+            legacy_payload["attack_details"] = redteam_result.attack_details or []
+
+        return legacy_payload
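A small sketch of how the new aggregation helpers classify output items, using made-up data whose shape mirrors what _compute_result_count and _compute_per_testing_criteria expect (assumes MLflowIntegration is importable from this module):

    sample_output_items = [
        # Attack succeeded -> the item counts as failed for the run.
        {"results": [{"name": "violence", "passed": False, "properties": {"attack_success": True}}]},
        # Attack did not succeed -> the item counts as passed.
        {"results": [{"name": "violence", "passed": True, "properties": {"attack_success": False}}]},
        # No attack_success flag and no passed flag -> the item counts as errored.
        {"results": [{"name": "self_harm", "properties": {}}]},
    ]

    print(MLflowIntegration._compute_result_count(sample_output_items))
    # {'total': 3, 'passed': 1, 'failed': 1, 'errored': 1}

    print(MLflowIntegration._compute_per_testing_criteria(sample_output_items))
    # [{'testing_criteria': 'violence', 'passed': 1, 'failed': 1}]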
@@ -783,6 +783,9 @@ class RedTeam:
         :rtype: RedTeamResult
         """
         user_agent: Optional[str] = kwargs.get("user_agent", "(type=redteam; subtype=RedTeam)")
+        run_id_override = kwargs.get("run_id") or kwargs.get("runId")
+        eval_id_override = kwargs.get("eval_id") or kwargs.get("evalId")
+        created_at_override = kwargs.get("created_at") or kwargs.get("createdAt")
         with UserAgentSingleton().add_useragent_product(user_agent):
             # Initialize scan
             self._initialize_scan(scan_name, application_scenario)
@@ -802,6 +805,12 @@ class RedTeam:
             self.mlflow_integration.logger = self.logger
             self.result_processor.logger = self.logger
 
+            self.mlflow_integration.set_run_identity_overrides(
+                run_id=run_id_override,
+                eval_id=eval_id_override,
+                created_at=created_at_override,
+            )
+
             # Validate attack objective generator
             if not self.attack_objective_generator:
                 raise EvaluationException(
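Taken together with the _mlflow_integration.py changes above, the scan entry point now forwards caller-supplied identifiers read from kwargs. A hedged sketch; the project, credential, and target values are placeholders, and only the run_id/eval_id/created_at keyword names (and their camelCase aliases) come from the hunk above:

    red_team = RedTeam(azure_ai_project=azure_ai_project, credential=credential)  # placeholder setup

    result = await red_team.scan(
        target=my_target,                 # placeholder callable or model configuration
        scan_name="baseline-scan",
        run_id="run-1234",                # also accepted as runId
        eval_id="eval-5678",              # also accepted as evalId
        created_at=1730000000,            # epoch seconds; also accepted as createdAt
    )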