azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
Files changed (99)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
  3. azure/ai/evaluation/_aoai/label_grader.py +6 -10
  4. azure/ai/evaluation/_aoai/python_grader.py +7 -10
  5. azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
  6. azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +241 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -2
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
  33. azure/ai/evaluation/_evaluate/_utils.py +10 -3
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
  38. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  39. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
  40. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  41. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  42. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  43. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
  44. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  45. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  46. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  47. azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
  48. azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
  49. azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
  50. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
  52. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  53. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  55. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  56. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  57. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  58. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  59. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  60. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  61. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  62. azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
  63. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  64. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  65. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  66. azure/ai/evaluation/_exceptions.py +6 -1
  67. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  68. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  69. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  70. azure/ai/evaluation/_model_configurations.py +26 -0
  71. azure/ai/evaluation/_version.py +1 -1
  72. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  73. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  74. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  75. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  76. azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
  77. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  78. azure/ai/evaluation/red_team/_red_team.py +494 -37
  79. azure/ai/evaluation/red_team/_red_team_result.py +48 -28
  80. azure/ai/evaluation/red_team/_result_processor.py +558 -29
  81. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  82. azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
  83. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  84. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  85. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  86. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  87. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  88. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  90. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  91. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  92. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  94. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  95. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
  96. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
  97. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  98. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  99. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -169,6 +169,7 @@ class MLflowIntegration:
         eval_run: EvalRun,
         red_team_info: Dict,
         _skip_evals: bool = False,
+        aoai_summary: Optional["RedTeamRun"] = None,
     ) -> Optional[str]:
         """Log the Red Team Agent results to MLFlow.
 
@@ -180,6 +181,8 @@ class MLflowIntegration:
         :type red_team_info: Dict
         :param _skip_evals: Whether to log only data without evaluation results
         :type _skip_evals: bool
+        :param aoai_summary: Pre-built AOAI-compatible summary (optional, will be built if not provided)
+        :type aoai_summary: Optional[RedTeamRun]
         :return: The URL to the run in Azure AI Studio, if available
         :rtype: Optional[str]
         """
@@ -195,13 +198,12 @@ class MLflowIntegration:
             results_path = os.path.join(self.scan_output_dir, results_name)
             self.logger.debug(f"Saving results to scan output directory: {results_path}")
             with open(results_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-                payload = self._build_results_payload(
-                    redteam_result=redteam_result,
-                    eval_run=eval_run,
-                    red_team_info=red_team_info,
-                    include_conversations=True,
-                    scan_name=getattr(eval_run, "display_name", None),
-                )
+                # Use provided aoai_summary
+                if aoai_summary is None:
+                    self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                    raise ValueError("aoai_summary parameter is required but was not provided")
+
+                payload = dict(aoai_summary)  # Make a copy
                 json.dump(payload, f)
 
             # Save legacy format as instance_results.json
@@ -247,13 +249,14 @@ class MLflowIntegration:
                 "w",
                 encoding=DefaultOpenEncoding.WRITE,
             ) as f:
-                payload = self._build_results_payload(
-                    redteam_result=redteam_result,
-                    eval_run=eval_run,
-                    red_team_info=red_team_info,
-                    include_conversations=False,
-                    scan_name=getattr(eval_run, "display_name", None),
-                )
+                # Use provided aoai_summary (required)
+                if aoai_summary is None:
+                    self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                    raise ValueError("aoai_summary parameter is required but was not provided")
+
+                payload = dict(aoai_summary)  # Make a copy
+                # Remove conversations for MLFlow artifact
+                payload.pop("conversations", None)
                 json.dump(payload, f)
 
             # Also create legacy instance_results.json for compatibility
@@ -297,13 +300,19 @@ class MLflowIntegration:
             # Use temporary directory as before if no scan output directory exists
             results_file = Path(tmpdir) / results_name
             with open(results_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-                payload = self._build_results_payload(
-                    redteam_result=redteam_result,
-                    eval_run=eval_run,
-                    red_team_info=red_team_info,
-                    include_conversations=_skip_evals,
-                    scan_name=getattr(eval_run, "display_name", None),
-                )
+                # Use provided aoai_summary (required)
+                if aoai_summary is None:
+                    self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                    raise ValueError("aoai_summary parameter is required but was not provided")
+
+                payload = dict(aoai_summary)  # Make a copy
+                # Include conversations only if _skip_evals is True
+                if _skip_evals and "conversations" not in payload:
+                    payload["conversations"] = (
+                        redteam_result.attack_details or redteam_result.scan_result.get("attack_details") or []
+                    )
+                elif not _skip_evals:
+                    payload.pop("conversations", None)
                 json.dump(payload, f)
             self.logger.debug(f"Logged artifact: {results_name}")
 
@@ -343,7 +352,7 @@ class MLflowIntegration:
             try:
                 create_evaluation_result_response = (
                     self.generated_rai_client._evaluation_onedp_client.create_evaluation_result(
-                        name=uuid.uuid4(),
+                        name=str(uuid.uuid4()),
                         path=tmpdir,
                         metrics=metrics,
                         result_type=ResultType.REDTEAM,
@@ -386,333 +395,6 @@ class MLflowIntegration:
         self.logger.info("Successfully logged results to AI Foundry")
         return None
 
-    @staticmethod
-    def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
-        """Aggregate run-level pass/fail counts from individual output items."""
-
-        total = len(output_items)
-        passed = failed = errored = 0
-
-        for item in output_items:
-            item_status: Optional[bool] = None
-            for result in item.get("results", []):
-                result_properties = result.get("properties", {}) if isinstance(result, dict) else {}
-                attack_success = result_properties.get("attack_success")
-                if attack_success is True:
-                    item_status = False
-                    break
-                if attack_success is False:
-                    item_status = True
-                elif item_status is None and result.get("passed") is not None:
-                    item_status = bool(result.get("passed"))
-
-            if item_status is True:
-                passed += 1
-            elif item_status is False:
-                failed += 1
-            else:
-                errored += 1
-
-        return {
-            "total": total,
-            "passed": passed,
-            "failed": failed,
-            "errored": errored,
-        }
-
-    @staticmethod
-    def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Build aggregated pass/fail counts per testing criteria (risk category)."""
-
-        criteria: Dict[str, Dict[str, int]] = {}
-
-        for item in output_items:
-            for result in item.get("results", []):
-                if not isinstance(result, dict):
-                    continue
-                name = result.get("name")
-                if not name:
-                    continue
-                passed_value = result.get("passed")
-                if passed_value is None:
-                    continue
-
-                bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
-                if passed_value:
-                    bucket["passed"] += 1
-                else:
-                    bucket["failed"] += 1
-
-        return [
-            {
-                "testing_criteria": criteria_name,
-                "passed": counts["passed"],
-                "failed": counts["failed"],
-            }
-            for criteria_name, counts in sorted(criteria.items())
-        ]
-
-    @staticmethod
-    def _build_data_source_section(parameters: Dict[str, Any], red_team_info: Optional[Dict]) -> Dict[str, Any]:
-        """Build the data_source portion of the run payload for red-team scans."""
-
-        attack_strategies: List[str] = []
-        if isinstance(red_team_info, dict):
-            attack_strategies = sorted(str(strategy) for strategy in red_team_info.keys())
-
-        item_generation_params: Dict[str, Any] = {"type": "red_team"}
-        if attack_strategies:
-            item_generation_params["attack_strategies"] = attack_strategies
-
-        # Attempt to infer turns from parameters if available
-        num_turns = parameters.get("max_turns") if isinstance(parameters, dict) else None
-        if isinstance(num_turns, int) and num_turns > 0:
-            item_generation_params["num_turns"] = num_turns
-
-        data_source: Dict[str, Any] = {"type": "azure_ai_red_team", "target": {}}
-        if item_generation_params:
-            data_source["item_generation_params"] = item_generation_params
-
-        return data_source
-
-    def _determine_run_status(
-        self,
-        scan_result: Dict[str, Any],
-        red_team_info: Optional[Dict],
-        output_items: List[Dict[str, Any]],
-    ) -> str:
-        """Determine the run-level status based on red team info status values."""
-
-        # Check if any tasks are still incomplete/failed
-        if isinstance(red_team_info, dict):
-            for risk_data in red_team_info.values():
-                if not isinstance(risk_data, dict):
-                    continue
-                for details in risk_data.values():
-                    if not isinstance(details, dict):
-                        continue
-                    status = details.get("status", "").lower()
-                    if status in ("incomplete", "failed", "timeout"):
-                        return "failed"
-                    elif status in ("running", "pending"):
-                        return "in_progress"
-
-        return "completed"
-
-    def _build_results_payload(
-        self,
-        redteam_result: RedTeamResult,
-        eval_run: Optional[Any] = None,
-        red_team_info: Optional[Dict] = None,
-        include_conversations: bool = False,
-        scan_name: Optional[str] = None,
-    ) -> RedTeamRun:
-        """Assemble the new structure for results.json with eval.run format."""
-
-        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
-        output_items = cast(List[Dict[str, Any]], scan_result.get("output_items") or [])
-        scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
-        parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
-
-        run_id = self._run_id_override
-        eval_id = self._eval_id_override
-        run_name: Optional[str] = None
-        created_at = self._created_at_override
-
-        if eval_run is not None:
-            run_info = getattr(eval_run, "info", None)
-
-            if run_id is None:
-                candidate_run_id = (
-                    getattr(run_info, "run_id", None)
-                    or getattr(eval_run, "run_id", None)
-                    or getattr(eval_run, "id", None)
-                )
-                if candidate_run_id is not None:
-                    run_id = str(candidate_run_id)
-
-            if eval_id is None:
-                candidate_eval_id = (
-                    getattr(run_info, "experiment_id", None)
-                    or getattr(eval_run, "experiment_id", None)
-                    or getattr(eval_run, "eval_id", None)
-                )
-                if candidate_eval_id is not None:
-                    eval_id = str(candidate_eval_id)
-
-            if run_name is None:
-                candidate_run_name = (
-                    getattr(run_info, "run_name", None)
-                    or getattr(eval_run, "run_name", None)
-                    or getattr(eval_run, "display_name", None)
-                    or getattr(eval_run, "name", None)
-                )
-                if candidate_run_name is not None:
-                    run_name = str(candidate_run_name)
-
-            if created_at is None:
-                raw_created = (
-                    getattr(run_info, "created_time", None)
-                    or getattr(eval_run, "created_at", None)
-                    or getattr(eval_run, "created_time", None)
-                )
-                if isinstance(raw_created, datetime):
-                    created_at = int(raw_created.timestamp())
-                elif isinstance(raw_created, (int, float)):
-                    created_at = int(raw_created)
-                elif isinstance(raw_created, str):
-                    try:
-                        created_at = int(float(raw_created))
-                    except ValueError:
-                        created_at = None
-
-        if run_id is None:
-            run_id = str(uuid.uuid4())
-        if eval_id is None:
-            eval_id = str(uuid.uuid4())
-        if created_at is None:
-            created_at = int(datetime.now().timestamp())
-        if run_name is None:
-            run_name = scan_name or f"redteam-run-{run_id[:8]}"
-
-        result_count = self._compute_result_count(output_items)
-        per_testing_results = self._compute_per_testing_criteria(output_items)
-        data_source = self._build_data_source_section(parameters, red_team_info)
-        status = self._determine_run_status(scan_result, red_team_info, output_items)
-
-        list_wrapper: OutputItemsList = {
-            "object": "list",
-            "data": output_items,
-        }
-
-        run_payload: RedTeamRun = {
-            "object": "eval.run",
-            "id": run_id,
-            "eval_id": eval_id,
-            "created_at": created_at,
-            "status": status,
-            "name": run_name,
-            "report_url": scan_result.get("studio_url") or self.ai_studio_url,
-            "data_source": data_source,
-            "metadata": {},
-            "result_count": result_count,
-            "per_model_usage": [],
-            "per_testing_criteria_results": per_testing_results,
-            "output_items": list_wrapper,
-        }
-
-        if include_conversations:
-            run_payload["conversations"] = redteam_result.attack_details or scan_result.get("attack_details") or []
-
-        return run_payload
-
-    def _build_results_payload(
-        self,
-        redteam_result: RedTeamResult,
-        eval_run: Optional[Any] = None,
-        red_team_info: Optional[Dict] = None,
-        include_conversations: bool = False,
-        scan_name: Optional[str] = None,
-    ) -> RedTeamRun:
-        """Assemble the new structure for results.json with eval.run format."""
-
-        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
-        output_items = cast(List[Dict[str, Any]], scan_result.get("output_items") or [])
-        scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
-        parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
-
-        run_id = self._run_id_override
-        eval_id = self._eval_id_override
-        run_name: Optional[str] = None
-        created_at = self._created_at_override
-
-        if eval_run is not None:
-            run_info = getattr(eval_run, "info", None)
-
-            if run_id is None:
-                candidate_run_id = (
-                    getattr(run_info, "run_id", None)
-                    or getattr(eval_run, "run_id", None)
-                    or getattr(eval_run, "id", None)
-                )
-                if candidate_run_id is not None:
-                    run_id = str(candidate_run_id)
-
-            if eval_id is None:
-                candidate_eval_id = (
-                    getattr(run_info, "experiment_id", None)
-                    or getattr(eval_run, "experiment_id", None)
-                    or getattr(eval_run, "eval_id", None)
-                )
-                if candidate_eval_id is not None:
-                    eval_id = str(candidate_eval_id)
-
-            if run_name is None:
-                candidate_run_name = (
-                    getattr(run_info, "run_name", None)
-                    or getattr(eval_run, "run_name", None)
-                    or getattr(eval_run, "display_name", None)
-                    or getattr(eval_run, "name", None)
-                )
-                if candidate_run_name is not None:
-                    run_name = str(candidate_run_name)
-
-            if created_at is None:
-                raw_created = (
-                    getattr(run_info, "created_time", None)
-                    or getattr(eval_run, "created_at", None)
-                    or getattr(eval_run, "created_time", None)
-                )
-                if isinstance(raw_created, datetime):
-                    created_at = int(raw_created.timestamp())
-                elif isinstance(raw_created, (int, float)):
-                    created_at = int(raw_created)
-                elif isinstance(raw_created, str):
-                    try:
-                        created_at = int(float(raw_created))
-                    except ValueError:
-                        created_at = None
-
-        if run_id is None:
-            run_id = str(uuid.uuid4())
-        if eval_id is None:
-            eval_id = str(uuid.uuid4())
-        if created_at is None:
-            created_at = int(datetime.now().timestamp())
-        if run_name is None:
-            run_name = scan_name or f"redteam-run-{run_id[:8]}"
-
-        result_count = self._compute_result_count(output_items)
-        per_testing_results = self._compute_per_testing_criteria(output_items)
-        data_source = self._build_data_source_section(parameters, red_team_info)
-        status = self._determine_run_status(scan_result, red_team_info, output_items)
-
-        list_wrapper: OutputItemsList = {
-            "object": "list",
-            "data": output_items,
-        }
-
-        run_payload: RedTeamRun = {
-            "object": "eval.run",
-            "id": run_id,
-            "eval_id": eval_id,
-            "created_at": created_at,
-            "status": status,
-            "name": run_name,
-            "report_url": scan_result.get("studio_url") or self.ai_studio_url,
-            "data_source": data_source,
-            "metadata": {},
-            "result_count": result_count,
-            "per_model_usage": [],
-            "per_testing_criteria_results": per_testing_results,
-            "output_items": list_wrapper,
-        }
-
-        if include_conversations:
-            run_payload["conversations"] = redteam_result.attack_details or scan_result.get("attack_details") or []
-
-        return run_payload
-
     def _build_instance_results_payload(
         self,
         redteam_result: RedTeamResult,
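For orientation, the removed _build_results_payload (defined twice in 1.12.0 and deleted in both places above) assembled an "eval.run"-shaped dictionary; under 1.13.0 an equivalent structure is expected to arrive pre-built as aoai_summary. A rough sketch of that shape, reconstructed from the removed code above with placeholder values only:

    import time
    import uuid

    # Illustrative RedTeamRun-style payload (placeholder values, reconstructed from the
    # removed _build_results_payload above; no longer produced by MLflowIntegration itself).
    run_payload = {
        "object": "eval.run",
        "id": str(uuid.uuid4()),
        "eval_id": str(uuid.uuid4()),
        "created_at": int(time.time()),
        "status": "completed",  # or "failed" / "in_progress" when tasks did not finish
        "name": "redteam-run-example",
        "report_url": None,  # studio URL when available
        "data_source": {
            "type": "azure_ai_red_team",
            "target": {},
            "item_generation_params": {"type": "red_team", "attack_strategies": [], "num_turns": 1},
        },
        "metadata": {},
        "result_count": {"total": 0, "passed": 0, "failed": 0, "errored": 0},
        "per_model_usage": [],
        "per_testing_criteria_results": [],  # entries like {"testing_criteria": name, "passed": n, "failed": m}
        "output_items": {"object": "list", "data": []},
        # "conversations": [...] appears only when conversations are included
    }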
@@ -726,15 +408,22 @@ class MLflowIntegration:
 
         # Return the scan_result directly for legacy compatibility
         # This maintains the old format that was expected previously
-        legacy_payload = scan_result.copy() if scan_result else {}
+        # Filter out AOAI_Compatible properties - those belong in results.json only
+        legacy_payload = (
+            {
+                k: v
+                for k, v in scan_result.items()
+                if k not in ["AOAI_Compatible_Summary", "AOAI_Compatible_Row_Results"]
+            }
+            if scan_result
+            else {}
+        )
 
         # Ensure we have the basic required fields
         if "scorecard" not in legacy_payload:
             legacy_payload["scorecard"] = {}
         if "parameters" not in legacy_payload:
             legacy_payload["parameters"] = {}
-        if "output_items" not in legacy_payload:
-            legacy_payload["output_items"] = []
         if "attack_details" not in legacy_payload:
             legacy_payload["attack_details"] = redteam_result.attack_details or []