azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_mlflow_integration.py

@@ -169,6 +169,7 @@ class MLflowIntegration:
         eval_run: EvalRun,
         red_team_info: Dict,
         _skip_evals: bool = False,
+        aoai_summary: Optional["RedTeamRun"] = None,
     ) -> Optional[str]:
         """Log the Red Team Agent results to MLFlow.

@@ -180,6 +181,8 @@ class MLflowIntegration:
         :type red_team_info: Dict
         :param _skip_evals: Whether to log only data without evaluation results
         :type _skip_evals: bool
+        :param aoai_summary: Pre-built AOAI-compatible summary (optional, will be built if not provided)
+        :type aoai_summary: Optional[RedTeamRun]
         :return: The URL to the run in Azure AI Studio, if available
         :rtype: Optional[str]
         """
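The updated docstring still describes `aoai_summary` as optional ("will be built if not provided"), but the new body in the hunks below treats a missing value as a caller error. A minimal standalone sketch of that contract (illustrative only, not the library API; the function name is made up):

```python
from typing import Any, Dict, Optional


def require_aoai_summary(aoai_summary: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    # Mirrors the new behavior: Optional in the signature, but a None value is
    # rejected instead of triggering an internal rebuild of the summary.
    if aoai_summary is None:
        raise ValueError("aoai_summary parameter is required but was not provided")
    return dict(aoai_summary)  # work on a shallow copy, as the diff does


# A minimal eval.run-shaped summary passes the check.
payload = require_aoai_summary({"object": "eval.run", "id": "run-123"})
print(payload["id"])  # run-123
```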
@@ -195,13 +198,12 @@ class MLflowIntegration:
             results_path = os.path.join(self.scan_output_dir, results_name)
             self.logger.debug(f"Saving results to scan output directory: {results_path}")
             with open(results_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-
-
-
-
-
-
-                )
+                # Use provided aoai_summary
+                if aoai_summary is None:
+                    self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                    raise ValueError("aoai_summary parameter is required but was not provided")
+
+                payload = dict(aoai_summary)  # Make a copy
                 json.dump(payload, f)

             # Save legacy format as instance_results.json
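`dict(aoai_summary)  # Make a copy` is a shallow copy: popping or adding top-level keys (as the later hunks do with `conversations`) does not mutate the caller's summary, but nested structures such as `output_items` remain shared. A quick illustration, with a hypothetical summary value:

```python
aoai_summary = {"object": "eval.run", "conversations": [{"turn": 1}], "output_items": {"data": []}}

payload = dict(aoai_summary)        # shallow copy of the top level
payload.pop("conversations", None)  # safe: the caller's dict keeps its key

print("conversations" in aoai_summary)                          # True
print(payload["output_items"] is aoai_summary["output_items"])  # True -> nested objects are shared
```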
@@ -247,13 +249,14 @@ class MLflowIntegration:
                 "w",
                 encoding=DefaultOpenEncoding.WRITE,
             ) as f:
-
-
-
-
-
-
-
+                # Use provided aoai_summary (required)
+                if aoai_summary is None:
+                    self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                    raise ValueError("aoai_summary parameter is required but was not provided")
+
+                payload = dict(aoai_summary)  # Make a copy
+                # Remove conversations for MLFlow artifact
+                payload.pop("conversations", None)
                 json.dump(payload, f)

                 # Also create legacy instance_results.json for compatibility
@@ -297,13 +300,19 @@ class MLflowIntegration:
             # Use temporary directory as before if no scan output directory exists
             results_file = Path(tmpdir) / results_name
             with open(results_file, "w", encoding=DefaultOpenEncoding.WRITE) as f:
-
-
-
-
-
-
-
+                # Use provided aoai_summary (required)
+                if aoai_summary is None:
+                    self.logger.error("aoai_summary must be provided to log_redteam_results_to_mlflow")
+                    raise ValueError("aoai_summary parameter is required but was not provided")
+
+                payload = dict(aoai_summary)  # Make a copy
+                # Include conversations only if _skip_evals is True
+                if _skip_evals and "conversations" not in payload:
+                    payload["conversations"] = (
+                        redteam_result.attack_details or redteam_result.scan_result.get("attack_details") or []
+                    )
+                elif not _skip_evals:
+                    payload.pop("conversations", None)
                 json.dump(payload, f)
             self.logger.debug(f"Logged artifact: {results_name}")

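Taken together, the three write paths now emit the same `aoai_summary` payload and differ only in how they treat `conversations`: the MLflow artifact drops them, while the temporary-directory path keeps them only when `_skip_evals` is set. A standalone sketch of that branch (function and variable names are illustrative):

```python
from typing import Any, Dict, List, Optional


def finalize_conversations(
    payload: Dict[str, Any],
    skip_evals: bool,
    attack_details: Optional[List[Dict[str, Any]]],
) -> Dict[str, Any]:
    # Data-only runs keep conversations; evaluated runs strip them from the payload.
    if skip_evals and "conversations" not in payload:
        payload["conversations"] = attack_details or []
    elif not skip_evals:
        payload.pop("conversations", None)
    return payload


print(finalize_conversations({"object": "eval.run"}, skip_evals=True, attack_details=[{"turn": 1}]))
# {'object': 'eval.run', 'conversations': [{'turn': 1}]}
print(finalize_conversations({"object": "eval.run", "conversations": []}, skip_evals=False, attack_details=None))
# {'object': 'eval.run'}
```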
@@ -343,7 +352,7 @@ class MLflowIntegration:
         try:
             create_evaluation_result_response = (
                 self.generated_rai_client._evaluation_onedp_client.create_evaluation_result(
-                    name=uuid.uuid4(),
+                    name=str(uuid.uuid4()),
                     path=tmpdir,
                     metrics=metrics,
                     result_type=ResultType.REDTEAM,
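A small change with a concrete effect: `uuid.UUID` objects are not JSON serializable and are not `str`, so passing the raw UUID where the service client expects a string name can fail. That is presumably what motivated the fix; a quick check of the difference:

```python
import json
import uuid

raw = uuid.uuid4()
try:
    json.dumps({"name": raw})
except TypeError as exc:
    print(exc)  # Object of type UUID is not JSON serializable

print(json.dumps({"name": str(raw)}))  # serializes cleanly once converted to str
```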
@@ -386,333 +395,6 @@ class MLflowIntegration:
         self.logger.info("Successfully logged results to AI Foundry")
         return None

-    @staticmethod
-    def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
-        """Aggregate run-level pass/fail counts from individual output items."""
-
-        total = len(output_items)
-        passed = failed = errored = 0
-
-        for item in output_items:
-            item_status: Optional[bool] = None
-            for result in item.get("results", []):
-                result_properties = result.get("properties", {}) if isinstance(result, dict) else {}
-                attack_success = result_properties.get("attack_success")
-                if attack_success is True:
-                    item_status = False
-                    break
-                if attack_success is False:
-                    item_status = True
-                elif item_status is None and result.get("passed") is not None:
-                    item_status = bool(result.get("passed"))
-
-            if item_status is True:
-                passed += 1
-            elif item_status is False:
-                failed += 1
-            else:
-                errored += 1
-
-        return {
-            "total": total,
-            "passed": passed,
-            "failed": failed,
-            "errored": errored,
-        }
-
-    @staticmethod
-    def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Build aggregated pass/fail counts per testing criteria (risk category)."""
-
-        criteria: Dict[str, Dict[str, int]] = {}
-
-        for item in output_items:
-            for result in item.get("results", []):
-                if not isinstance(result, dict):
-                    continue
-                name = result.get("name")
-                if not name:
-                    continue
-                passed_value = result.get("passed")
-                if passed_value is None:
-                    continue
-
-                bucket = criteria.setdefault(str(name), {"passed": 0, "failed": 0})
-                if passed_value:
-                    bucket["passed"] += 1
-                else:
-                    bucket["failed"] += 1
-
-        return [
-            {
-                "testing_criteria": criteria_name,
-                "passed": counts["passed"],
-                "failed": counts["failed"],
-            }
-            for criteria_name, counts in sorted(criteria.items())
-        ]
-
-    @staticmethod
-    def _build_data_source_section(parameters: Dict[str, Any], red_team_info: Optional[Dict]) -> Dict[str, Any]:
-        """Build the data_source portion of the run payload for red-team scans."""
-
-        attack_strategies: List[str] = []
-        if isinstance(red_team_info, dict):
-            attack_strategies = sorted(str(strategy) for strategy in red_team_info.keys())
-
-        item_generation_params: Dict[str, Any] = {"type": "red_team"}
-        if attack_strategies:
-            item_generation_params["attack_strategies"] = attack_strategies
-
-        # Attempt to infer turns from parameters if available
-        num_turns = parameters.get("max_turns") if isinstance(parameters, dict) else None
-        if isinstance(num_turns, int) and num_turns > 0:
-            item_generation_params["num_turns"] = num_turns
-
-        data_source: Dict[str, Any] = {"type": "azure_ai_red_team", "target": {}}
-        if item_generation_params:
-            data_source["item_generation_params"] = item_generation_params
-
-        return data_source
-
-    def _determine_run_status(
-        self,
-        scan_result: Dict[str, Any],
-        red_team_info: Optional[Dict],
-        output_items: List[Dict[str, Any]],
-    ) -> str:
-        """Determine the run-level status based on red team info status values."""
-
-        # Check if any tasks are still incomplete/failed
-        if isinstance(red_team_info, dict):
-            for risk_data in red_team_info.values():
-                if not isinstance(risk_data, dict):
-                    continue
-                for details in risk_data.values():
-                    if not isinstance(details, dict):
-                        continue
-                    status = details.get("status", "").lower()
-                    if status in ("incomplete", "failed", "timeout"):
-                        return "failed"
-                    elif status in ("running", "pending"):
-                        return "in_progress"
-
-        return "completed"
-
-    def _build_results_payload(
-        self,
-        redteam_result: RedTeamResult,
-        eval_run: Optional[Any] = None,
-        red_team_info: Optional[Dict] = None,
-        include_conversations: bool = False,
-        scan_name: Optional[str] = None,
-    ) -> RedTeamRun:
-        """Assemble the new structure for results.json with eval.run format."""
-
-        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
-        output_items = cast(List[Dict[str, Any]], scan_result.get("output_items") or [])
-        scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
-        parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
-
-        run_id = self._run_id_override
-        eval_id = self._eval_id_override
-        run_name: Optional[str] = None
-        created_at = self._created_at_override
-
-        if eval_run is not None:
-            run_info = getattr(eval_run, "info", None)
-
-            if run_id is None:
-                candidate_run_id = (
-                    getattr(run_info, "run_id", None)
-                    or getattr(eval_run, "run_id", None)
-                    or getattr(eval_run, "id", None)
-                )
-                if candidate_run_id is not None:
-                    run_id = str(candidate_run_id)
-
-            if eval_id is None:
-                candidate_eval_id = (
-                    getattr(run_info, "experiment_id", None)
-                    or getattr(eval_run, "experiment_id", None)
-                    or getattr(eval_run, "eval_id", None)
-                )
-                if candidate_eval_id is not None:
-                    eval_id = str(candidate_eval_id)
-
-            if run_name is None:
-                candidate_run_name = (
-                    getattr(run_info, "run_name", None)
-                    or getattr(eval_run, "run_name", None)
-                    or getattr(eval_run, "display_name", None)
-                    or getattr(eval_run, "name", None)
-                )
-                if candidate_run_name is not None:
-                    run_name = str(candidate_run_name)
-
-            if created_at is None:
-                raw_created = (
-                    getattr(run_info, "created_time", None)
-                    or getattr(eval_run, "created_at", None)
-                    or getattr(eval_run, "created_time", None)
-                )
-                if isinstance(raw_created, datetime):
-                    created_at = int(raw_created.timestamp())
-                elif isinstance(raw_created, (int, float)):
-                    created_at = int(raw_created)
-                elif isinstance(raw_created, str):
-                    try:
-                        created_at = int(float(raw_created))
-                    except ValueError:
-                        created_at = None
-
-        if run_id is None:
-            run_id = str(uuid.uuid4())
-        if eval_id is None:
-            eval_id = str(uuid.uuid4())
-        if created_at is None:
-            created_at = int(datetime.now().timestamp())
-        if run_name is None:
-            run_name = scan_name or f"redteam-run-{run_id[:8]}"
-
-        result_count = self._compute_result_count(output_items)
-        per_testing_results = self._compute_per_testing_criteria(output_items)
-        data_source = self._build_data_source_section(parameters, red_team_info)
-        status = self._determine_run_status(scan_result, red_team_info, output_items)
-
-        list_wrapper: OutputItemsList = {
-            "object": "list",
-            "data": output_items,
-        }
-
-        run_payload: RedTeamRun = {
-            "object": "eval.run",
-            "id": run_id,
-            "eval_id": eval_id,
-            "created_at": created_at,
-            "status": status,
-            "name": run_name,
-            "report_url": scan_result.get("studio_url") or self.ai_studio_url,
-            "data_source": data_source,
-            "metadata": {},
-            "result_count": result_count,
-            "per_model_usage": [],
-            "per_testing_criteria_results": per_testing_results,
-            "output_items": list_wrapper,
-        }
-
-        if include_conversations:
-            run_payload["conversations"] = redteam_result.attack_details or scan_result.get("attack_details") or []
-
-        return run_payload
-
-    def _build_results_payload(
-        self,
-        redteam_result: RedTeamResult,
-        eval_run: Optional[Any] = None,
-        red_team_info: Optional[Dict] = None,
-        include_conversations: bool = False,
-        scan_name: Optional[str] = None,
-    ) -> RedTeamRun:
-        """Assemble the new structure for results.json with eval.run format."""
-
-        scan_result = cast(Dict[str, Any], redteam_result.scan_result or {})
-        output_items = cast(List[Dict[str, Any]], scan_result.get("output_items") or [])
-        scorecard = cast(Dict[str, Any], scan_result.get("scorecard") or {})
-        parameters = cast(Dict[str, Any], scan_result.get("parameters") or {})
-
-        run_id = self._run_id_override
-        eval_id = self._eval_id_override
-        run_name: Optional[str] = None
-        created_at = self._created_at_override
-
-        if eval_run is not None:
-            run_info = getattr(eval_run, "info", None)
-
-            if run_id is None:
-                candidate_run_id = (
-                    getattr(run_info, "run_id", None)
-                    or getattr(eval_run, "run_id", None)
-                    or getattr(eval_run, "id", None)
-                )
-                if candidate_run_id is not None:
-                    run_id = str(candidate_run_id)
-
-            if eval_id is None:
-                candidate_eval_id = (
-                    getattr(run_info, "experiment_id", None)
-                    or getattr(eval_run, "experiment_id", None)
-                    or getattr(eval_run, "eval_id", None)
-                )
-                if candidate_eval_id is not None:
-                    eval_id = str(candidate_eval_id)
-
-            if run_name is None:
-                candidate_run_name = (
-                    getattr(run_info, "run_name", None)
-                    or getattr(eval_run, "run_name", None)
-                    or getattr(eval_run, "display_name", None)
-                    or getattr(eval_run, "name", None)
-                )
-                if candidate_run_name is not None:
-                    run_name = str(candidate_run_name)
-
-            if created_at is None:
-                raw_created = (
-                    getattr(run_info, "created_time", None)
-                    or getattr(eval_run, "created_at", None)
-                    or getattr(eval_run, "created_time", None)
-                )
-                if isinstance(raw_created, datetime):
-                    created_at = int(raw_created.timestamp())
-                elif isinstance(raw_created, (int, float)):
-                    created_at = int(raw_created)
-                elif isinstance(raw_created, str):
-                    try:
-                        created_at = int(float(raw_created))
-                    except ValueError:
-                        created_at = None
-
-        if run_id is None:
-            run_id = str(uuid.uuid4())
-        if eval_id is None:
-            eval_id = str(uuid.uuid4())
-        if created_at is None:
-            created_at = int(datetime.now().timestamp())
-        if run_name is None:
-            run_name = scan_name or f"redteam-run-{run_id[:8]}"
-
-        result_count = self._compute_result_count(output_items)
-        per_testing_results = self._compute_per_testing_criteria(output_items)
-        data_source = self._build_data_source_section(parameters, red_team_info)
-        status = self._determine_run_status(scan_result, red_team_info, output_items)
-
-        list_wrapper: OutputItemsList = {
-            "object": "list",
-            "data": output_items,
-        }
-
-        run_payload: RedTeamRun = {
-            "object": "eval.run",
-            "id": run_id,
-            "eval_id": eval_id,
-            "created_at": created_at,
-            "status": status,
-            "name": run_name,
-            "report_url": scan_result.get("studio_url") or self.ai_studio_url,
-            "data_source": data_source,
-            "metadata": {},
-            "result_count": result_count,
-            "per_model_usage": [],
-            "per_testing_criteria_results": per_testing_results,
-            "output_items": list_wrapper,
-        }
-
-        if include_conversations:
-            run_payload["conversations"] = redteam_result.attack_details or scan_result.get("attack_details") or []
-
-        return run_payload
-
     def _build_instance_results_payload(
         self,
         redteam_result: RedTeamResult,
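The aggregation helpers removed here (along with the duplicated `_build_results_payload` definition) no longer live on `MLflowIntegration`; given the large additions to `_result_processor.py` in this release, the payload assembly has presumably moved there. For reference, the pass/fail counting semantics of the removed `_compute_result_count`, condensed into a standalone, runnable sketch:

```python
from typing import Any, Dict, List, Optional


def compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
    # An item is failed if any result records attack_success=True, passed if a
    # result records attack_success=False, otherwise it falls back to the
    # per-result "passed" flag; items with no signal at all count as errored.
    total = len(output_items)
    passed = failed = errored = 0
    for item in output_items:
        item_status: Optional[bool] = None
        for result in item.get("results", []):
            props = result.get("properties", {}) if isinstance(result, dict) else {}
            attack_success = props.get("attack_success")
            if attack_success is True:
                item_status = False
                break
            if attack_success is False:
                item_status = True
            elif item_status is None and result.get("passed") is not None:
                item_status = bool(result.get("passed"))
        if item_status is True:
            passed += 1
        elif item_status is False:
            failed += 1
        else:
            errored += 1
    return {"total": total, "passed": passed, "failed": failed, "errored": errored}


items = [
    {"results": [{"properties": {"attack_success": False}}]},  # attack defended -> passed
    {"results": [{"properties": {"attack_success": True}}]},   # attack succeeded -> failed
    {"results": [{"properties": {}}]},                          # no signal -> errored
]
print(compute_result_count(items))  # {'total': 3, 'passed': 1, 'failed': 1, 'errored': 1}
```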
@@ -726,15 +408,22 @@ class MLflowIntegration:

         # Return the scan_result directly for legacy compatibility
         # This maintains the old format that was expected previously
-
+        # Filter out AOAI_Compatible properties - those belong in results.json only
+        legacy_payload = (
+            {
+                k: v
+                for k, v in scan_result.items()
+                if k not in ["AOAI_Compatible_Summary", "AOAI_Compatible_Row_Results"]
+            }
+            if scan_result
+            else {}
+        )

         # Ensure we have the basic required fields
         if "scorecard" not in legacy_payload:
             legacy_payload["scorecard"] = {}
         if "parameters" not in legacy_payload:
             legacy_payload["parameters"] = {}
-        if "output_items" not in legacy_payload:
-            legacy_payload["output_items"] = []
         if "attack_details" not in legacy_payload:
             legacy_payload["attack_details"] = redteam_result.attack_details or []

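The legacy `instance_results.json` payload now excludes the AOAI-compatible sections, which ship only in `results.json`. A standalone sketch of that filter (names outside the key strings are illustrative):

```python
from typing import Any, Dict


def strip_aoai_keys(scan_result: Dict[str, Any]) -> Dict[str, Any]:
    # Keep every key except the AOAI-compatible sections; an empty or missing
    # scan_result falls back to an empty payload, as in the diff.
    return (
        {
            k: v
            for k, v in scan_result.items()
            if k not in ["AOAI_Compatible_Summary", "AOAI_Compatible_Row_Results"]
        }
        if scan_result
        else {}
    )


scan_result = {"scorecard": {}, "AOAI_Compatible_Summary": {"object": "eval.run"}}
print(strip_aoai_keys(scan_result))  # {'scorecard': {}}
```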