invarlock-0.3.1-py3-none-any.whl → invarlock-0.3.3-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/tiers.yaml +61 -0
  3. invarlock/adapters/hf_loading.py +97 -0
  4. invarlock/calibration/__init__.py +6 -0
  5. invarlock/calibration/spectral_null.py +301 -0
  6. invarlock/calibration/variance_ve.py +154 -0
  7. invarlock/cli/app.py +15 -0
  8. invarlock/cli/commands/calibrate.py +576 -0
  9. invarlock/cli/commands/doctor.py +9 -3
  10. invarlock/cli/commands/explain_gates.py +53 -9
  11. invarlock/cli/commands/plugins.py +12 -2
  12. invarlock/cli/commands/run.py +181 -79
  13. invarlock/cli/commands/verify.py +40 -0
  14. invarlock/cli/config.py +11 -1
  15. invarlock/cli/determinism.py +252 -0
  16. invarlock/core/auto_tuning.py +215 -17
  17. invarlock/core/bootstrap.py +137 -5
  18. invarlock/core/registry.py +9 -4
  19. invarlock/core/runner.py +305 -35
  20. invarlock/eval/bench.py +467 -141
  21. invarlock/eval/bench_regression.py +12 -0
  22. invarlock/eval/bootstrap.py +3 -1
  23. invarlock/eval/data.py +29 -7
  24. invarlock/eval/primary_metric.py +20 -5
  25. invarlock/guards/rmt.py +536 -46
  26. invarlock/guards/spectral.py +217 -10
  27. invarlock/guards/variance.py +124 -42
  28. invarlock/reporting/certificate.py +476 -45
  29. invarlock/reporting/certificate_schema.py +4 -1
  30. invarlock/reporting/guards_analysis.py +108 -10
  31. invarlock/reporting/normalizer.py +24 -1
  32. invarlock/reporting/policy_utils.py +97 -15
  33. invarlock/reporting/primary_metric_utils.py +17 -0
  34. invarlock/reporting/validate.py +10 -10
  35. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
  36. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
  37. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
  38. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
  39. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
  40. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
invarlock/eval/bench.py CHANGED
@@ -116,6 +116,7 @@ class ScenarioResult:
  config: ScenarioConfig
  bare_result: RunResult | None = None
  guarded_result: RunResult | None = None
+ artifacts: dict[str, Any] = field(default_factory=dict)
  metrics: dict[str, Any] = field(default_factory=dict)
  gates: dict[str, bool] = field(default_factory=dict)
  skipped: bool = False
@@ -269,6 +270,7 @@ class MetricsAggregator:
  def extract_core_metrics(report: RunReport) -> dict[str, float]:
  """Extract core metrics from a RunReport (primary_metric-first)."""
  metrics = report.get("metrics", {}) or {}
+ meta = report.get("meta", {}) or {}
  pm = metrics.get("primary_metric", {}) if isinstance(metrics, dict) else {}
  pm_preview = float("nan")
  pm_final = float("nan")
@@ -281,29 +283,78 @@ class MetricsAggregator:
  except Exception:
  pm_preview = float("nan")
  pm_final = float("nan")
+ duration_s = float("nan")
+ try:
+ if isinstance(meta, dict):
+ dur = meta.get("duration_s", meta.get("duration"))
+ if isinstance(dur, int | float):
+ duration_s = float(dur)
+ except Exception:
+ duration_s = float("nan")
  return {
  "primary_metric_preview": pm_preview,
  "primary_metric_final": pm_final,
  "latency_ms_per_tok": metrics.get("latency_ms_per_tok", float("nan")),
  "memory_mb_peak": metrics.get("memory_mb_peak", float("nan")),
+ "duration_s": duration_s,
  }

  @staticmethod
  def extract_guard_metrics(report: RunReport) -> dict[str, Any]:
  """Extract guard-specific metrics from a RunReport."""
- guard_metrics = {}
+ guard_metrics: dict[str, Any] = {}
+
+ # Prefer structured guard reports when available
+ guards = report.get("guards", [])
+ if isinstance(guards, list):
+ for guard in guards:
+ if not isinstance(guard, dict):
+ continue
+ name = str(guard.get("name", "")).lower()
+ metrics = (
+ guard.get("metrics", {})
+ if isinstance(guard.get("metrics"), dict)
+ else {}
+ )
+ violations = guard.get("violations", [])
+ if name == "rmt":
+ for key in ("outliers_total", "rmt_outliers", "layers_flagged"):
+ val = metrics.get(key)
+ if isinstance(val, int | float):
+ guard_metrics["rmt_outliers"] = int(val)
+ break
+ if name == "invariants":
+ val = metrics.get("violations_found")
+ if isinstance(val, int | float):
+ guard_metrics["tying_violations_post"] = int(val)
+ elif isinstance(violations, list):
+ guard_metrics["tying_violations_post"] = len(violations)

  # Extract RMT outliers
- rmt_metrics = report.get("metrics", {}).get("rmt", {})
- guard_metrics["rmt_outliers"] = rmt_metrics.get("outliers", 0)
+ if "rmt_outliers" not in guard_metrics:
+ rmt_metrics = report.get("metrics", {}).get("rmt", {})
+ if isinstance(rmt_metrics, dict):
+ guard_metrics["rmt_outliers"] = int(rmt_metrics.get("outliers", 0) or 0)
+ else:
+ guard_metrics["rmt_outliers"] = 0

  # Extract invariant violations
- invariant_metrics = report.get("metrics", {}).get("invariants", {})
- guard_metrics["tying_violations_post"] = invariant_metrics.get("violations", 0)
+ if "tying_violations_post" not in guard_metrics:
+ invariant_metrics = report.get("metrics", {}).get("invariants", {})
+ if isinstance(invariant_metrics, dict):
+ guard_metrics["tying_violations_post"] = int(
+ invariant_metrics.get("violations", 0) or 0
+ )
+ else:
+ guard_metrics["tying_violations_post"] = 0

  # Check if rollback occurred (catastrophic spike)
- guard_metrics["catastrophic_spike"] = report.get("flags", {}).get(
- "guard_recovered", False
+ flags = report.get("flags", {}) or {}
+ meta = report.get("meta", {}) or {}
+ guard_metrics["catastrophic_spike"] = bool(
+ (flags.get("guard_recovered") if isinstance(flags, dict) else False)
+ or (meta.get("guard_recovered") if isinstance(meta, dict) else False)
+ or (meta.get("rollback_reason") if isinstance(meta, dict) else False)
  )

  return guard_metrics
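The rewritten extract_guard_metrics above changes the lookup order: structured entries under report["guards"] take precedence, and the legacy report["metrics"]["rmt"] / report["metrics"]["invariants"] dicts are only consulted when no structured entry supplied a value. A minimal standalone sketch of that precedence (the helper and the toy report below are illustrative, not part of the package):

    # Illustrative reimplementation of the fallback order in the hunk above;
    # the toy `report` dict is made up for demonstration.
    def rmt_outliers(report: dict) -> int:
        for guard in report.get("guards", []) or []:
            if isinstance(guard, dict) and str(guard.get("name", "")).lower() == "rmt":
                metrics = guard.get("metrics") or {}
                for key in ("outliers_total", "rmt_outliers", "layers_flagged"):
                    if isinstance(metrics.get(key), (int, float)):
                        return int(metrics[key])  # structured guard report wins
        rmt = (report.get("metrics") or {}).get("rmt") or {}  # legacy fallback
        return int(rmt.get("outliers", 0) or 0) if isinstance(rmt, dict) else 0

    report = {"guards": [{"name": "rmt", "metrics": {"outliers_total": 3}}],
              "metrics": {"rmt": {"outliers": 7}}}
    print(rmt_outliers(report))  # -> 3, the structured entry is preferred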
@@ -342,6 +393,8 @@ class MetricsAggregator:
  "latency_guarded": guarded_metrics.get(
  "latency_ms_per_tok", float("nan")
  ),
+ "duration_bare_s": bare_metrics.get("duration_s", float("nan")),
+ "duration_guarded_s": guarded_metrics.get("duration_s", float("nan")),
  "mem_bare": bare_metrics.get("memory_mb_peak", float("nan")),
  "mem_guarded": guarded_metrics.get("memory_mb_peak", float("nan")),
  }
@@ -355,17 +408,30 @@ class MetricsAggregator:
  else:
  comparison["primary_metric_overhead"] = float("nan")

- latency_bare = comparison["latency_bare"]
- latency_guarded = comparison["latency_guarded"]
+ # Prefer end-to-end pipeline duration when available; fall back to per-token latency
+ duration_bare = comparison.get("duration_bare_s", float("nan"))
+ duration_guarded = comparison.get("duration_guarded_s", float("nan"))
  if (
- not (math.isnan(latency_bare) or math.isnan(latency_guarded))
- and latency_bare > 0
+ isinstance(duration_bare, int | float)
+ and isinstance(duration_guarded, int | float)
+ and not (math.isnan(duration_bare) or math.isnan(duration_guarded))
+ and float(duration_bare) > 0
  ):
  comparison["guard_overhead_time"] = (
- latency_guarded - latency_bare
- ) / latency_bare
+ float(duration_guarded) - float(duration_bare)
+ ) / float(duration_bare)
  else:
- comparison["guard_overhead_time"] = float("nan")
+ latency_bare = comparison["latency_bare"]
+ latency_guarded = comparison["latency_guarded"]
+ if (
+ not (math.isnan(latency_bare) or math.isnan(latency_guarded))
+ and latency_bare > 0
+ ):
+ comparison["guard_overhead_time"] = (
+ latency_guarded - latency_bare
+ ) / latency_bare
+ else:
+ comparison["guard_overhead_time"] = float("nan")

  mem_bare = comparison["mem_bare"]
  mem_guarded = comparison["mem_guarded"]
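The comparison hunk above now derives guard_overhead_time from end-to-end run durations when both are present and positive, and only falls back to per-token latency otherwise. A standalone sketch of that decision rule with made-up numbers (not the packaged MetricsAggregator):

    import math

    # Illustration of the duration-first overhead rule shown in the hunk above.
    def guard_overhead_time(dur_bare, dur_guarded, lat_bare, lat_guarded):
        if (isinstance(dur_bare, (int, float)) and isinstance(dur_guarded, (int, float))
                and not (math.isnan(dur_bare) or math.isnan(dur_guarded)) and dur_bare > 0):
            return (dur_guarded - dur_bare) / dur_bare      # prefer pipeline duration
        if not (math.isnan(lat_bare) or math.isnan(lat_guarded)) and lat_bare > 0:
            return (lat_guarded - lat_bare) / lat_bare      # fall back to latency
        return float("nan")

    print(guard_overhead_time(10.0, 10.8, 12.0, 13.0))          # 0.08 (durations used)
    print(guard_overhead_time(float("nan"), 10.8, 12.0, 13.0))  # ~0.083 (latency fallback)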
@@ -506,141 +572,333 @@ def execute_single_run(
  scenario: ScenarioConfig,
  run_type: str,
  output_dir: Path,
+ *,
+ runtime: dict[str, Any] | None = None,
  ) -> RunResult:
  """Execute a single benchmark run and return results."""
  try:
- # For now, create a mock run since we don't have the full pipeline
- # In real implementation, this would call the actual InvarLock pipeline
+ # Deferred imports: heavy deps only when executing real pipeline
+ from invarlock.core.api import RunConfig as _RunConfig
+ from invarlock.core.auto_tuning import get_tier_policies as _get_tier_policies
+ from invarlock.core.registry import get_registry as _get_registry
+ from invarlock.core.runner import CoreRunner as _CoreRunner
+ from invarlock.eval.data import get_provider as _get_provider
+ from invarlock.guards.rmt import capture_baseline_mp_stats as _capture_mp_stats
+ from invarlock.guards.rmt import rmt_detect as _rmt_detect
+ from invarlock.model_profile import detect_model_profile as _detect_profile
+
+ def _ensure_dir(path: Path) -> None:
+ path.mkdir(parents=True, exist_ok=True)
+
+ def _write_json(path: Path, payload: dict[str, Any]) -> None:
+ path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+
+ if runtime is None:
+ runtime = {}
+
+ # Resolve shared runtime resources (tokenizer/windows/model snapshot) when absent.
+ adapter = runtime.get("adapter")
+ model = runtime.get("model")
+ baseline_snapshot = runtime.get("baseline_snapshot")
+ pairing_schedule = runtime.get("pairing_schedule")
+ calibration_data = runtime.get("calibration_data")
+ tokenizer_hash = runtime.get("tokenizer_hash")
+ split = runtime.get("split", "validation")
+ dataset_name = runtime.get("dataset_name")
+
+ if not isinstance(dataset_name, str) or not dataset_name:
+ dataset_name = str(
+ run_config.get("dataset", {}).get("provider", "wikitext2")
+ )

- # Create a mock RunReport with realistic values
- report = create_empty_report()
+ # Tokenizer + pairing schedule
+ if not (
+ isinstance(pairing_schedule, dict) and isinstance(calibration_data, list)
+ ):
+ profile = _detect_profile(scenario.model_id, adapter=scenario.adapter)
+ tokenizer, tokenizer_hash = profile.make_tokenizer()
+ provider_kwargs: dict[str, Any] = {}
+ if scenario.device != "auto" and dataset_name == "wikitext2":
+ provider_kwargs["device_hint"] = str(scenario.device)
+ provider = _get_provider(dataset_name, **provider_kwargs)
+ preview_window, final_window = provider.windows(
+ tokenizer=tokenizer,
+ seq_len=scenario.seq_len,
+ stride=scenario.stride,
+ preview_n=scenario.preview_n or 0,
+ final_n=scenario.final_n or 0,
+ seed=scenario.seed,
+ split=split,
+ )
+ prev_ids = list(range(len(preview_window.input_ids)))
+ fin_ids = list(
+ range(
+ len(preview_window.input_ids),
+ len(preview_window.input_ids) + len(final_window.input_ids),
+ )
+ )
+ pairing_schedule = {
+ "preview": {
+ "window_ids": prev_ids,
+ "input_ids": preview_window.input_ids,
+ "attention_masks": preview_window.attention_masks,
+ },
+ "final": {
+ "window_ids": fin_ids,
+ "input_ids": final_window.input_ids,
+ "attention_masks": final_window.attention_masks,
+ },
+ }
+ calibration_data = []
+ for idx, (input_ids, attention_mask) in enumerate(
+ zip(
+ preview_window.input_ids,
+ preview_window.attention_masks,
+ strict=False,
+ )
+ ):
+ calibration_data.append(
+ {
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ "window_id": f"preview::{idx}",
+ }
+ )
+ for idx, (input_ids, attention_mask) in enumerate(
+ zip(final_window.input_ids, final_window.attention_masks, strict=False)
+ ):
+ calibration_data.append(
+ {
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ "window_id": f"final::{idx}",
+ }
+ )
+ runtime["pairing_schedule"] = pairing_schedule
+ runtime["calibration_data"] = calibration_data
+ runtime["tokenizer_hash"] = tokenizer_hash
+ runtime["split"] = split
+ runtime["dataset_name"] = dataset_name
+
+ # Adapter/model snapshot
+ if adapter is None or model is None or baseline_snapshot is None:
+ registry = _get_registry()
+ adapter = registry.get_adapter(scenario.adapter)
+ model = adapter.load_model(scenario.model_id, device=scenario.device)
+ baseline_snapshot = adapter.snapshot(model)
+ runtime["adapter"] = adapter
+ runtime["model"] = model
+ runtime["baseline_snapshot"] = baseline_snapshot
+
+ # Baseline RMT stats (used to compute comparable outlier counts for bare vs guarded)
+ rmt_baseline_mp_stats = runtime.get("rmt_baseline_mp_stats")
+ rmt_baseline_sigmas = runtime.get("rmt_baseline_sigmas")
+ if not isinstance(rmt_baseline_mp_stats, dict) or not isinstance(
+ rmt_baseline_sigmas, dict
+ ):
+ adapter.restore(model, baseline_snapshot)
+ rmt_baseline_mp_stats = _capture_mp_stats(model)
+ rmt_baseline_sigmas = {
+ name: float(stats.get("sigma_base", 0.0) or 0.0)
+ for name, stats in rmt_baseline_mp_stats.items()
+ if isinstance(stats, dict)
+ }
+ runtime["rmt_baseline_mp_stats"] = rmt_baseline_mp_stats
+ runtime["rmt_baseline_sigmas"] = rmt_baseline_sigmas

- # Fill in metadata
- report["meta"]["model_id"] = run_config["model"]["id"]
- report["meta"]["adapter"] = run_config["model"]["adapter"]
- report["meta"]["device"] = run_config["model"]["device"]
- report["meta"]["ts"] = datetime.now().isoformat()
- report["meta"]["seed"] = run_config["dataset"]["seed"]
-
- # Fill in dataset config
- report["data"]["dataset"] = run_config["dataset"]["provider"]
- report["data"]["seq_len"] = run_config["dataset"]["seq_len"]
- report["data"]["stride"] = run_config["dataset"]["stride"]
- report["data"]["preview_n"] = run_config["dataset"]["preview_n"]
- report["data"]["final_n"] = run_config["dataset"]["final_n"]
-
- # Fill in edit info
- report["edit"]["name"] = scenario.edit
- report["edit"]["plan_digest"] = (
- f"mock_digest_{scenario.edit}_{scenario.tier}_{scenario.probes}"
+ tier_policies = _get_tier_policies()
+ tier_policy = tier_policies.get(
+ scenario.tier, tier_policies.get("balanced", {})
  )
+ rmt_policy = tier_policy.get("rmt", {}) if isinstance(tier_policy, dict) else {}
+ rmt_margin = float(rmt_policy.get("margin", 1.5) or 1.5)
+ rmt_deadband = float(rmt_policy.get("deadband", 0.10) or 0.10)
+
+ # Restore baseline model for this run
+ adapter.restore(model, baseline_snapshot)
+
+ run_dir = output_dir / run_type
+ _ensure_dir(run_dir)
+ event_path = run_dir / "events.jsonl"
+
+ # Core objects
+ registry = _get_registry()
+ edit_op = registry.get_edit(scenario.edit)
+
+ guards: list[Any] = []
+ auto_config = None
+ if run_type == "guarded":
+ for guard_name in ("invariants", "spectral", "rmt", "variance"):
+ try:
+ guards.append(registry.get_guard(guard_name))
+ except Exception:
+ continue
+ auto_config = {
+ "tier": scenario.tier,
+ "probes": scenario.probes,
+ "enabled": True,
+ }

- # Mock realistic metrics based on run type and scenario
- if run_type == "bare":
- # Bare runs: no guard overhead, potentially higher PM (ppl-like)
- base_ppl = 45.0 + (hash(f"{scenario.edit}_{scenario.tier}") % 100) / 100.0
- report["metrics"]["primary_metric"] = {
- "kind": "perplexity",
- "preview": base_ppl,
- "final": base_ppl + 1.0,
+ # Wire run context for pairing verification
+ run_context = {
+ "profile": scenario.profile,
+ "dataset": {"provider": dataset_name, "seed": scenario.seed},
+ "pairing_baseline": pairing_schedule,
+ "eval": {"loss": {"resolved_type": "causal"}},
+ "run_id": f"{scenario.edit}-{scenario.tier}-p{scenario.probes}-{run_type}",
+ }
+
+ spike_threshold = float(
+ run_config.get("eval", {}).get("spike_threshold", 2.0) or 2.0
+ )
+ cfg = _RunConfig(
+ device=scenario.device,
+ max_pm_ratio=spike_threshold,
+ spike_threshold=spike_threshold,
+ event_path=event_path,
+ context=run_context,
+ )
+
+ runner = _CoreRunner()
+ core_report = runner.execute(
+ model=model,
+ adapter=adapter,
+ edit=edit_op,
+ guards=guards,
+ config=cfg,
+ calibration_data=calibration_data,
+ auto_config=auto_config,
+ edit_config=run_config.get("edit", {}).get("plan", {}),
+ preview_n=scenario.preview_n,
+ final_n=scenario.final_n,
+ )
+
+ # Convert to evaluation RunReport (dict) for downstream tooling
+ report = create_empty_report()
+ report["meta"].update(
+ {
+ "model_id": scenario.model_id,
+ "adapter": scenario.adapter,
+ "device": str(scenario.device),
+ "commit": "",
+ "seed": scenario.seed,
+ "ts": datetime.now().isoformat(),
  }
- report["metrics"]["latency_ms_per_tok"] = (
- 12.0 + (hash(scenario.tier) % 20) / 10.0
- )
- report["metrics"]["memory_mb_peak"] = 2000.0 + (
- hash(str(scenario.probes)) % 200
- )
- report["metrics"]["rmt"] = {"outliers": 2 + (hash(scenario.edit) % 3)}
- report["metrics"]["invariants"] = {"violations": 0}
- else:
- # Guarded runs: guard overhead, better stability, varies by tier
- tier_factor = {"conservative": 0.95, "balanced": 0.97, "aggressive": 0.99}[
- scenario.tier
- ]
- probe_factor = 1.0 - (
- scenario.probes * 0.01
- ) # Small improvement with probes
-
- base_ppl = 45.0 + (hash(f"{scenario.edit}_{scenario.tier}") % 100) / 100.0
- report["metrics"]["primary_metric"] = {
- "kind": "perplexity",
- "preview": base_ppl * tier_factor,
- "final": base_ppl * tier_factor * probe_factor,
+ )
+ if tokenizer_hash:
+ report["meta"]["tokenizer_hash"] = tokenizer_hash
+ dur = core_report.meta.get("duration") if hasattr(core_report, "meta") else None
+ if isinstance(dur, int | float):
+ report["meta"]["duration_s"] = float(dur)
+
+ report["data"].update(
+ {
+ "dataset": dataset_name,
+ "split": split,
+ "seq_len": scenario.seq_len,
+ "stride": scenario.stride,
+ "preview_n": int(scenario.preview_n or 0),
+ "final_n": int(scenario.final_n or 0),
  }
+ )

- # Guard overhead varies by tier
- time_overhead = {
- "conservative": 0.12,
- "balanced": 0.08,
- "aggressive": 0.05,
- }[scenario.tier]
- mem_overhead = {"conservative": 0.08, "balanced": 0.06, "aggressive": 0.04}[
- scenario.tier
- ]
-
- report["metrics"]["latency_ms_per_tok"] = (
- 12.0 + (hash(scenario.tier) % 20) / 10.0
- ) * (1 + time_overhead)
- report["metrics"]["memory_mb_peak"] = (
- 2000.0 + (hash(str(scenario.probes)) % 200)
- ) * (1 + mem_overhead)
- report["metrics"]["rmt"] = {
- "outliers": max(
- 0,
- 2
- + (hash(scenario.edit) % 3)
- - (1 if scenario.tier == "conservative" else 0),
- )
+ edit_meta = core_report.edit if hasattr(core_report, "edit") else {}
+ plan_digest = ""
+ try:
+ if isinstance(edit_meta, dict):
+ plan_digest = str(edit_meta.get("plan_digest", ""))
+ except Exception:
+ plan_digest = ""
+ report["edit"].update(
+ {
+ "name": scenario.edit,
+ "plan_digest": plan_digest,
+ "deltas": (
+ edit_meta.get("deltas", report["edit"]["deltas"])
+ if isinstance(edit_meta, dict)
+ else report["edit"]["deltas"]
+ ),
  }
- report["metrics"]["invariants"] = {"violations": 0}
+ )

- # Mock guard reports for guarded runs
- report["guards"] = [
- {
- "name": "invariants",
- "policy": {"mode": "enforce"},
- "metrics": {"checks": 5, "violations": 0},
- "actions": ["validated"],
- "violations": [],
- },
- {
- "name": "spectral",
- "policy": {
- "sigma_quantile": tier_factor,
- "scope": "ffn",
- "deadband": 0.10,
- },
- "metrics": {
- "max_sigma": 1.2,
- "corrections": 1 if scenario.tier == "conservative" else 0,
- },
- "actions": ["monitored"],
- "violations": [],
- },
- {
- "name": "rmt",
- "policy": {
- "deadband": 0.05 if scenario.tier == "conservative" else 0.10,
- "margin": 1.5,
- },
- "metrics": {
- "outliers": report["metrics"]["rmt"]["outliers"],
- "mp_fit": 0.95,
- },
- "actions": ["validated"],
- "violations": [],
- },
- ]
+ # Transfer metrics
+ if hasattr(core_report, "metrics") and isinstance(core_report.metrics, dict):
+ report["metrics"].update(core_report.metrics)
+
+ if hasattr(core_report, "evaluation_windows") and isinstance(
+ core_report.evaluation_windows, dict
+ ):
+ report["evaluation_windows"] = core_report.evaluation_windows
+
+ # Transfer guards
+ if hasattr(core_report, "guards") and isinstance(core_report.guards, dict):
+ for name, guard_result in core_report.guards.items():
+ if not isinstance(guard_result, dict):
+ continue
+ report["guards"].append(
+ {
+ "name": name,
+ "passed": guard_result.get("passed"),
+ "action": guard_result.get("action"),
+ "policy": guard_result.get("policy", {}),
+ "metrics": guard_result.get("metrics", {}),
+ "actions": guard_result.get("actions", []),
+ "violations": guard_result.get("violations", []),
+ "warnings": guard_result.get("warnings", []),
+ "errors": guard_result.get("errors", []),
+ "details": guard_result.get("details", {}),
+ }
+ )

- # Mock artifacts
- report["artifacts"]["events_path"] = (
- f"mock_events_{scenario.edit}_{scenario.tier}_{scenario.probes}_{run_type}.jsonl"
+ # Compute comparable RMT outliers for both bare and guarded models
+ try:
+ detection = _rmt_detect(
+ model=model,
+ threshold=rmt_margin,
+ detect_only=True,
+ baseline_sigmas=rmt_baseline_sigmas,
+ baseline_mp_stats=rmt_baseline_mp_stats,
+ deadband=rmt_deadband,
+ )
+ report["metrics"].setdefault("rmt", {})
+ if isinstance(report["metrics"].get("rmt"), dict):
+ report["metrics"]["rmt"]["outliers"] = int(
+ detection.get("n_layers_flagged", 0) or 0
+ )
+ except Exception:
+ pass
+
+ # Flags and artifacts
+ status = getattr(core_report, "status", "")
+ rollback_reason = (
+ core_report.meta.get("rollback_reason")
+ if hasattr(core_report, "meta") and isinstance(core_report.meta, dict)
+ else None
  )
- report["artifacts"]["logs_path"] = (
- f"mock_logs_{scenario.edit}_{scenario.tier}_{scenario.probes}_{run_type}.txt"
+ report["flags"].update(
+ {
+ "guard_recovered": bool(
+ (
+ hasattr(core_report, "meta")
+ and core_report.meta.get("guard_recovered")
+ )
+ or str(status).lower() == "rollback"
+ ),
+ "rollback_reason": rollback_reason,
+ }
  )
+ report["artifacts"].update(
+ {
+ "events_path": str(event_path),
+ "logs_path": "",
+ "checkpoint_path": None,
+ "report_path": str(run_dir / "report.json"),
+ }
+ )
+ _write_json(Path(report["artifacts"]["report_path"]), report)

- return RunResult(run_type=run_type, report=report, success=True)
+ success = str(status).lower() != "failed"
+ return RunResult(run_type=run_type, report=report, success=success)

  except Exception as e:
  logger.error(f"Run failed for {scenario.edit} ({run_type}): {e}")
@@ -671,21 +929,70 @@ def execute_scenario(
  config_manager = ConfigurationManager()
  metrics_aggregator = MetricsAggregator()

+ # Scenario-scoped artifact directory
+ scenario_slug = f"{scenario.edit}__{scenario.tier}__p{scenario.probes}"
+ scenario_dir = output_dir / "scenarios" / scenario_slug
+ scenario_dir.mkdir(parents=True, exist_ok=True)
+
+ runtime: dict[str, Any] = {"dataset_name": config.dataset}
+
  # Run bare configuration
  logger.debug(f"Running bare configuration for {scenario.edit}")
  bare_config = config_manager.create_bare_config(scenario)
- bare_result = execute_single_run(bare_config, scenario, "bare", output_dir)
+ try:
+ bare_config.setdefault("dataset", {})["provider"] = config.dataset
+ except Exception:
+ pass
+ bare_result = execute_single_run(
+ bare_config, scenario, "bare", scenario_dir, runtime=runtime
+ )

  # Run guarded configuration
  logger.debug(f"Running guarded configuration for {scenario.edit}")
  guarded_config = config_manager.create_guarded_config(scenario)
- guarded_result = execute_single_run(guarded_config, scenario, "guarded", output_dir)
-
- # Compute comparison metrics
- comparison_metrics = metrics_aggregator.compute_comparison_metrics(
- bare_result, guarded_result
+ try:
+ guarded_config.setdefault("dataset", {})["provider"] = config.dataset
+ except Exception:
+ pass
+ guarded_result = execute_single_run(
+ guarded_config, scenario, "guarded", scenario_dir, runtime=runtime
  )

+ artifacts: dict[str, Any] = {"scenario_dir": str(scenario_dir)}
+ pairing_schedule = runtime.get("pairing_schedule")
+ if isinstance(pairing_schedule, dict):
+ pairing_path = scenario_dir / "pairing_schedule.json"
+ pairing_path.write_text(
+ json.dumps(pairing_schedule, indent=2), encoding="utf-8"
+ )
+ artifacts["pairing_schedule"] = str(pairing_path)
+ try:
+ if bare_result and bare_result.report:
+ artifacts["bare_report"] = bare_result.report.get("artifacts", {}).get(
+ "report_path"
+ )
+ except Exception:
+ pass
+ try:
+ if guarded_result and guarded_result.report:
+ artifacts["guarded_report"] = guarded_result.report.get(
+ "artifacts", {}
+ ).get("report_path")
+ except Exception:
+ pass
+
+ # Generate certificate artifact when both runs produced reports
+ try:
+ if bare_result.success and guarded_result.success:
+ from invarlock.reporting.certificate import make_certificate
+
+ cert = make_certificate(guarded_result.report, bare_result.report)
+ cert_path = scenario_dir / "certificate.json"
+ cert_path.write_text(json.dumps(cert, indent=2), encoding="utf-8")
+ artifacts["certificate"] = str(cert_path)
+ except Exception as exc:
+ logger.warning(f"Certificate generation failed for {scenario_slug}: {exc}")
+
  # Resolve epsilon from runtime or use config
  epsilon_used = config.epsilon
  if epsilon_used is None and guarded_result.success:
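Assuming the paths assembled in the hunk above (scenario_dir, the per-run run_dir, and the pairing/certificate writes), each scenario would lay its artifacts out roughly like this; the layout is inferred from the diff, not documented by the package:

    <output_dir>/scenarios/<edit>__<tier>__p<probes>/
        pairing_schedule.json      # written when a pairing schedule was resolved
        certificate.json           # only when both paired runs succeed
        bare/
            events.jsonl
            report.json
        guarded/
            events.jsonl
            report.json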
@@ -693,8 +1000,24 @@ def execute_scenario(
  elif epsilon_used is None:
  epsilon_used = 0.10 # Default fallback

- # Validate gates
- gates = ValidationGates.validate_all_gates(comparison_metrics, config, epsilon_used)
+ # Compute comparison metrics and validate gates.
+ comparison_metrics = metrics_aggregator.compute_comparison_metrics(
+ bare_result, guarded_result
+ )
+ if not (bare_result.success and guarded_result.success):
+ # Treat execution failures as a hard FAIL: benchmarks are only meaningful
+ # when both paired runs complete.
+ comparison_metrics = {
+ "error_bare": bare_result.error_message,
+ "error_guarded": guarded_result.error_message,
+ }
+ gates = dict.fromkeys(
+ ("spike", "tying", "rmt", "quality", "time", "mem"), False
+ )
+ else:
+ gates = ValidationGates.validate_all_gates(
+ comparison_metrics, config, epsilon_used
+ )

  # Mock probes_used based on scenario.probes (in real implementation, this would come from auto-tuner)
  probes_used = min(
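With the reordered hunk above, gate validation is fail-closed: if either paired run did not complete, the comparison dict is replaced by the two error messages and every gate is reported False instead of calling ValidationGates. A toy illustration of that branch (Result is a stand-in dataclass, not the package's RunResult):

    from dataclasses import dataclass

    @dataclass
    class Result:                      # stand-in for illustration only
        success: bool
        error_message: str | None = None

    def gates_for(bare: Result, guarded: Result) -> dict[str, bool]:
        if not (bare.success and guarded.success):
            # fail closed: the benchmark only means something when both runs complete
            return dict.fromkeys(("spike", "tying", "rmt", "quality", "time", "mem"), False)
        return {}  # the real code calls ValidationGates.validate_all_gates(...) here

    print(gates_for(Result(True), Result(False, "guarded run crashed")))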
@@ -705,6 +1028,7 @@ def execute_scenario(
  config=scenario,
  bare_result=bare_result,
  guarded_result=guarded_result,
+ artifacts=artifacts,
  metrics=comparison_metrics,
  gates=gates,
  probes_used=probes_used,
@@ -843,6 +1167,7 @@ def _summary_to_step14_json(summary: BenchmarkSummary) -> dict[str, Any]:
  "probes_used": result.probes_used,
  "skip": result.skipped,
  "skip_reason": result.skip_reason,
+ "artifacts": result.artifacts,
  }

  if not result.skipped and result.metrics:
@@ -1033,6 +1358,7 @@ def _scenario_result_to_dict(result: ScenarioResult) -> dict[str, Any]:
  "probes_used": result.probes_used,
  "skipped": result.skipped,
  "skip_reason": result.skip_reason,
+ "artifacts": result.artifacts,
  "metrics": result.metrics,
  "gates": result.gates,
  "epsilon_used": result.epsilon_used,