invarlock 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/profiles/ci_cpu.yaml +5 -0
  3. invarlock/_data/runtime/tiers.yaml +61 -0
  4. invarlock/adapters/hf_loading.py +97 -0
  5. invarlock/calibration/__init__.py +6 -0
  6. invarlock/calibration/spectral_null.py +301 -0
  7. invarlock/calibration/variance_ve.py +154 -0
  8. invarlock/cli/app.py +15 -0
  9. invarlock/cli/commands/calibrate.py +576 -0
  10. invarlock/cli/commands/doctor.py +16 -4
  11. invarlock/cli/commands/explain_gates.py +53 -9
  12. invarlock/cli/commands/plugins.py +12 -2
  13. invarlock/cli/commands/run.py +323 -81
  14. invarlock/cli/commands/verify.py +40 -0
  15. invarlock/cli/determinism.py +237 -0
  16. invarlock/core/auto_tuning.py +215 -17
  17. invarlock/core/registry.py +9 -4
  18. invarlock/eval/bench.py +467 -141
  19. invarlock/eval/bench_regression.py +12 -0
  20. invarlock/eval/data.py +29 -7
  21. invarlock/guards/spectral.py +216 -9
  22. invarlock/guards/variance.py +6 -3
  23. invarlock/reporting/certificate.py +403 -51
  24. invarlock/reporting/certificate_schema.py +4 -1
  25. invarlock/reporting/guards_analysis.py +108 -10
  26. invarlock/reporting/normalizer.py +21 -1
  27. invarlock/reporting/policy_utils.py +100 -16
  28. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/METADATA +12 -10
  29. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/RECORD +33 -26
  30. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/WHEEL +0 -0
  31. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/entry_points.txt +0 -0
  32. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/licenses/LICENSE +0 -0
  33. {invarlock-0.3.0.dist-info → invarlock-0.3.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,12 @@
1
"""Identifiers for the policy-change regression baseline of the benchmark golden outputs."""

from __future__ import annotations

# Policy-change regression baseline identifiers.
#
# When the benchmark golden outputs are intentionally updated, bump
# `BENCH_GOLDEN_ID` and update `BENCH_GOLDEN_SHA256` accordingly, then add a
# matching entry to `CHANGELOG.md`.

# Label of the current golden baseline (the value embeds a date).
BENCH_GOLDEN_ID = "bench-golden-2025-12-13"
# Hex digest paired with BENCH_GOLDEN_ID.
# NOTE(review): presumably the SHA-256 of the golden outputs — confirm exactly
# which bytes are hashed against the code that consumes this constant.
BENCH_GOLDEN_SHA256 = "0d9ff3274d29dad16ad580b4a0cf37b4f89e4f7c2e4345ce3d30a39f146ff5a7"

# Only the two identifiers are part of this module's public surface.
__all__ = ["BENCH_GOLDEN_ID", "BENCH_GOLDEN_SHA256"]
invarlock/eval/data.py CHANGED
@@ -855,6 +855,13 @@ class WikiText2Provider:
855
855
  eval_device_override = os.environ.get("INVARLOCK_EVAL_DEVICE")
856
856
  device_hint = getattr(self, "_device_hint", None)
857
857
 
858
+ def _is_device_usable(device: torch.device) -> bool:
859
+ try:
860
+ _ = torch.zeros((1, 1), dtype=torch.long, device=device)
861
+ return True
862
+ except Exception:
863
+ return False
864
+
858
865
  if self._difficulty_model is None:
859
866
  from transformers import GPT2LMHeadModel
860
867
 
@@ -874,6 +881,13 @@ class WikiText2Provider:
874
881
  else:
875
882
  device = self._pick_default_scorer_device()
876
883
 
884
+ if device.type != "cpu" and not _is_device_usable(device):
885
+ warnings.warn(
886
+ f"Difficulty scorer device {device} unavailable; falling back to CPU",
887
+ stacklevel=2,
888
+ )
889
+ device = torch.device("cpu")
890
+
877
891
  model.to(device)
878
892
  self._difficulty_model = model
879
893
  self._difficulty_device = device
@@ -898,16 +912,24 @@ class WikiText2Provider:
898
912
  desired_device = device
899
913
 
900
914
  if desired_device != device:
901
- try:
902
- model.to(desired_device)
903
- device = desired_device
904
- self._difficulty_device = desired_device
905
- self.__class__._MODEL_DEVICE = desired_device
906
- except Exception as exc:
915
+ if desired_device.type != "cpu" and not _is_device_usable(
916
+ desired_device
917
+ ):
907
918
  warnings.warn(
908
- f"Failed to move GPT-2 difficulty scorer to {desired_device}: {exc}",
919
+ f"Difficulty scorer device {desired_device} unavailable; keeping {device}",
909
920
  stacklevel=2,
910
921
  )
922
+ else:
923
+ try:
924
+ model.to(desired_device)
925
+ device = desired_device
926
+ self._difficulty_device = desired_device
927
+ self.__class__._MODEL_DEVICE = desired_device
928
+ except Exception as exc:
929
+ warnings.warn(
930
+ f"Failed to move GPT-2 difficulty scorer to {desired_device}: {exc}",
931
+ stacklevel=2,
932
+ )
911
933
 
912
934
  if not self._scorer_warmed:
913
935
  with torch.no_grad():
@@ -26,6 +26,80 @@ from invarlock.core.api import Guard
26
26
  from ._contracts import guard_assert
27
27
 
28
28
 
29
+ def _z_to_two_sided_pvalue(z: Any) -> float:
30
+ try:
31
+ zf = float(z)
32
+ if not math.isfinite(zf):
33
+ return 1.0
34
+ return float(math.erfc(abs(zf) / math.sqrt(2.0)))
35
+ except Exception:
36
+ return 1.0
37
+
38
+
39
+ def _finite01(value: Any) -> bool:
40
+ try:
41
+ f = float(value)
42
+ return math.isfinite(f) and 0.0 <= f <= 1.0
43
+ except Exception:
44
+ return False
45
+
46
+
47
+ def _bh_reject_families(
48
+ family_pvals: dict[str, float], *, alpha: float, m: int
49
+ ) -> set[str]:
50
+ """BH family selection with denominator `m` (conservative if m >= #families)."""
51
+ if not family_pvals:
52
+ return set()
53
+ try:
54
+ alpha_f = float(alpha)
55
+ except Exception:
56
+ alpha_f = 0.05
57
+ if not (0.0 < alpha_f <= 1.0):
58
+ return set()
59
+
60
+ names = list(family_pvals.keys())
61
+ pvals = [family_pvals[n] for n in names]
62
+ n = len(pvals)
63
+ m_eff = max(int(m) if isinstance(m, int) else 0, n, 1)
64
+
65
+ order = sorted(
66
+ range(n),
67
+ key=lambda idx: (float("inf") if not _finite01(pvals[idx]) else pvals[idx]),
68
+ )
69
+ max_k = 0
70
+ for rank, idx in enumerate(order, start=1):
71
+ p = pvals[idx]
72
+ if not _finite01(p):
73
+ continue
74
+ if p <= (alpha_f * rank) / m_eff:
75
+ max_k = rank
76
+ if max_k <= 0:
77
+ return set()
78
+ cutoff = (alpha_f * max_k) / m_eff
79
+ selected: set[str] = set()
80
+ for idx in order:
81
+ p = pvals[idx]
82
+ if _finite01(p) and p <= cutoff:
83
+ selected.add(names[idx])
84
+ return selected
85
+
86
+
87
+ def _bonferroni_reject_families(
88
+ family_pvals: dict[str, float], *, alpha: float, m: int
89
+ ) -> set[str]:
90
+ if not family_pvals:
91
+ return set()
92
+ try:
93
+ alpha_f = float(alpha)
94
+ except Exception:
95
+ alpha_f = 0.05
96
+ if not (0.0 < alpha_f <= 1.0):
97
+ return set()
98
+ m_eff = max(int(m) if isinstance(m, int) else 0, len(family_pvals), 1)
99
+ cutoff = alpha_f / m_eff
100
+ return {fam for fam, p in family_pvals.items() if _finite01(p) and p <= cutoff}
101
+
102
+
29
103
  class SpectralPolicy(TypedDict, total=False):
30
104
  """Type definition for spectral guard policy configuration."""
31
105
 
@@ -567,6 +641,121 @@ class SpectralGuard(Guard):
567
641
 
568
642
  return family_quantiles, top_z_scores
569
643
 
644
def _select_budgeted_violations(
    self, budgeted_violations: list[dict[str, Any]]
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Apply BH/Bonferroni selection at the family level.

    Each violation's two-sided p-value is derived from its ``z_score``. A
    family's p-value is the smallest p-value among its violations, and a
    violation is kept when its family is rejected by the configured
    multiple-testing procedure. Violations whose z-score is missing,
    unparsable or non-finite are always kept (fail closed).

    The procedure is read from ``self.multiple_testing`` (keys ``method``,
    ``alpha``, ``m``). ``method`` defaults to ``"bh"``; any method that is not
    a recognised BH spelling uses Bonferroni. ``alpha`` defaults to 0.05 when
    missing, falsy, unparsable, or outside ``(0, 1]``.

    Side effects:
        * Every dict in ``budgeted_violations`` is annotated in place with
          ``family`` (when missing or empty), ``p_value`` and ``selected``.
        * When ``self.multiple_testing`` is a dict, its ``"m"`` key is filled
          in via ``setdefault`` with the effective denominator.

    Args:
        budgeted_violations: Candidate (non-fatal) violation records.

    Returns:
        (selected_violations, selection_metrics)
    """
    mt = self.multiple_testing if isinstance(self.multiple_testing, dict) else {}
    method = str(mt.get("method", "bh")).lower()

    try:
        alpha = float(mt.get("alpha", 0.05) or 0.05)
    except Exception:
        alpha = 0.05
    if not (0.0 < alpha <= 1.0):
        # Both rejection helpers return an empty set for an out-of-range or
        # NaN alpha, which would silently drop every budgeted violation that
        # has a finite z-score (fail open). Treat it like an unparsable value
        # and use the default level instead.
        alpha = 0.05

    m_raw = mt.get("m")
    m: int | None = None
    if m_raw is not None:
        try:
            m = int(m_raw)
        except Exception:
            m = None

    # Fill in missing family assignments deterministically.
    for violation in budgeted_violations:
        if violation.get("family"):
            continue
        module = violation.get("module")
        if isinstance(module, str):
            mapped = self.module_family_map.get(module)
            if isinstance(mapped, str) and mapped:
                violation["family"] = mapped
                continue
        violation["family"] = "other"

    # Family p-values derived from the most significant (min p) module in each family.
    family_pvals: dict[str, float] = {}
    family_max_abs_z: dict[str, float] = {}
    family_counts: dict[str, int] = {}
    for violation in budgeted_violations:
        fam = violation.get("family")
        if fam is None:
            continue
        family = str(fam)
        try:
            zf = float(violation.get("z_score"))
        except Exception:
            continue
        if not math.isfinite(zf):
            continue
        p = _z_to_two_sided_pvalue(zf)
        family_counts[family] = family_counts.get(family, 0) + 1
        best_so_far = family_pvals.get(family)
        if best_so_far is None or p < best_so_far:
            family_pvals[family] = p
            family_max_abs_z[family] = abs(zf)

    families_tested = sorted(family_pvals)
    # The denominator is never smaller than the number of families tested.
    m_eff = m if isinstance(m, int) and m > 0 else len(families_tested)
    m_eff = max(m_eff, len(families_tested), 1)
    if isinstance(self.multiple_testing, dict):
        self.multiple_testing.setdefault("m", m_eff)

    if method in {"bh", "benjamini-hochberg", "benjamini_hochberg"}:
        applied_method = "bh"
        selected_families = _bh_reject_families(family_pvals, alpha=alpha, m=m_eff)
    else:
        # "bonferroni" / "bonf" and any unrecognised method name all use
        # Bonferroni.
        applied_method = "bonferroni"
        selected_families = _bonferroni_reject_families(
            family_pvals, alpha=alpha, m=m_eff
        )

    selected: list[dict[str, Any]] = []
    default_selected_without_pvalue = 0
    for violation in budgeted_violations:
        fam = violation.get("family")
        family = str(fam) if fam is not None else ""
        try:
            zf = float(violation.get("z_score"))
        except Exception:
            zf = None

        p_val: float | None = None
        if zf is not None and math.isfinite(zf):
            p_val = _z_to_two_sided_pvalue(zf)
            is_selected = family in selected_families
        else:
            # If we cannot compute a p-value, fail closed: keep the violation.
            is_selected = True
            default_selected_without_pvalue += 1

        violation["p_value"] = p_val
        violation["selected"] = is_selected
        if is_selected:
            selected.append(violation)

    selection_metrics = {
        "method": applied_method,
        "alpha": alpha,
        "m": int(m_eff),
        "families_tested": families_tested,
        "families_selected": sorted(selected_families),
        "family_pvalues": {k: float(family_pvals[k]) for k in families_tested},
        "family_max_abs_z": {
            k: float(family_max_abs_z[k]) for k in families_tested
        },
        "family_violation_counts": dict(family_counts),
        "default_selected_without_pvalue": int(default_selected_without_pvalue),
    }
    return selected, selection_metrics
758
+
570
759
  def validate(
571
760
  self, model: Any, adapter: Any, context: dict[str, Any]
572
761
  ) -> dict[str, Any]:
@@ -607,7 +796,13 @@ class SpectralGuard(Guard):
607
796
  if violation.get("type") in fatal_violation_types
608
797
  ]
609
798
 
610
- caps_applied = len(budgeted_violations)
799
+ selected_budgeted, mt_selection = self._select_budgeted_violations(
800
+ budgeted_violations
801
+ )
802
+ selected_violations = [*fatal_violations, *selected_budgeted]
803
+ candidate_budgeted = len(budgeted_violations)
804
+
805
+ caps_applied = len(selected_budgeted)
611
806
  caps_exceeded = caps_applied > int(self.max_caps)
612
807
  passed = not fatal_violations and not caps_exceeded
613
808
  if fatal_violations or caps_exceeded:
@@ -623,8 +818,9 @@ class SpectralGuard(Guard):
623
818
  )
624
819
  metrics = {
625
820
  "modules_checked": len(current_metrics),
626
- "violations_found": len(violations),
821
+ "violations_found": len(selected_violations),
627
822
  "budgeted_violations": caps_applied,
823
+ "candidate_budgeted_violations": candidate_budgeted,
628
824
  "fatal_violations": len(fatal_violations),
629
825
  "max_spectral_norm": max(current_metrics.values())
630
826
  if current_metrics
@@ -642,6 +838,7 @@ class SpectralGuard(Guard):
642
838
  "caps_applied": caps_applied,
643
839
  "caps_exceeded": caps_exceeded,
644
840
  "multiple_testing": self.multiple_testing,
841
+ "multiple_testing_selection": mt_selection,
645
842
  }
646
843
 
647
844
  family_quantiles, top_z_scores = self._compute_family_observability()
@@ -653,7 +850,7 @@ class SpectralGuard(Guard):
653
850
  if passed:
654
851
  message = (
655
852
  "Spectral validation passed with "
656
- f"{len(violations)} violations "
853
+ f"{len(selected_violations)} violations "
657
854
  f"(caps_applied={caps_applied}, max_caps={self.max_caps})"
658
855
  )
659
856
  else:
@@ -683,7 +880,7 @@ class SpectralGuard(Guard):
683
880
  "passed": passed,
684
881
  "action": action,
685
882
  "metrics": metrics,
686
- "violations": violations,
883
+ "violations": selected_violations,
687
884
  "message": message,
688
885
  "policy": self._serialize_policy(),
689
886
  "final_z_scores": self.latest_z_scores.copy(),
@@ -743,15 +940,23 @@ class SpectralGuard(Guard):
743
940
  if violation.get("type") in fatal_violation_types
744
941
  ]
745
942
 
746
- caps_applied = len(budgeted_violations)
943
+ selected_budgeted, mt_selection = self._select_budgeted_violations(
944
+ budgeted_violations
945
+ )
946
+ selected_final_violations = [*fatal_violations, *selected_budgeted]
947
+ candidate_budgeted = len(budgeted_violations)
948
+
949
+ caps_applied = len(selected_budgeted)
747
950
  caps_exceeded = caps_applied > int(self.max_caps)
748
951
  passed = not fatal_violations and not caps_exceeded
749
952
 
750
953
  # Compute comprehensive metrics
751
954
  metrics = {
752
955
  "modules_analyzed": len(final_metrics),
753
- "violations_detected": len(final_violations),
956
+ "violations_detected": len(selected_final_violations),
754
957
  "budgeted_violations": caps_applied,
958
+ "candidate_violations_detected": len(final_violations),
959
+ "candidate_budgeted_violations": candidate_budgeted,
755
960
  "fatal_violations": len(fatal_violations),
756
961
  "baseline_modules": len(self.baseline_metrics),
757
962
  "scope": self.scope,
@@ -764,7 +969,8 @@ class SpectralGuard(Guard):
764
969
  "spectral_stability_score": 1.0
765
970
  - min(len(final_violations) / max(len(final_metrics), 1), 1.0),
766
971
  "target_sigma": self.target_sigma,
767
- "correction_applied": len(final_violations) > 0 and self.correction_enabled,
972
+ "correction_applied": len(selected_final_violations) > 0
973
+ and self.correction_enabled,
768
974
  "family_caps": self.family_caps,
769
975
  "family_z_summary": final_z_summary,
770
976
  "family_stats": final_family_stats,
@@ -774,6 +980,7 @@ class SpectralGuard(Guard):
774
980
  "caps_applied": caps_applied,
775
981
  "caps_exceeded": caps_exceeded,
776
982
  "multiple_testing": self.multiple_testing,
983
+ "multiple_testing_selection": mt_selection,
777
984
  "family_z_quantiles": family_quantiles,
778
985
  "top_z_scores": top_z_scores,
779
986
  }
@@ -782,7 +989,7 @@ class SpectralGuard(Guard):
782
989
  warnings = []
783
990
  errors = []
784
991
 
785
- for violation in final_violations:
992
+ for violation in selected_final_violations:
786
993
  if violation["type"] in ["max_spectral_norm", "ill_conditioned"]:
787
994
  errors.append(violation["message"])
788
995
  else:
@@ -793,7 +1000,7 @@ class SpectralGuard(Guard):
793
1000
  "metrics": metrics,
794
1001
  "warnings": warnings,
795
1002
  "errors": errors,
796
- "violations": final_violations,
1003
+ "violations": selected_final_violations,
797
1004
  "events": self.events,
798
1005
  "baseline_metrics": self.baseline_metrics,
799
1006
  "final_metrics": final_metrics,
@@ -403,15 +403,18 @@ def _predictive_gate_outcome(
403
403
  ):
404
404
  return False, "ci_unavailable"
405
405
 
406
- lower, upper = float(delta_ci[0]), float(delta_ci[1])
406
+ upper = float(delta_ci[1])
407
407
  min_effect = float(min_effect or 0.0)
408
408
 
409
409
  if one_sided:
410
- if lower >= 0.0:
410
+ # One-sided improvement (ΔlogNLL < 0): certify a minimum effect by
411
+ # requiring the *upper* bound of the (two-sided) CI to clear -min_effect.
412
+ if upper >= 0.0:
411
413
  return False, "ci_contains_zero"
412
414
  if mean_delta >= 0.0:
413
415
  return False, "mean_not_negative"
414
- if min_effect > 0.0 and (-mean_delta) < min_effect:
416
+ gain_lower_bound = -upper # worst-case gain under CI
417
+ if gain_lower_bound < min_effect:
415
418
  return False, "gain_below_threshold"
416
419
  return True, "ci_gain_met"
417
420