invarlock 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. invarlock/__init__.py +1 -1
  2. invarlock/_data/runtime/tiers.yaml +61 -0
  3. invarlock/adapters/hf_loading.py +97 -0
  4. invarlock/calibration/__init__.py +6 -0
  5. invarlock/calibration/spectral_null.py +301 -0
  6. invarlock/calibration/variance_ve.py +154 -0
  7. invarlock/cli/app.py +15 -0
  8. invarlock/cli/commands/calibrate.py +576 -0
  9. invarlock/cli/commands/doctor.py +9 -3
  10. invarlock/cli/commands/explain_gates.py +53 -9
  11. invarlock/cli/commands/plugins.py +12 -2
  12. invarlock/cli/commands/run.py +181 -79
  13. invarlock/cli/commands/verify.py +40 -0
  14. invarlock/cli/config.py +11 -1
  15. invarlock/cli/determinism.py +252 -0
  16. invarlock/core/auto_tuning.py +215 -17
  17. invarlock/core/bootstrap.py +137 -5
  18. invarlock/core/registry.py +9 -4
  19. invarlock/core/runner.py +305 -35
  20. invarlock/eval/bench.py +467 -141
  21. invarlock/eval/bench_regression.py +12 -0
  22. invarlock/eval/bootstrap.py +3 -1
  23. invarlock/eval/data.py +29 -7
  24. invarlock/eval/primary_metric.py +20 -5
  25. invarlock/guards/rmt.py +536 -46
  26. invarlock/guards/spectral.py +217 -10
  27. invarlock/guards/variance.py +124 -42
  28. invarlock/reporting/certificate.py +476 -45
  29. invarlock/reporting/certificate_schema.py +4 -1
  30. invarlock/reporting/guards_analysis.py +108 -10
  31. invarlock/reporting/normalizer.py +24 -1
  32. invarlock/reporting/policy_utils.py +97 -15
  33. invarlock/reporting/primary_metric_utils.py +17 -0
  34. invarlock/reporting/validate.py +10 -10
  35. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
  36. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
  37. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
  38. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
  39. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
  40. {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
invarlock/eval/bench_regression.py ADDED
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ # Policy-change regression baseline identifiers.
4
+ #
5
+ # When the benchmark golden outputs are intentionally updated, bump
6
+ # `BENCH_GOLDEN_ID` and update `BENCH_GOLDEN_SHA256` accordingly, then add a
7
+ # matching entry to `CHANGELOG.md`.
8
+
9
+ BENCH_GOLDEN_ID = "bench-golden-2025-12-13"
10
+ BENCH_GOLDEN_SHA256 = "0d9ff3274d29dad16ad580b4a0cf37b4f89e4f7c2e4345ce3d30a39f146ff5a7"
11
+
12
+ __all__ = ["BENCH_GOLDEN_ID", "BENCH_GOLDEN_SHA256"]
invarlock/eval/bootstrap.py CHANGED
@@ -16,6 +16,7 @@ from invarlock.core.exceptions import ValidationError
16
16
  def paired_delta_mean_ci(
17
17
  subject: Iterable[float],
18
18
  baseline: Iterable[float],
19
+ weights: Iterable[float] | None = None,
19
20
  *,
20
21
  reps: int = 2000,
21
22
  seed: int = 0,
@@ -27,7 +28,7 @@ def paired_delta_mean_ci(
27
28
 
28
29
  Notes:
29
30
  - When `method == 'bca'`, this dispatches to the core BCa implementation.
30
- - `weights` are currently not supported; pass pre-aggregated per-example values.
31
+ - Optional `weights` apply token-weighted resampling when provided.
31
32
  """
32
33
  alpha = 1.0 - float(ci_level)
33
34
  if method not in {"bca", "percentile"}:
@@ -43,6 +44,7 @@ def paired_delta_mean_ci(
43
44
  return _paired_delta_bca(
44
45
  list(subject),
45
46
  list(baseline),
47
+ weights=list(weights) if weights is not None else None,
46
48
  method="bca" if method == "bca" else "percentile",
47
49
  replicates=int(reps),
48
50
  alpha=alpha,
invarlock/eval/data.py CHANGED
@@ -855,6 +855,13 @@ class WikiText2Provider:
855
855
  eval_device_override = os.environ.get("INVARLOCK_EVAL_DEVICE")
856
856
  device_hint = getattr(self, "_device_hint", None)
857
857
 
858
+ def _is_device_usable(device: torch.device) -> bool:
859
+ try:
860
+ _ = torch.zeros((1, 1), dtype=torch.long, device=device)
861
+ return True
862
+ except Exception:
863
+ return False
864
+
858
865
  if self._difficulty_model is None:
859
866
  from transformers import GPT2LMHeadModel
860
867
 
@@ -874,6 +881,13 @@ class WikiText2Provider:
874
881
  else:
875
882
  device = self._pick_default_scorer_device()
876
883
 
884
+ if device.type != "cpu" and not _is_device_usable(device):
885
+ warnings.warn(
886
+ f"Difficulty scorer device {device} unavailable; falling back to CPU",
887
+ stacklevel=2,
888
+ )
889
+ device = torch.device("cpu")
890
+
877
891
  model.to(device)
878
892
  self._difficulty_model = model
879
893
  self._difficulty_device = device
@@ -898,16 +912,24 @@ class WikiText2Provider:
898
912
  desired_device = device
899
913
 
900
914
  if desired_device != device:
901
- try:
902
- model.to(desired_device)
903
- device = desired_device
904
- self._difficulty_device = desired_device
905
- self.__class__._MODEL_DEVICE = desired_device
906
- except Exception as exc:
915
+ if desired_device.type != "cpu" and not _is_device_usable(
916
+ desired_device
917
+ ):
907
918
  warnings.warn(
908
- f"Failed to move GPT-2 difficulty scorer to {desired_device}: {exc}",
919
+ f"Difficulty scorer device {desired_device} unavailable; keeping {device}",
909
920
  stacklevel=2,
910
921
  )
922
+ else:
923
+ try:
924
+ model.to(desired_device)
925
+ device = desired_device
926
+ self._difficulty_device = desired_device
927
+ self.__class__._MODEL_DEVICE = desired_device
928
+ except Exception as exc:
929
+ warnings.warn(
930
+ f"Failed to move GPT-2 difficulty scorer to {desired_device}: {exc}",
931
+ stacklevel=2,
932
+ )
911
933
 
912
934
  if not self._scorer_warmed:
913
935
  with torch.no_grad():
invarlock/eval/primary_metric.py CHANGED
@@ -214,9 +214,15 @@ class _PPLCausal(PrimaryMetric):
214
214
  ) -> dict[str, Any]:
215
215
  subj = self._coerce_contrib_array(subject)
216
216
  base = self._coerce_contrib_array(baseline)
217
- # Compute simple (unweighted) per-example arrays in log space; weights ignored for bootstrap here
217
+ # Compute per-example arrays in log space; use weights for paired bootstrap
218
218
  subj_vals = [v for (v, _w) in subj]
219
219
  base_vals = [v for (v, _w) in base]
220
+ pair_weights = []
221
+ for (_sv, sw), (_bv, bw) in zip(subj, base, strict=False):
222
+ weight = bw if math.isfinite(bw) and bw > 0 else sw
223
+ if not math.isfinite(weight) or weight <= 0:
224
+ weight = 1.0
225
+ pair_weights.append(float(weight))
220
226
 
221
227
  # Points in display space
222
228
  def _point(
@@ -249,15 +255,24 @@ class _PPLCausal(PrimaryMetric):
249
255
  dlog_lo, dlog_hi = compute_paired_delta_log_ci(
250
256
  subj_vals,
251
257
  base_vals,
258
+ weights=pair_weights,
252
259
  method="bca",
253
260
  replicates=reps_eff,
254
261
  alpha=alpha,
255
262
  seed=seed_eff,
256
263
  )
257
- delta_log = float(
258
- sum((s - b) for s, b in zip(subj_vals, base_vals, strict=False))
259
- / max(1, min(len(subj_vals), len(base_vals)))
260
- )
264
+ if pair_weights and len(pair_weights) >= min(len(subj_vals), len(base_vals)):
265
+ sw = 0.0
266
+ swx = 0.0
267
+ for s, b, w in zip(subj_vals, base_vals, pair_weights, strict=False):
268
+ sw += w
269
+ swx += w * (s - b)
270
+ delta_log = float(swx / sw) if sw > 0 else float("nan")
271
+ else:
272
+ delta_log = float(
273
+ sum((s - b) for s, b in zip(subj_vals, base_vals, strict=False))
274
+ / max(1, min(len(subj_vals), len(base_vals)))
275
+ )
261
276
  ratio = self.display_transform(delta_log)
262
277
  return {
263
278
  "kind": self.kind,