invarlock 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/tiers.yaml +61 -0
- invarlock/adapters/hf_loading.py +97 -0
- invarlock/calibration/__init__.py +6 -0
- invarlock/calibration/spectral_null.py +301 -0
- invarlock/calibration/variance_ve.py +154 -0
- invarlock/cli/app.py +15 -0
- invarlock/cli/commands/calibrate.py +576 -0
- invarlock/cli/commands/doctor.py +9 -3
- invarlock/cli/commands/explain_gates.py +53 -9
- invarlock/cli/commands/plugins.py +12 -2
- invarlock/cli/commands/run.py +181 -79
- invarlock/cli/commands/verify.py +40 -0
- invarlock/cli/config.py +11 -1
- invarlock/cli/determinism.py +252 -0
- invarlock/core/auto_tuning.py +215 -17
- invarlock/core/bootstrap.py +137 -5
- invarlock/core/registry.py +9 -4
- invarlock/core/runner.py +305 -35
- invarlock/eval/bench.py +467 -141
- invarlock/eval/bench_regression.py +12 -0
- invarlock/eval/bootstrap.py +3 -1
- invarlock/eval/data.py +29 -7
- invarlock/eval/primary_metric.py +20 -5
- invarlock/guards/rmt.py +536 -46
- invarlock/guards/spectral.py +217 -10
- invarlock/guards/variance.py +124 -42
- invarlock/reporting/certificate.py +476 -45
- invarlock/reporting/certificate_schema.py +4 -1
- invarlock/reporting/guards_analysis.py +108 -10
- invarlock/reporting/normalizer.py +24 -1
- invarlock/reporting/policy_utils.py +97 -15
- invarlock/reporting/primary_metric_utils.py +17 -0
- invarlock/reporting/validate.py +10 -10
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
# Policy-change regression baseline identifiers.
|
|
4
|
+
#
|
|
5
|
+
# When the benchmark golden outputs are intentionally updated, bump
|
|
6
|
+
# `BENCH_GOLDEN_ID` and update `BENCH_GOLDEN_SHA256` accordingly, then add a
|
|
7
|
+
# matching entry to `CHANGELOG.md`.
|
|
8
|
+
|
|
9
|
+
BENCH_GOLDEN_ID = "bench-golden-2025-12-13"
|
|
10
|
+
BENCH_GOLDEN_SHA256 = "0d9ff3274d29dad16ad580b4a0cf37b4f89e4f7c2e4345ce3d30a39f146ff5a7"
|
|
11
|
+
|
|
12
|
+
__all__ = ["BENCH_GOLDEN_ID", "BENCH_GOLDEN_SHA256"]
|
invarlock/eval/bootstrap.py
CHANGED
|
@@ -16,6 +16,7 @@ from invarlock.core.exceptions import ValidationError
|
|
|
16
16
|
def paired_delta_mean_ci(
|
|
17
17
|
subject: Iterable[float],
|
|
18
18
|
baseline: Iterable[float],
|
|
19
|
+
weights: Iterable[float] | None = None,
|
|
19
20
|
*,
|
|
20
21
|
reps: int = 2000,
|
|
21
22
|
seed: int = 0,
|
|
@@ -27,7 +28,7 @@ def paired_delta_mean_ci(
|
|
|
27
28
|
|
|
28
29
|
Notes:
|
|
29
30
|
- When `method == 'bca'`, this dispatches to the core BCa implementation.
|
|
30
|
-
- `weights`
|
|
31
|
+
- Optional `weights` apply token-weighted resampling when provided.
|
|
31
32
|
"""
|
|
32
33
|
alpha = 1.0 - float(ci_level)
|
|
33
34
|
if method not in {"bca", "percentile"}:
|
|
@@ -43,6 +44,7 @@ def paired_delta_mean_ci(
|
|
|
43
44
|
return _paired_delta_bca(
|
|
44
45
|
list(subject),
|
|
45
46
|
list(baseline),
|
|
47
|
+
weights=list(weights) if weights is not None else None,
|
|
46
48
|
method="bca" if method == "bca" else "percentile",
|
|
47
49
|
replicates=int(reps),
|
|
48
50
|
alpha=alpha,
|
invarlock/eval/data.py
CHANGED
|
@@ -855,6 +855,13 @@ class WikiText2Provider:
|
|
|
855
855
|
eval_device_override = os.environ.get("INVARLOCK_EVAL_DEVICE")
|
|
856
856
|
device_hint = getattr(self, "_device_hint", None)
|
|
857
857
|
|
|
858
|
+
def _is_device_usable(device: torch.device) -> bool:
|
|
859
|
+
try:
|
|
860
|
+
_ = torch.zeros((1, 1), dtype=torch.long, device=device)
|
|
861
|
+
return True
|
|
862
|
+
except Exception:
|
|
863
|
+
return False
|
|
864
|
+
|
|
858
865
|
if self._difficulty_model is None:
|
|
859
866
|
from transformers import GPT2LMHeadModel
|
|
860
867
|
|
|
@@ -874,6 +881,13 @@ class WikiText2Provider:
|
|
|
874
881
|
else:
|
|
875
882
|
device = self._pick_default_scorer_device()
|
|
876
883
|
|
|
884
|
+
if device.type != "cpu" and not _is_device_usable(device):
|
|
885
|
+
warnings.warn(
|
|
886
|
+
f"Difficulty scorer device {device} unavailable; falling back to CPU",
|
|
887
|
+
stacklevel=2,
|
|
888
|
+
)
|
|
889
|
+
device = torch.device("cpu")
|
|
890
|
+
|
|
877
891
|
model.to(device)
|
|
878
892
|
self._difficulty_model = model
|
|
879
893
|
self._difficulty_device = device
|
|
@@ -898,16 +912,24 @@ class WikiText2Provider:
|
|
|
898
912
|
desired_device = device
|
|
899
913
|
|
|
900
914
|
if desired_device != device:
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
self._difficulty_device = desired_device
|
|
905
|
-
self.__class__._MODEL_DEVICE = desired_device
|
|
906
|
-
except Exception as exc:
|
|
915
|
+
if desired_device.type != "cpu" and not _is_device_usable(
|
|
916
|
+
desired_device
|
|
917
|
+
):
|
|
907
918
|
warnings.warn(
|
|
908
|
-
f"
|
|
919
|
+
f"Difficulty scorer device {desired_device} unavailable; keeping {device}",
|
|
909
920
|
stacklevel=2,
|
|
910
921
|
)
|
|
922
|
+
else:
|
|
923
|
+
try:
|
|
924
|
+
model.to(desired_device)
|
|
925
|
+
device = desired_device
|
|
926
|
+
self._difficulty_device = desired_device
|
|
927
|
+
self.__class__._MODEL_DEVICE = desired_device
|
|
928
|
+
except Exception as exc:
|
|
929
|
+
warnings.warn(
|
|
930
|
+
f"Failed to move GPT-2 difficulty scorer to {desired_device}: {exc}",
|
|
931
|
+
stacklevel=2,
|
|
932
|
+
)
|
|
911
933
|
|
|
912
934
|
if not self._scorer_warmed:
|
|
913
935
|
with torch.no_grad():
|
invarlock/eval/primary_metric.py
CHANGED
|
@@ -214,9 +214,15 @@ class _PPLCausal(PrimaryMetric):
|
|
|
214
214
|
) -> dict[str, Any]:
|
|
215
215
|
subj = self._coerce_contrib_array(subject)
|
|
216
216
|
base = self._coerce_contrib_array(baseline)
|
|
217
|
-
# Compute
|
|
217
|
+
# Compute per-example arrays in log space; use weights for paired bootstrap
|
|
218
218
|
subj_vals = [v for (v, _w) in subj]
|
|
219
219
|
base_vals = [v for (v, _w) in base]
|
|
220
|
+
pair_weights = []
|
|
221
|
+
for (_sv, sw), (_bv, bw) in zip(subj, base, strict=False):
|
|
222
|
+
weight = bw if math.isfinite(bw) and bw > 0 else sw
|
|
223
|
+
if not math.isfinite(weight) or weight <= 0:
|
|
224
|
+
weight = 1.0
|
|
225
|
+
pair_weights.append(float(weight))
|
|
220
226
|
|
|
221
227
|
# Points in display space
|
|
222
228
|
def _point(
|
|
@@ -249,15 +255,24 @@ class _PPLCausal(PrimaryMetric):
|
|
|
249
255
|
dlog_lo, dlog_hi = compute_paired_delta_log_ci(
|
|
250
256
|
subj_vals,
|
|
251
257
|
base_vals,
|
|
258
|
+
weights=pair_weights,
|
|
252
259
|
method="bca",
|
|
253
260
|
replicates=reps_eff,
|
|
254
261
|
alpha=alpha,
|
|
255
262
|
seed=seed_eff,
|
|
256
263
|
)
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
264
|
+
if pair_weights and len(pair_weights) >= min(len(subj_vals), len(base_vals)):
|
|
265
|
+
sw = 0.0
|
|
266
|
+
swx = 0.0
|
|
267
|
+
for s, b, w in zip(subj_vals, base_vals, pair_weights, strict=False):
|
|
268
|
+
sw += w
|
|
269
|
+
swx += w * (s - b)
|
|
270
|
+
delta_log = float(swx / sw) if sw > 0 else float("nan")
|
|
271
|
+
else:
|
|
272
|
+
delta_log = float(
|
|
273
|
+
sum((s - b) for s, b in zip(subj_vals, base_vals, strict=False))
|
|
274
|
+
/ max(1, min(len(subj_vals), len(base_vals)))
|
|
275
|
+
)
|
|
261
276
|
ratio = self.display_transform(delta_log)
|
|
262
277
|
return {
|
|
263
278
|
"kind": self.kind,
|