PyPI - invarlock - Versions diffs - 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl - Mend

invarlock 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

invarlock/__init__.py +1 -1
invarlock/_data/runtime/tiers.yaml +61 -0
invarlock/adapters/hf_loading.py +97 -0
invarlock/calibration/__init__.py +6 -0
invarlock/calibration/spectral_null.py +301 -0
invarlock/calibration/variance_ve.py +154 -0
invarlock/cli/app.py +15 -0
invarlock/cli/commands/calibrate.py +576 -0
invarlock/cli/commands/doctor.py +9 -3
invarlock/cli/commands/explain_gates.py +53 -9
invarlock/cli/commands/plugins.py +12 -2
invarlock/cli/commands/run.py +181 -79
invarlock/cli/commands/verify.py +40 -0
invarlock/cli/config.py +11 -1
invarlock/cli/determinism.py +252 -0
invarlock/core/auto_tuning.py +215 -17
invarlock/core/bootstrap.py +137 -5
invarlock/core/registry.py +9 -4
invarlock/core/runner.py +305 -35
invarlock/eval/bench.py +467 -141
invarlock/eval/bench_regression.py +12 -0
invarlock/eval/bootstrap.py +3 -1
invarlock/eval/data.py +29 -7
invarlock/eval/primary_metric.py +20 -5
invarlock/guards/rmt.py +536 -46
invarlock/guards/spectral.py +217 -10
invarlock/guards/variance.py +124 -42
invarlock/reporting/certificate.py +476 -45
invarlock/reporting/certificate_schema.py +4 -1
invarlock/reporting/guards_analysis.py +108 -10
invarlock/reporting/normalizer.py +24 -1
invarlock/reporting/policy_utils.py +97 -15
invarlock/reporting/primary_metric_utils.py +17 -0
invarlock/reporting/validate.py +10 -10
{invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/METADATA +12 -10
{invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/RECORD +40 -33
{invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/WHEEL +0 -0
{invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/entry_points.txt +0 -0
{invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/licenses/LICENSE +0 -0
{invarlock-0.3.1.dist-info → invarlock-0.3.3.dist-info}/top_level.txt +0 -0

invarlock/eval/bench_regression.py ADDED Viewed

@@ -0,0 +1,12 @@
+from __future__ import annotations
+# Policy-change regression baseline identifiers.
+#
+# When the benchmark golden outputs are intentionally updated, bump
+# `BENCH_GOLDEN_ID` and update `BENCH_GOLDEN_SHA256` accordingly, then add a
+# matching entry to `CHANGELOG.md`.
+BENCH_GOLDEN_ID = "bench-golden-2025-12-13"
+BENCH_GOLDEN_SHA256 = "0d9ff3274d29dad16ad580b4a0cf37b4f89e4f7c2e4345ce3d30a39f146ff5a7"
+__all__ = ["BENCH_GOLDEN_ID", "BENCH_GOLDEN_SHA256"]

invarlock/eval/bootstrap.py CHANGED Viewed

@@ -16,6 +16,7 @@ from invarlock.core.exceptions import ValidationError
 def paired_delta_mean_ci(
     subject: Iterable[float],
     baseline: Iterable[float],
+    weights: Iterable[float] | None = None,
     *,
     reps: int = 2000,
     seed: int = 0,
@@ -27,7 +28,7 @@ def paired_delta_mean_ci(
     Notes:
     - When `method == 'bca'`, this dispatches to the core BCa implementation.
-    - `weights` are currently not supported; pass pre-aggregated per-example values.
+    - Optional `weights` apply token-weighted resampling when provided.
     """
     alpha = 1.0 - float(ci_level)
     if method not in {"bca", "percentile"}:
@@ -43,6 +44,7 @@ def paired_delta_mean_ci(
     return _paired_delta_bca(
         list(subject),
         list(baseline),
+        weights=list(weights) if weights is not None else None,
         method="bca" if method == "bca" else "percentile",
         replicates=int(reps),
         alpha=alpha,

invarlock/eval/data.py CHANGED Viewed

@@ -855,6 +855,13 @@ class WikiText2Provider:
             eval_device_override = os.environ.get("INVARLOCK_EVAL_DEVICE")
             device_hint = getattr(self, "_device_hint", None)
+            def _is_device_usable(device: torch.device) -> bool:
+                try:
+                    _ = torch.zeros((1, 1), dtype=torch.long, device=device)
+                    return True
+                except Exception:
+                    return False
             if self._difficulty_model is None:
                 from transformers import GPT2LMHeadModel
@@ -874,6 +881,13 @@ class WikiText2Provider:
                 else:
                     device = self._pick_default_scorer_device()
+                if device.type != "cpu" and not _is_device_usable(device):
+                    warnings.warn(
+                        f"Difficulty scorer device {device} unavailable; falling back to CPU",
+                        stacklevel=2,
+                    )
+                    device = torch.device("cpu")
                 model.to(device)
                 self._difficulty_model = model
                 self._difficulty_device = device
@@ -898,16 +912,24 @@ class WikiText2Provider:
                     desired_device = device
             if desired_device != device:
-                try:
-                    model.to(desired_device)
-                    device = desired_device
-                    self._difficulty_device = desired_device
-                    self.__class__._MODEL_DEVICE = desired_device
-                except Exception as exc:
+                if desired_device.type != "cpu" and not _is_device_usable(
+                    desired_device
+                ):
                     warnings.warn(
-                        f"Failed to move GPT-2 difficulty scorer to {desired_device}: {exc}",
+                        f"Difficulty scorer device {desired_device} unavailable; keeping {device}",
                         stacklevel=2,
                     )
+                else:
+                    try:
+                        model.to(desired_device)
+                        device = desired_device
+                        self._difficulty_device = desired_device
+                        self.__class__._MODEL_DEVICE = desired_device
+                    except Exception as exc:
+                        warnings.warn(
+                            f"Failed to move GPT-2 difficulty scorer to {desired_device}: {exc}",
+                            stacklevel=2,
+                        )
             if not self._scorer_warmed:
                 with torch.no_grad():

invarlock/eval/primary_metric.py CHANGED Viewed

@@ -214,9 +214,15 @@ class _PPLCausal(PrimaryMetric):
     ) -> dict[str, Any]:
         subj = self._coerce_contrib_array(subject)
         base = self._coerce_contrib_array(baseline)
-        # Compute simple (unweighted) per-example arrays in log space; weights ignored for bootstrap here
+        # Compute per-example arrays in log space; use weights for paired bootstrap
         subj_vals = [v for (v, _w) in subj]
         base_vals = [v for (v, _w) in base]
+        pair_weights = []
+        for (_sv, sw), (_bv, bw) in zip(subj, base, strict=False):
+            weight = bw if math.isfinite(bw) and bw > 0 else sw
+            if not math.isfinite(weight) or weight <= 0:
+                weight = 1.0
+            pair_weights.append(float(weight))
         # Points in display space
         def _point(
@@ -249,15 +255,24 @@ class _PPLCausal(PrimaryMetric):
         dlog_lo, dlog_hi = compute_paired_delta_log_ci(
             subj_vals,
             base_vals,
+            weights=pair_weights,
             method="bca",
             replicates=reps_eff,
             alpha=alpha,
             seed=seed_eff,
         )
-        delta_log = float(
-            sum((s - b) for s, b in zip(subj_vals, base_vals, strict=False))
-            / max(1, min(len(subj_vals), len(base_vals)))
-        )
+        if pair_weights and len(pair_weights) >= min(len(subj_vals), len(base_vals)):
+            sw = 0.0
+            swx = 0.0
+            for s, b, w in zip(subj_vals, base_vals, pair_weights, strict=False):
+                sw += w
+                swx += w * (s - b)
+            delta_log = float(swx / sw) if sw > 0 else float("nan")
+        else:
+            delta_log = float(
+                sum((s - b) for s, b in zip(subj_vals, base_vals, strict=False))
+                / max(1, min(len(subj_vals), len(base_vals)))
+            )
         ratio = self.display_transform(delta_log)
         return {
             "kind": self.kind,

invarlock 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

invarlock 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl