invarlock-0.3.3-py3-none-any.whl → invarlock-0.3.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -117,6 +117,18 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:
 
     match_fraction = stats.get("window_match_fraction")
     overlap_fraction = stats.get("window_overlap_fraction")
+    pairing_reason = stats.get("window_pairing_reason")
+    paired_windows = _coerce_int(stats.get("paired_windows"))
+
+    if pairing_reason is not None:
+        errors.append(
+            "window_pairing_reason must be null/None for paired certificates "
+            f"(found {pairing_reason!r})."
+        )
+    if paired_windows is None:
+        errors.append("Certificate missing paired_windows metric.")
+    elif paired_windows == 0:
+        errors.append("paired_windows must be > 0 for paired certificates (found 0).")
 
     if match_fraction is None:
         errors.append("Certificate missing window_match_fraction metric.")
invarlock/core/runner.py CHANGED
@@ -1528,7 +1528,7 @@ class CoreRunner:
             pairing_reason = "duplicate_windows"
         elif count_mismatch:
             pairing_reason = "count_mismatch"
-        elif not pairing_context:
+        else:
             pairing_reason = preview_pair_stats.get(
                 "reason"
             ) or final_pair_stats.get("reason")
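Switching the final branch from `elif not pairing_context:` to `else:` means a pairing reason is now resolved for every remaining case, preferring the preview stats and falling back to the final stats. A rough illustration of that fallback (the dict contents are hypothetical):

    preview_pair_stats = {"reason": None}
    final_pair_stats = {"reason": "preview_unavailable"}  # hypothetical value
    # Same expression as the hunk above: the first non-empty reason wins.
    pairing_reason = preview_pair_stats.get("reason") or final_pair_stats.get("reason")
    assert pairing_reason == "preview_unavailable"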
@@ -2079,24 +2079,49 @@ class CoreRunner:
             # Perform rollback if checkpoint available
             if self.checkpoint_manager and "initial_checkpoint" in report.meta:
                 checkpoint_id = report.meta["initial_checkpoint"]
-                self.checkpoint_manager.restore_checkpoint(
-                    model, adapter, checkpoint_id
-                )
-                # Match test expectation: only include checkpoint and reason
-                self._log_event(
-                    "finalize",
-                    "rollback",
-                    LogLevel.WARNING,
-                    {
-                        "checkpoint": checkpoint_id,
-                        "reason": rollback_reason,
-                    },
-                )
+                restored = False
+                restore_error: str | None = None
+                try:
+                    restored = bool(
+                        self.checkpoint_manager.restore_checkpoint(
+                            model, adapter, checkpoint_id
+                        )
+                    )
+                except Exception as exc:
+                    restored = False
+                    restore_error = str(exc)
+
+                if restored:
+                    # Match test expectation: only include checkpoint and reason
+                    self._log_event(
+                        "finalize",
+                        "rollback",
+                        LogLevel.WARNING,
+                        {
+                            "checkpoint": checkpoint_id,
+                            "reason": rollback_reason,
+                        },
+                    )
+                else:
+                    self._log_event(
+                        "finalize",
+                        "rollback_failed",
+                        LogLevel.CRITICAL,
+                        {
+                            "mode": "finalize",
+                            "checkpoint": checkpoint_id,
+                            "reason": rollback_reason,
+                            "error": restore_error or "restore_failed",
+                        },
+                    )
 
                 # Store rollback metadata in report
                 report.meta["rollback_reason"] = rollback_reason
                 report.meta["rollback_checkpoint"] = checkpoint_id
-                report.meta["guard_recovered"] = True
+                report.meta["guard_recovered"] = bool(restored)
+                report.meta["rollback_failed"] = not bool(restored)
+                if not restored:
+                    report.meta["rollback_error"] = restore_error or "restore_failed"
 
             else:
                 # Match test expectation: log without additional data payload
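With the guarded restore above, a failed rollback is no longer reported as a recovery: `guard_recovered`, `rollback_failed`, and `rollback_error` now disambiguate the outcome in `report.meta`. A small sketch of how a caller might read those fields (the `summarize_rollback` helper is hypothetical; the keys come from the hunk above):

    def summarize_rollback(meta: dict) -> str:
        """Summarize the rollback outcome from report metadata (illustrative helper)."""
        if meta.get("rollback_failed"):
            return (
                f"rollback to {meta.get('rollback_checkpoint')} failed: "
                f"{meta.get('rollback_error', 'restore_failed')}"
            )
        if meta.get("guard_recovered"):
            return (
                f"rolled back to {meta.get('rollback_checkpoint')} "
                f"({meta.get('rollback_reason')})"
            )
        return "no rollback performed"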
@@ -7,6 +7,6 @@ from __future__ import annotations
 # matching entry to `CHANGELOG.md`.
 
 BENCH_GOLDEN_ID = "bench-golden-2025-12-13"
-BENCH_GOLDEN_SHA256 = "0d9ff3274d29dad16ad580b4a0cf37b4f89e4f7c2e4345ce3d30a39f146ff5a7"
+BENCH_GOLDEN_SHA256 = "2627b8872cd6bfc37bda31fbc11b78ed814751cbf2a9ad1396e173f1f4e5383a"
 
 __all__ = ["BENCH_GOLDEN_ID", "BENCH_GOLDEN_SHA256"]
invarlock/eval/data.py CHANGED
@@ -7,7 +7,6 @@ Pluggable data loading system with deterministic windowing for reproducible eval
 
 from __future__ import annotations
 
-import atexit
 import hashlib
 import json
 import math
@@ -51,7 +50,6 @@ except ImportError:
 
 try:
     import torch
-    import torch.nn.functional as F
 
     HAS_TORCH = True
 except ImportError:
@@ -160,9 +158,9 @@ class WikiText2Provider:
     """
 
     name = "wikitext2"
-    _MODEL_CACHE: Any | None | bool = None
-    _MODEL_DEVICE: Any | None = None
-    _CLEANUP_REGISTERED: bool = False
+    _BYTE_NGRAM_ORDER = 4
+    _BYTE_NGRAM_PAD = 256
+    _BYTE_NGRAM_ALPHA = 1.0
 
     def __init__(
         self,
@@ -178,13 +176,9 @@ class WikiText2Provider:
         """
         self.cache_dir = cache_dir
         self._validate_dependencies()
-        self._register_cleanup()
-        self._difficulty_model = self.__class__._MODEL_CACHE
-        self._difficulty_device = self.__class__._MODEL_DEVICE
         self._last_stratification_stats: dict[str, Any] | None = None
         self._last_batch_size_used: int = 0
         self._last_scorer_profile: dict[str, Any] | None = None
-        self._scorer_warmed: bool = False
         # In-process cache for loaded/filtered texts to avoid repeated
         # load_dataset() calls across stratification retries.
         self._texts_cache: dict[str, list[str]] = {}
@@ -192,43 +186,6 @@ class WikiText2Provider:
         normalized_hint = (device_hint or "").strip().lower()
         self._device_hint: str | None = normalized_hint or None
 
-    @classmethod
-    def _register_cleanup(cls) -> None:
-        """Register an atexit hook once per process to release cached models."""
-        if cls._CLEANUP_REGISTERED or not HAS_TORCH:
-            return
-
-        def _cleanup() -> None:
-            cls._cleanup_model_cache()
-
-        atexit.register(_cleanup)
-        cls._CLEANUP_REGISTERED = True
-
-    @classmethod
-    def _cleanup_model_cache(cls) -> None:
-        """Release cached models to avoid leaking multiprocessing semaphores."""
-        cache = cls._MODEL_CACHE
-        if cache is not None and cache is not False and HAS_TORCH:
-            try:
-                cache.to("cpu")
-            except Exception:
-                pass
-        cls._MODEL_CACHE = None
-        cls._MODEL_DEVICE = None
-
-    @staticmethod
-    def _pick_default_scorer_device() -> torch.device:
-        """
-        Choose a default device for the difficulty scorer model.
-
-        Prefers CUDA → MPS → CPU when available.
-        """
-        if torch.cuda.is_available():
-            return torch.device("cuda")
-        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            return torch.device("mps")
-        return torch.device("cpu")
-
     def _validate_dependencies(self) -> None:
         """Check that required dependencies are available."""
         if not HAS_DATASETS:
@@ -513,9 +470,11 @@ class WikiText2Provider:
             candidates.append(
                 {
                     "dataset_index": idx,
+                    "text": texts[idx],
                     "input_ids": input_ids_list,
                     "attention_mask": attention_mask_list,
                     "token_count": real_tokens,
+                    "seq_len": len(input_ids_list),
                 }
            )
 
@@ -531,32 +490,7 @@ class WikiText2Provider:
                 details={"needed": int(total_required), "got": int(len(candidates))},
             )
 
-        if not self._score_candidates_with_model(candidates):
-            token_counter: Counter[int] = Counter()
-            for candidate in candidates:
-                for token_id, mask in zip(
-                    candidate["input_ids"], candidate["attention_mask"], strict=False
-                ):
-                    if mask:
-                        token_counter[int(token_id)] += 1
-
-            total_tokens = sum(token_counter.values()) or 1
-            vocab_size = max(len(token_counter), 1)
-
-            for candidate in candidates:
-                difficulty = 0.0
-                real_tokens = 0
-                for token_id, mask in zip(
-                    candidate["input_ids"], candidate["attention_mask"], strict=False
-                ):
-                    if not mask:
-                        continue
-                    freq = (token_counter[int(token_id)] + 1.0) / (
-                        total_tokens + vocab_size
-                    )
-                    difficulty -= math.log(freq)
-                    real_tokens += 1
-                candidate["difficulty"] = difficulty / max(real_tokens, 1)
+        self._score_candidates_byte_ngram(candidates)
 
         sorted_candidates = sorted(
             candidates, key=lambda item: (item["difficulty"], item["dataset_index"])
@@ -843,182 +777,63 @@ class WikiText2Provider:
 
         return results
 
-    def _score_candidates_with_model(self, candidates: list[dict[str, Any]]) -> bool:
-        """Score candidate windows using a pretrained GPT-2 model if available."""
-        if not HAS_TORCH:
-            return False
-
-        if self._difficulty_model is False:
-            return False
-
-        try:
-            eval_device_override = os.environ.get("INVARLOCK_EVAL_DEVICE")
-            device_hint = getattr(self, "_device_hint", None)
-
-            def _is_device_usable(device: torch.device) -> bool:
-                try:
-                    _ = torch.zeros((1, 1), dtype=torch.long, device=device)
-                    return True
-                except Exception:
-                    return False
-
-            if self._difficulty_model is None:
-                from transformers import GPT2LMHeadModel
-
-                model = GPT2LMHeadModel.from_pretrained("gpt2")
-                model.eval()
-                # Decide initial scorer device: env override → provider hint → heuristic
-                if eval_device_override:
-                    try:
-                        device = torch.device(eval_device_override)
-                    except Exception:
-                        device = self._pick_default_scorer_device()
-                elif device_hint and device_hint != "auto":
-                    try:
-                        device = torch.device(device_hint)
-                    except Exception:
-                        device = self._pick_default_scorer_device()
-                else:
-                    device = self._pick_default_scorer_device()
-
-                if device.type != "cpu" and not _is_device_usable(device):
-                    warnings.warn(
-                        f"Difficulty scorer device {device} unavailable; falling back to CPU",
-                        stacklevel=2,
-                    )
-                    device = torch.device("cpu")
-
-                model.to(device)
-                self._difficulty_model = model
-                self._difficulty_device = device
-                self.__class__._MODEL_CACHE = model
-                self.__class__._MODEL_DEVICE = device
-
-            assert self._difficulty_model is not None
-            model = self._difficulty_model
-            device = self._difficulty_device or torch.device("cpu")
-
-            # If a new override/hint is provided, move the cached model if needed.
-            desired_device = device
-            if eval_device_override:
-                try:
-                    desired_device = torch.device(eval_device_override)
-                except Exception:
-                    desired_device = device
-            elif device_hint and device_hint != "auto":
-                try:
-                    desired_device = torch.device(device_hint)
-                except Exception:
-                    desired_device = device
-
-            if desired_device != device:
-                if desired_device.type != "cpu" and not _is_device_usable(
-                    desired_device
-                ):
-                    warnings.warn(
-                        f"Difficulty scorer device {desired_device} unavailable; keeping {device}",
-                        stacklevel=2,
-                    )
-                else:
-                    try:
-                        model.to(desired_device)
-                        device = desired_device
-                        self._difficulty_device = desired_device
-                        self.__class__._MODEL_DEVICE = desired_device
-                    except Exception as exc:
-                        warnings.warn(
-                            f"Failed to move GPT-2 difficulty scorer to {desired_device}: {exc}",
-                            stacklevel=2,
-                        )
-
-            if not self._scorer_warmed:
-                with torch.no_grad():
-                    dummy_input = torch.zeros((1, 8), dtype=torch.long, device=device)
-                    dummy_attention = torch.ones_like(dummy_input)
-                    model(dummy_input, attention_mask=dummy_attention)
-                self._scorer_warmed = True
-
-            batch_override = os.environ.get("INVARLOCK_SCORES_BATCH_SIZE")
-            override_size = None
-            if batch_override:
-                try:
-                    override_size = max(1, int(batch_override))
-                except ValueError:
-                    override_size = None
-
-            batch_size = min(32, max(4, len(candidates)))
-            if override_size is not None:
-                batch_size = max(1, min(override_size, len(candidates)))
-
-            input_batch: list[list[int]] = []
-            attention_batch: list[list[int]] = []
-            candidate_batch: list[dict[str, Any]] = []
-            total_tokens = 0
-            start_time = time.perf_counter()
-
-            with torch.no_grad():
-                for candidate in candidates:
-                    input_batch.append(candidate["input_ids"])
-                    attention_batch.append(candidate["attention_mask"])
-                    candidate_batch.append(candidate)
-
-                    if len(input_batch) == batch_size or candidate is candidates[-1]:
-                        input_tensor = torch.tensor(
-                            input_batch, dtype=torch.long, device=device
-                        )
-                        attention_tensor = torch.tensor(
-                            attention_batch, dtype=torch.long, device=device
-                        )
-
-                        outputs = model(input_tensor, attention_mask=attention_tensor)
-                        shift_logits = outputs.logits[:, :-1, :].contiguous()
-                        shift_labels = input_tensor[:, 1:].contiguous()
-                        shift_mask = attention_tensor[:, 1:].contiguous()
-                        shift_labels = shift_labels.masked_fill(shift_mask == 0, 0)
-
-                        vocab_size = shift_logits.size(-1)
-                        losses = F.cross_entropy(
-                            shift_logits.view(-1, vocab_size),
-                            shift_labels.view(-1),
-                            reduction="none",
-                        )
-                        losses = losses.view(shift_labels.size()) * shift_mask
-                        token_counts = shift_mask.sum(dim=1).clamp(min=1)
-                        loss_per_example = (
-                            (losses.sum(dim=1) / token_counts).cpu().tolist()
-                        )
-
-                        for cand_obj, loss_value in zip(
-                            candidate_batch, loss_per_example, strict=False
-                        ):
-                            cand_obj["difficulty"] = float(loss_value)
-                        total_tokens += int(token_counts.sum().item())
-
-                        input_batch.clear()
-                        attention_batch.clear()
-                        candidate_batch.clear()
-            self._last_batch_size_used = batch_size
-            elapsed = max(time.perf_counter() - start_time, 1e-9)
-            tokens_per_sec = total_tokens / elapsed if total_tokens else 0.0
-            self._last_scorer_profile = {
-                "batch_size": batch_size,
-                "tokens_processed": total_tokens,
-                "elapsed_seconds": elapsed,
-                "tokens_per_second": tokens_per_sec,
-            }
-            return True
-        except Exception as exc:  # pragma: no cover - defensive
-            warnings.warn(
-                f"Failed to compute GPT-2 difficulty scores: {exc}", stacklevel=2
-            )
-            self._difficulty_model = False
-            self._difficulty_device = None
-            self.__class__._MODEL_CACHE = False
-            self.__class__._MODEL_DEVICE = None
+    def _score_candidates_byte_ngram(self, candidates: list[dict[str, Any]]) -> bool:
+        if not candidates:
             self._last_batch_size_used = 0
             self._last_scorer_profile = None
             return False
 
+        order = max(1, int(self._BYTE_NGRAM_ORDER))
+        pad_token = int(self._BYTE_NGRAM_PAD)
+        alpha = float(self._BYTE_NGRAM_ALPHA)
+        vocab_size = pad_token + 1
+
+        context_counts: Counter[tuple[int, ...]] = Counter()
+        ngram_counts: Counter[tuple[int, ...]] = Counter()
+        sequences: list[list[int]] = []
+        start_time = time.perf_counter()
+
+        for candidate in candidates:
+            text = candidate.get("text")
+            if not isinstance(text, str):
+                text = ""
+            byte_values = list(text.encode("utf-8", errors="replace"))
+            tokens = ([pad_token] * (order - 1)) + byte_values
+            sequences.append(tokens)
+            for idx in range(order - 1, len(tokens)):
+                context = tuple(tokens[idx - order + 1 : idx])
+                ngram = context + (tokens[idx],)
+                context_counts[context] += 1
+                ngram_counts[ngram] += 1
+
+        total_tokens = 0
+        for candidate, tokens in zip(candidates, sequences, strict=False):
+            loss_sum = 0.0
+            token_count = 0
+            for idx in range(order - 1, len(tokens)):
+                context = tuple(tokens[idx - order + 1 : idx])
+                ngram = context + (tokens[idx],)
+                context_count = context_counts.get(context, 0)
+                ngram_count = ngram_counts.get(ngram, 0)
+                prob = (ngram_count + alpha) / (context_count + alpha * vocab_size)
+                loss_sum += -math.log(prob)
+                token_count += 1
+            candidate["difficulty"] = loss_sum / max(token_count, 1)
+            total_tokens += token_count
+
+        self._last_batch_size_used = len(candidates)
+        elapsed = max(time.perf_counter() - start_time, 1e-9)
+        tokens_per_sec = total_tokens / elapsed if total_tokens else 0.0
+        self._last_scorer_profile = {
+            "mode": "byte_ngram",
+            "order": order,
+            "vocab_size": vocab_size,
+            "tokens_processed": total_tokens,
+            "elapsed_seconds": elapsed,
+            "tokens_per_second": tokens_per_sec,
+        }
+        return True
+
     def _tokenize_samples(
         self,
         texts: list[str],
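The GPT-2 difficulty scorer is replaced by a self-contained, dependency-free byte n-gram scorer: every window is scored by its mean negative log-probability under an add-alpha smoothed byte 4-gram model, P(b | context) = (count(context, b) + alpha) / (count(context) + alpha * V), with V = 257 (256 byte values plus one pad symbol). A standalone sketch of the same scoring rule, outside the provider class and with simplified bookkeeping:

    import math
    from collections import Counter

    def byte_ngram_difficulty(texts: list[str], order: int = 4, alpha: float = 1.0) -> list[float]:
        """Mean negative log-probability per byte under an add-alpha smoothed n-gram model."""
        pad, vocab = 256, 257
        context_counts: Counter[tuple[int, ...]] = Counter()
        ngram_counts: Counter[tuple[int, ...]] = Counter()
        sequences = [
            [pad] * (order - 1) + list(t.encode("utf-8", errors="replace")) for t in texts
        ]
        # First pass: collect context and n-gram counts over all windows.
        for tokens in sequences:
            for i in range(order - 1, len(tokens)):
                ctx = tuple(tokens[i - order + 1 : i])
                context_counts[ctx] += 1
                ngram_counts[ctx + (tokens[i],)] += 1
        # Second pass: mean negative log-probability per window.
        scores = []
        for tokens in sequences:
            nll, n = 0.0, 0
            for i in range(order - 1, len(tokens)):
                ctx = tuple(tokens[i - order + 1 : i])
                p = (ngram_counts[ctx + (tokens[i],)] + alpha) / (context_counts[ctx] + alpha * vocab)
                nll -= math.log(p)
                n += 1
            scores.append(nll / max(n, 1))
        return scores

This per-window score is the `difficulty` value the provider sorts on when stratifying candidates.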
invarlock/eval/metrics.py CHANGED
@@ -1379,6 +1379,88 @@ def _resolve_eval_device(
     return resolved
 
 
+def _infer_model_vocab_size(model: nn.Module) -> int | None:
+    """Best-effort vocab size for guarding against invalid token IDs.
+
+    Prefer the actual embedding size (more reliable than config.vocab_size when
+    tokenizers have added tokens), and fall back to config when embeddings are
+    unavailable (e.g., stub models in tests).
+    """
+    try:
+        get_emb = getattr(model, "get_input_embeddings", None)
+        if callable(get_emb):
+            emb = get_emb()
+            weight = getattr(emb, "weight", None)
+            if weight is not None and hasattr(weight, "shape"):
+                size = int(weight.shape[0])
+                if size > 0:
+                    return size
+    except Exception:
+        pass
+
+    # Fallback for lightweight/stub models: pick the largest nn.Embedding module.
+    # This is not guaranteed to be the token embedding, but is a good heuristic
+    # when get_input_embeddings/config.vocab_size are unavailable.
+    try:
+        max_embeddings = 0
+        for module in model.modules():
+            if isinstance(module, nn.Embedding):
+                max_embeddings = max(max_embeddings, int(module.num_embeddings))
+        if max_embeddings > 0:
+            return max_embeddings
+    except Exception:
+        pass
+
+    config = getattr(model, "config", None)
+    vocab_size = getattr(config, "vocab_size", None)
+    if isinstance(vocab_size, int) and vocab_size > 0:
+        return vocab_size
+    return None
+
+
+def _resolve_pad_token_id(model: nn.Module, vocab_size: int | None) -> int:
+    """Pick a safe pad token id for sanitizing invalid token IDs."""
+    config = getattr(model, "config", None)
+    pad_token_id = getattr(config, "pad_token_id", None)
+    if isinstance(pad_token_id, int) and pad_token_id >= 0:
+        if vocab_size is None or pad_token_id < vocab_size:
+            return pad_token_id
+    return 0
+
+
+def _sanitize_token_ids_for_model(
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    labels: torch.Tensor | None,
+    *,
+    vocab_size: int,
+    pad_token_id: int,
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+    """Prevent device-side asserts from out-of-range token IDs.
+
+    Out-of-range token IDs can trigger CUDA device-side asserts in embedding and
+    gather kernels, poisoning the CUDA context for the entire process. Instead,
+    mask them out as padding and ignore them in labels.
+    """
+    if vocab_size <= 0:
+        return input_ids, attention_mask, labels
+
+    invalid_inputs = (input_ids < 0) | (input_ids >= vocab_size)
+    if invalid_inputs.any():
+        input_ids = input_ids.masked_fill(invalid_inputs, pad_token_id)
+        if attention_mask is not None:
+            attention_mask = attention_mask.masked_fill(invalid_inputs, 0)
+        if labels is not None:
+            labels = labels.masked_fill(invalid_inputs, -100)
+
+    if labels is not None:
+        invalid_labels = (labels != -100) & ((labels < 0) | (labels >= vocab_size))
+        if invalid_labels.any():
+            labels = labels.masked_fill(invalid_labels, -100)
+
+    return input_ids, attention_mask, labels
+
+
 # ── Perplexity calculation ─────────────────────────────────────────────────
 @torch.no_grad()
 def calculate_perplexity(
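The helpers above clamp out-of-range token IDs before they can reach embedding or gather kernels, where they would otherwise trigger CUDA device-side asserts. A toy check of the masking behaviour, assuming the private helper keeps the signature shown in the hunk:

    import torch
    from invarlock.eval.metrics import _sanitize_token_ids_for_model  # private helper; import shown for illustration only

    input_ids = torch.tensor([[3, 7, 42, -1]])   # 42 and -1 are out of range for vocab_size=10
    attention_mask = torch.ones_like(input_ids)
    labels = input_ids.clone()

    input_ids, attention_mask, labels = _sanitize_token_ids_for_model(
        input_ids, attention_mask, labels, vocab_size=10, pad_token_id=0
    )
    # input_ids      -> [[3, 7, 0, 0]]         invalid IDs replaced by the pad token
    # attention_mask -> [[1, 1, 0, 0]]         invalid positions masked out
    # labels         -> [[3, 7, -100, -100]]   invalid positions ignored by the loss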
@@ -1415,6 +1497,8 @@ def compute_perplexity_strict(
     device = _resolve_eval_device(model, device)
 
     model.eval()
+    model_vocab_size = _infer_model_vocab_size(model)
+    pad_token_id = _resolve_pad_token_id(model, model_vocab_size)
     nll_sum = 0.0
     tok_count = 0
 
@@ -1453,6 +1537,15 @@ def compute_perplexity_strict(
         else:
             labels = labels.to(device)
 
+        if model_vocab_size is not None:
+            input_ids, attn, labels = _sanitize_token_ids_for_model(
+                input_ids,
+                attn,
+                labels,
+                vocab_size=model_vocab_size,
+                pad_token_id=pad_token_id,
+            )
+
         # Skip if sequence too short
         if input_ids.size(1) < 2:
             continue
@@ -1507,7 +1600,11 @@ def compute_perplexity_strict(
             continue
 
         log_probs = shift_logits.log_softmax(dim=-1)  # [B,T-1,V]
-        tgt = shift_labels.clamp_min(0).unsqueeze(-1)  # [B,T-1,1]
+        vocab_size = int(shift_logits.size(-1))
+        valid = valid & (shift_labels >= 0) & (shift_labels < vocab_size)
+        if not valid.any():
+            continue
+        tgt = shift_labels.clamp(min=0, max=vocab_size - 1).unsqueeze(-1)  # [B,T-1,1]
         nll = -log_probs.gather(-1, tgt).squeeze(-1)  # [B,T-1]
 
         nll_sum += nll[valid].sum().item()
@@ -1552,6 +1649,8 @@ def compute_perplexity(
     device = _resolve_eval_device(model, device)
 
     model.eval()
+    model_vocab_size = _infer_model_vocab_size(model)
+    pad_token_id = _resolve_pad_token_id(model, model_vocab_size)
     nll_sum = 0.0
     tok_count = 0
     batch_count = 0
@@ -1589,6 +1688,15 @@ def compute_perplexity(
         else:
             labels = labels.to(device)
 
+        if model_vocab_size is not None:
+            input_ids, attn, labels = _sanitize_token_ids_for_model(
+                input_ids,
+                attn,
+                labels,
+                vocab_size=model_vocab_size,
+                pad_token_id=pad_token_id,
+            )
+
         # Skip if sequence too short
         if input_ids.size(1) < 2:
             continue
@@ -1620,7 +1728,11 @@ def compute_perplexity(
 
         # Compute negative log-likelihood
         log_probs = shift_logits.log_softmax(dim=-1)  # [B,T-1,V]
-        tgt = shift_labels.clamp_min(0).unsqueeze(-1)  # [B,T-1,1]
+        vocab_size = int(shift_logits.size(-1))
+        valid = valid & (shift_labels >= 0) & (shift_labels < vocab_size)
+        if not valid.any():
+            continue
+        tgt = shift_labels.clamp(min=0, max=vocab_size - 1).unsqueeze(-1)  # [B,T-1,1]
 
         # MPS workaround: gather operation can fail on MPS, use CPU fallback
         if str(device).startswith("mps"):
@@ -1694,6 +1806,8 @@ def compute_ppl(
     device = _resolve_eval_device(model, device)
 
     model.eval()
+    model_vocab_size = _infer_model_vocab_size(model)
+    pad_token_id = _resolve_pad_token_id(model, model_vocab_size)
     nll_sum = 0.0
     tok_count = 0
 
@@ -1712,6 +1826,15 @@ def compute_ppl(
             torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0).to(device)
         )
 
+        if model_vocab_size is not None:
+            input_ids_tensor, attention_mask_tensor, _ = _sanitize_token_ids_for_model(
+                input_ids_tensor,
+                attention_mask_tensor,
+                labels=None,
+                vocab_size=model_vocab_size,
+                pad_token_id=pad_token_id,
+            )
+
         # Skip sequences that are too short
         if input_ids_tensor.size(1) < 2:
             continue
@@ -1747,7 +1870,11 @@ def compute_ppl(
 
         # Compute negative log-likelihood
         log_probs = shift_logits.log_softmax(dim=-1)  # [B,T-1,V]
-        tgt = shift_labels.clamp_min(0).unsqueeze(-1)  # [B,T-1,1]
+        vocab_size = int(shift_logits.size(-1))
+        valid = valid & (shift_labels >= 0) & (shift_labels < vocab_size)
+        if not valid.any():
+            continue
+        tgt = shift_labels.clamp(min=0, max=vocab_size - 1).unsqueeze(-1)  # [B,T-1,1]
 
         # Handle MPS device issues with gather
         if str(device).startswith("mps"):
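All three perplexity paths now apply the same guard before the gather: labels are validated against the logits' vocabulary dimension, batches with no valid targets are skipped, and the remaining targets are clamped so the gather can never index out of range. A standalone toy version of that step (the tensors are made up):

    import torch

    vocab_size = 5
    shift_logits = torch.randn(1, 4, vocab_size)
    shift_labels = torch.tensor([[1, 4, 9, -100]])   # 9 is out of range, -100 means "ignore"
    valid = shift_labels != -100

    log_probs = shift_logits.log_softmax(dim=-1)                      # [B, T-1, V]
    valid = valid & (shift_labels >= 0) & (shift_labels < vocab_size)  # drop out-of-range targets
    tgt = shift_labels.clamp(min=0, max=vocab_size - 1).unsqueeze(-1)  # safe indices for gather
    nll = -log_probs.gather(-1, tgt).squeeze(-1)                       # gather is always in range
    nll_sum = nll[valid].sum().item()                                  # only positions 0 and 1 contribute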