invarlock-0.3.4-py3-none-any.whl → invarlock-0.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +1 -1
- invarlock/_data/runtime/tiers.yaml +57 -30
- invarlock/adapters/__init__.py +1 -1
- invarlock/calibration/spectral_null.py +15 -10
- invarlock/calibration/variance_ve.py +0 -2
- invarlock/cli/commands/calibrate.py +6 -2
- invarlock/cli/commands/certify.py +58 -39
- invarlock/cli/commands/doctor.py +3 -1
- invarlock/cli/commands/explain_gates.py +57 -8
- invarlock/cli/commands/report.py +1 -1
- invarlock/cli/commands/run.py +159 -61
- invarlock/cli/commands/verify.py +78 -4
- invarlock/cli/config.py +21 -5
- invarlock/core/api.py +45 -5
- invarlock/core/auto_tuning.py +65 -20
- invarlock/core/contracts.py +7 -1
- invarlock/core/registry.py +2 -2
- invarlock/core/runner.py +314 -50
- invarlock/eval/bench.py +0 -13
- invarlock/eval/data.py +73 -283
- invarlock/eval/metrics.py +134 -4
- invarlock/eval/primary_metric.py +23 -0
- invarlock/eval/tail_stats.py +230 -0
- invarlock/guards/_estimators.py +154 -0
- invarlock/guards/policies.py +16 -6
- invarlock/guards/rmt.py +625 -544
- invarlock/guards/spectral.py +348 -110
- invarlock/guards/tier_config.py +32 -30
- invarlock/guards/variance.py +5 -29
- invarlock/guards_ref/rmt_ref.py +23 -23
- invarlock/model_profile.py +42 -15
- invarlock/reporting/certificate.py +225 -46
- invarlock/reporting/certificate_schema.py +2 -1
- invarlock/reporting/dataset_hashing.py +15 -2
- invarlock/reporting/guards_analysis.py +197 -274
- invarlock/reporting/normalizer.py +6 -0
- invarlock/reporting/policy_utils.py +38 -36
- invarlock/reporting/primary_metric_utils.py +71 -17
- invarlock/reporting/render.py +61 -0
- invarlock/reporting/report.py +1 -1
- invarlock/reporting/report_types.py +5 -2
- invarlock/reporting/validate.py +1 -18
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/METADATA +6 -6
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/RECORD +48 -46
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/WHEEL +0 -0
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/entry_points.txt +0 -0
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.4.dist-info → invarlock-0.3.6.dist-info}/top_level.txt +0 -0
invarlock/eval/data.py
CHANGED
@@ -7,7 +7,6 @@ Pluggable data loading system with deterministic windowing for reproducible eval
 
 from __future__ import annotations
 
-import atexit
 import hashlib
 import json
 import math
@@ -51,7 +50,6 @@ except ImportError:
 
 try:
     import torch
-    import torch.nn.functional as F
 
     HAS_TORCH = True
 except ImportError:
@@ -160,9 +158,9 @@ class WikiText2Provider:
     """
 
     name = "wikitext2"
-
-
-
+    _BYTE_NGRAM_ORDER = 4
+    _BYTE_NGRAM_PAD = 256
+    _BYTE_NGRAM_ALPHA = 1.0
 
     def __init__(
         self,
@@ -178,13 +176,9 @@ class WikiText2Provider:
         """
         self.cache_dir = cache_dir
         self._validate_dependencies()
-        self._register_cleanup()
-        self._difficulty_model = self.__class__._MODEL_CACHE
-        self._difficulty_device = self.__class__._MODEL_DEVICE
         self._last_stratification_stats: dict[str, Any] | None = None
         self._last_batch_size_used: int = 0
         self._last_scorer_profile: dict[str, Any] | None = None
-        self._scorer_warmed: bool = False
         # In-process cache for loaded/filtered texts to avoid repeated
         # load_dataset() calls across stratification retries.
         self._texts_cache: dict[str, list[str]] = {}
@@ -192,48 +186,9 @@ class WikiText2Provider:
         normalized_hint = (device_hint or "").strip().lower()
         self._device_hint: str | None = normalized_hint or None
 
-    @classmethod
-    def _register_cleanup(cls) -> None:
-        """Register an atexit hook once per process to release cached models."""
-        if cls._CLEANUP_REGISTERED or not HAS_TORCH:
-            return
-
-        def _cleanup() -> None:
-            cls._cleanup_model_cache()
-
-        atexit.register(_cleanup)
-        cls._CLEANUP_REGISTERED = True
-
-    @classmethod
-    def _cleanup_model_cache(cls) -> None:
-        """Release cached models to avoid leaking multiprocessing semaphores."""
-        cache = cls._MODEL_CACHE
-        if cache is not None and cache is not False and HAS_TORCH:
-            try:
-                cache.to("cpu")
-            except Exception:
-                pass
-        cls._MODEL_CACHE = None
-        cls._MODEL_DEVICE = None
-
-    @staticmethod
-    def _pick_default_scorer_device() -> torch.device:
-        """
-        Choose a default device for the difficulty scorer model.
-
-        Prefers CUDA → MPS → CPU when available.
-        """
-        if torch.cuda.is_available():
-            return torch.device("cuda")
-        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-            return torch.device("mps")
-        return torch.device("cpu")
-
     def _validate_dependencies(self) -> None:
         """Check that required dependencies are available."""
         if not HAS_DATASETS:
-            if _LIGHT_IMPORT:
-                return
             raise _DepErr(
                 code="E301",
                 message=(
@@ -371,13 +326,6 @@ class WikiText2Provider:
         if cached is not None and len(cached) >= max_samples:
             return cached[:max_samples]
 
-        if not HAS_DATASETS and _LIGHT_IMPORT:
-            texts = ["hello world", "invarlock synthetic text"] * max(
-                1, max_samples // 2
-            )
-            self._texts_cache[split] = texts
-            return texts[:max_samples]
-
         # Load dataset with size limit for efficiency
         dataset_slice = f"{split}[:{max_samples}]" if max_samples > 0 else split
         dataset = load_dataset(
@@ -513,9 +461,11 @@ class WikiText2Provider:
             candidates.append(
                 {
                     "dataset_index": idx,
+                    "text": texts[idx],
                     "input_ids": input_ids_list,
                     "attention_mask": attention_mask_list,
                     "token_count": real_tokens,
+                    "seq_len": len(input_ids_list),
                 }
             )
 
@@ -531,32 +481,7 @@ class WikiText2Provider:
                 details={"needed": int(total_required), "got": int(len(candidates))},
             )
 
-
-        token_counter: Counter[int] = Counter()
-        for candidate in candidates:
-            for token_id, mask in zip(
-                candidate["input_ids"], candidate["attention_mask"], strict=False
-            ):
-                if mask:
-                    token_counter[int(token_id)] += 1
-
-        total_tokens = sum(token_counter.values()) or 1
-        vocab_size = max(len(token_counter), 1)
-
-        for candidate in candidates:
-            difficulty = 0.0
-            real_tokens = 0
-            for token_id, mask in zip(
-                candidate["input_ids"], candidate["attention_mask"], strict=False
-            ):
-                if not mask:
-                    continue
-                freq = (token_counter[int(token_id)] + 1.0) / (
-                    total_tokens + vocab_size
-                )
-                difficulty -= math.log(freq)
-                real_tokens += 1
-            candidate["difficulty"] = difficulty / max(real_tokens, 1)
+        self._score_candidates_byte_ngram(candidates)
 
         sorted_candidates = sorted(
             candidates, key=lambda item: (item["difficulty"], item["dataset_index"])
@@ -843,193 +768,63 @@ class WikiText2Provider:
 
         return results
 
-    def
-
-        if not HAS_TORCH:
-            return False
-
-        if self._difficulty_model is False:
-            return False
-
-        try:
-            eval_device_override = os.environ.get("INVARLOCK_EVAL_DEVICE")
-            device_hint = getattr(self, "_device_hint", None)
-
-            def _is_device_usable(device: torch.device) -> bool:
-                try:
-                    _ = torch.zeros((1, 1), dtype=torch.long, device=device)
-                    return True
-                except Exception:
-                    return False
-
-            if self._difficulty_model is None:
-                from transformers import GPT2LMHeadModel
-
-                model = GPT2LMHeadModel.from_pretrained("gpt2")
-                model.eval()
-                # Decide initial scorer device: env override → provider hint → heuristic
-                if eval_device_override:
-                    try:
-                        device = torch.device(eval_device_override)
-                    except Exception:
-                        device = self._pick_default_scorer_device()
-                elif device_hint and device_hint != "auto":
-                    try:
-                        device = torch.device(device_hint)
-                    except Exception:
-                        device = self._pick_default_scorer_device()
-                else:
-                    device = self._pick_default_scorer_device()
-
-                if device.type != "cpu" and not _is_device_usable(device):
-                    warnings.warn(
-                        f"Difficulty scorer device {device} unavailable; falling back to CPU",
-                        stacklevel=2,
-                    )
-                    device = torch.device("cpu")
-
-                model.to(device)
-                self._difficulty_model = model
-                self._difficulty_device = device
-                self.__class__._MODEL_CACHE = model
-                self.__class__._MODEL_DEVICE = device
-
-            assert self._difficulty_model is not None
-            model = self._difficulty_model
-            device = self._difficulty_device or torch.device("cpu")
-
-            # If a new override/hint is provided, move the cached model if needed.
-            desired_device = device
-            if eval_device_override:
-                try:
-                    desired_device = torch.device(eval_device_override)
-                except Exception:
-                    desired_device = device
-            elif device_hint and device_hint != "auto":
-                try:
-                    desired_device = torch.device(device_hint)
-                except Exception:
-                    desired_device = device
-
-            if desired_device != device:
-                if desired_device.type != "cpu" and not _is_device_usable(
-                    desired_device
-                ):
-                    warnings.warn(
-                        f"Difficulty scorer device {desired_device} unavailable; keeping {device}",
-                        stacklevel=2,
-                    )
-                else:
-                    try:
-                        model.to(desired_device)
-                        device = desired_device
-                        self._difficulty_device = desired_device
-                        self.__class__._MODEL_DEVICE = desired_device
-                    except Exception as exc:
-                        warnings.warn(
-                            f"Failed to move GPT-2 difficulty scorer to {desired_device}: {exc}",
-                            stacklevel=2,
-                        )
-
-            if not self._scorer_warmed:
-                with torch.no_grad():
-                    dummy_input = torch.zeros((1, 8), dtype=torch.long, device=device)
-                    dummy_attention = torch.ones_like(dummy_input)
-                    model(dummy_input, attention_mask=dummy_attention)
-                self._scorer_warmed = True
-
-            batch_override = os.environ.get("INVARLOCK_SCORES_BATCH_SIZE")
-            override_size = None
-            if batch_override:
-                try:
-                    override_size = max(1, int(batch_override))
-                except ValueError:
-                    override_size = None
-
-            batch_size = min(32, max(4, len(candidates)))
-            if override_size is not None:
-                batch_size = max(1, min(override_size, len(candidates)))
-
-            config = getattr(model, "config", None)
-            scorer_vocab_size = getattr(config, "vocab_size", None)
-
-            input_batch: list[list[int]] = []
-            attention_batch: list[list[int]] = []
-            candidate_batch: list[dict[str, Any]] = []
-            total_tokens = 0
-            start_time = time.perf_counter()
-
-            with torch.no_grad():
-                for candidate in candidates:
-                    input_batch.append(candidate["input_ids"])
-                    attention_batch.append(candidate["attention_mask"])
-                    candidate_batch.append(candidate)
-
-                    if len(input_batch) == batch_size or candidate is candidates[-1]:
-                        input_tensor = torch.tensor(
-                            input_batch, dtype=torch.long, device=device
-                        )
-                        attention_tensor = torch.tensor(
-                            attention_batch, dtype=torch.long, device=device
-                        )
-
-                        # Guard against out-of-range token IDs when scoring with GPT-2.
-                        # Some model tokenizers emit IDs beyond GPT-2 vocab, which can
-                        # trigger device-side asserts in embedding/gather kernels.
-                        if scorer_vocab_size and scorer_vocab_size > 0:
-                            input_tensor = input_tensor.clamp(
-                                min=0, max=scorer_vocab_size - 1
-                            )
-
-                        outputs = model(input_tensor, attention_mask=attention_tensor)
-                        shift_logits = outputs.logits[:, :-1, :].contiguous()
-                        shift_labels = input_tensor[:, 1:].contiguous()
-                        shift_mask = attention_tensor[:, 1:].contiguous()
-                        shift_labels = shift_labels.masked_fill(shift_mask == 0, 0)
-
-                        vocab_size = shift_logits.size(-1)
-                        losses = F.cross_entropy(
-                            shift_logits.view(-1, vocab_size),
-                            shift_labels.view(-1),
-                            reduction="none",
-                        )
-                        losses = losses.view(shift_labels.size()) * shift_mask
-                        token_counts = shift_mask.sum(dim=1).clamp(min=1)
-                        loss_per_example = (
-                            (losses.sum(dim=1) / token_counts).cpu().tolist()
-                        )
-
-                        for cand_obj, loss_value in zip(
-                            candidate_batch, loss_per_example, strict=False
-                        ):
-                            cand_obj["difficulty"] = float(loss_value)
-                        total_tokens += int(token_counts.sum().item())
-
-                        input_batch.clear()
-                        attention_batch.clear()
-                        candidate_batch.clear()
-            self._last_batch_size_used = batch_size
-            elapsed = max(time.perf_counter() - start_time, 1e-9)
-            tokens_per_sec = total_tokens / elapsed if total_tokens else 0.0
-            self._last_scorer_profile = {
-                "batch_size": batch_size,
-                "tokens_processed": total_tokens,
-                "elapsed_seconds": elapsed,
-                "tokens_per_second": tokens_per_sec,
-            }
-            return True
-        except Exception as exc:  # pragma: no cover - defensive
-            warnings.warn(
-                f"Failed to compute GPT-2 difficulty scores: {exc}", stacklevel=2
-            )
-            self._difficulty_model = False
-            self._difficulty_device = None
-            self.__class__._MODEL_CACHE = False
-            self.__class__._MODEL_DEVICE = None
+    def _score_candidates_byte_ngram(self, candidates: list[dict[str, Any]]) -> bool:
+        if not candidates:
             self._last_batch_size_used = 0
             self._last_scorer_profile = None
             return False
 
+        order = max(1, int(self._BYTE_NGRAM_ORDER))
+        pad_token = int(self._BYTE_NGRAM_PAD)
+        alpha = float(self._BYTE_NGRAM_ALPHA)
+        vocab_size = pad_token + 1
+
+        context_counts: Counter[tuple[int, ...]] = Counter()
+        ngram_counts: Counter[tuple[int, ...]] = Counter()
+        sequences: list[list[int]] = []
+        start_time = time.perf_counter()
+
+        for candidate in candidates:
+            text = candidate.get("text")
+            if not isinstance(text, str):
+                text = ""
+            byte_values = list(text.encode("utf-8", errors="replace"))
+            tokens = ([pad_token] * (order - 1)) + byte_values
+            sequences.append(tokens)
+            for idx in range(order - 1, len(tokens)):
+                context = tuple(tokens[idx - order + 1 : idx])
+                ngram = context + (tokens[idx],)
+                context_counts[context] += 1
+                ngram_counts[ngram] += 1
+
+        total_tokens = 0
+        for candidate, tokens in zip(candidates, sequences, strict=False):
+            loss_sum = 0.0
+            token_count = 0
+            for idx in range(order - 1, len(tokens)):
+                context = tuple(tokens[idx - order + 1 : idx])
+                ngram = context + (tokens[idx],)
+                context_count = context_counts.get(context, 0)
+                ngram_count = ngram_counts.get(ngram, 0)
+                prob = (ngram_count + alpha) / (context_count + alpha * vocab_size)
+                loss_sum += -math.log(prob)
+                token_count += 1
+            candidate["difficulty"] = loss_sum / max(token_count, 1)
+            total_tokens += token_count
+
+        self._last_batch_size_used = len(candidates)
+        elapsed = max(time.perf_counter() - start_time, 1e-9)
+        tokens_per_sec = total_tokens / elapsed if total_tokens else 0.0
+        self._last_scorer_profile = {
+            "mode": "byte_ngram",
+            "order": order,
+            "vocab_size": vocab_size,
+            "tokens_processed": total_tokens,
+            "elapsed_seconds": elapsed,
+            "tokens_per_second": tokens_per_sec,
+        }
+        return True
+
     def _tokenize_samples(
         self,
         texts: list[str],
@@ -1258,14 +1053,13 @@ class HFTextProvider:
         max_samples: int = 2000,
     ):
         if not HAS_DATASETS:
-
-
-
-
-
-
-
-            )
+            raise _DepErr(
+                code="E301",
+                message=(
+                    "DEPENDENCY-MISSING: datasets library required for hf_text provider"
+                ),
+                details={"dependency": "datasets"},
+            )
         self.dataset_name = dataset_name or "wikitext"
         self.config_name = config_name or None
         self.text_field = text_field
@@ -1273,9 +1067,6 @@
         self.max_samples = int(max_samples)
 
     def load(self, split: str = "validation", **kwargs) -> list[str]:
-        if not HAS_DATASETS and _LIGHT_IMPORT:
-            return ["synthetic dataset text"] * int(self.max_samples or 1)
-
         ds = load_dataset(
             path=self.dataset_name,
            name=self.config_name,
@@ -1400,14 +1191,13 @@ class HFSeq2SeqProvider:
         max_samples: int = 2000,
     ) -> None:
         if not HAS_DATASETS:
-
-
-
-
-
-
-
-            )
+            raise _DepErr(
+                code="E301",
+                message=(
+                    "DEPENDENCY-MISSING: datasets library required for hf_seq2seq provider"
+                ),
+                details={"dependency": "datasets"},
+            )
         self.dataset_name = dataset_name
         self.config_name = config_name
         self.src_field = src_field
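Note on the change above: 0.3.6 drops the cached GPT-2 difficulty scorer and instead ranks stratification candidates with an order-4 byte n-gram model using add-α smoothing over a 257-symbol alphabet (256 byte values plus one pad symbol). Below is a minimal standalone sketch of that scoring idea; the function name and sample strings are illustrative only, not part of the invarlock API.

import math
from collections import Counter

ORDER, PAD, ALPHA = 4, 256, 1.0
VOCAB = PAD + 1  # 257 symbols: 256 byte values plus the pad marker

def byte_ngram_difficulty(texts):
    # First pass: pool n-gram and context counts across the whole candidate set.
    context_counts = Counter()
    ngram_counts = Counter()
    sequences = []
    for text in texts:
        tokens = [PAD] * (ORDER - 1) + list(text.encode("utf-8", errors="replace"))
        sequences.append(tokens)
        for i in range(ORDER - 1, len(tokens)):
            ctx = tuple(tokens[i - ORDER + 1 : i])
            context_counts[ctx] += 1
            ngram_counts[ctx + (tokens[i],)] += 1

    # Second pass: mean negative log-likelihood per byte under the smoothed model.
    scores = []
    for tokens in sequences:
        nll, n = 0.0, 0
        for i in range(ORDER - 1, len(tokens)):
            ctx = tuple(tokens[i - ORDER + 1 : i])
            # Add-alpha smoothing: P = (count(ngram) + a) / (count(ctx) + a * V)
            p = (ngram_counts[ctx + (tokens[i],)] + ALPHA) / (
                context_counts[ctx] + ALPHA * VOCAB
            )
            nll -= math.log(p)
            n += 1
        scores.append(nll / max(n, 1))
    return scores

# Text with repeated byte patterns scores lower (easier) than novel text.
print(byte_ngram_difficulty(["the cat sat on the mat", "zqxv jkwp"]))

Because the counts come only from the candidates themselves, the score needs no pretrained model, no tokenizer, and no GPU, which is what lets the diff delete the atexit/model-cache machinery.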
invarlock/eval/metrics.py
CHANGED
@@ -723,7 +723,10 @@ def calculate_lens_metrics_for_model(
     except Exception as e:
         logger.error(f"Metrics calculation failed: {e}")
         if config.strict_validation:
-            raise MetricsError(
+            raise MetricsError(
+                code="E401",
+                message=f"METRICS-COMPUTE-FAILED: {e}",
+            ) from e
 
     finally:
         resource_manager.cleanup()
@@ -1379,6 +1382,88 @@ def _resolve_eval_device(
     return resolved
 
 
+def _infer_model_vocab_size(model: nn.Module) -> int | None:
+    """Best-effort vocab size for guarding against invalid token IDs.
+
+    Prefer the actual embedding size (more reliable than config.vocab_size when
+    tokenizers have added tokens), and fall back to config when embeddings are
+    unavailable (e.g., stub models in tests).
+    """
+    try:
+        get_emb = getattr(model, "get_input_embeddings", None)
+        if callable(get_emb):
+            emb = get_emb()
+            weight = getattr(emb, "weight", None)
+            if weight is not None and hasattr(weight, "shape"):
+                size = int(weight.shape[0])
+                if size > 0:
+                    return size
+    except Exception:
+        pass
+
+    # Fallback for lightweight/stub models: pick the largest nn.Embedding module.
+    # This is not guaranteed to be the token embedding, but is a good heuristic
+    # when get_input_embeddings/config.vocab_size are unavailable.
+    try:
+        max_embeddings = 0
+        for module in model.modules():
+            if isinstance(module, nn.Embedding):
+                max_embeddings = max(max_embeddings, int(module.num_embeddings))
+        if max_embeddings > 0:
+            return max_embeddings
+    except Exception:
+        pass
+
+    config = getattr(model, "config", None)
+    vocab_size = getattr(config, "vocab_size", None)
+    if isinstance(vocab_size, int) and vocab_size > 0:
+        return vocab_size
+    return None
+
+
+def _resolve_pad_token_id(model: nn.Module, vocab_size: int | None) -> int:
+    """Pick a safe pad token id for sanitizing invalid token IDs."""
+    config = getattr(model, "config", None)
+    pad_token_id = getattr(config, "pad_token_id", None)
+    if isinstance(pad_token_id, int) and pad_token_id >= 0:
+        if vocab_size is None or pad_token_id < vocab_size:
+            return pad_token_id
+    return 0
+
+
+def _sanitize_token_ids_for_model(
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    labels: torch.Tensor | None,
+    *,
+    vocab_size: int,
+    pad_token_id: int,
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+    """Prevent device-side asserts from out-of-range token IDs.
+
+    Out-of-range token IDs can trigger CUDA device-side asserts in embedding and
+    gather kernels, poisoning the CUDA context for the entire process. Instead,
+    mask them out as padding and ignore them in labels.
+    """
+    if vocab_size <= 0:
+        return input_ids, attention_mask, labels
+
+    invalid_inputs = (input_ids < 0) | (input_ids >= vocab_size)
+    if invalid_inputs.any():
+        input_ids = input_ids.masked_fill(invalid_inputs, pad_token_id)
+        if attention_mask is not None:
+            attention_mask = attention_mask.masked_fill(invalid_inputs, 0)
+        if labels is not None:
+            labels = labels.masked_fill(invalid_inputs, -100)
+
+    if labels is not None:
+        invalid_labels = (labels != -100) & ((labels < 0) | (labels >= vocab_size))
+        if invalid_labels.any():
+            labels = labels.masked_fill(invalid_labels, -100)
+
+    return input_ids, attention_mask, labels
+
+
 # ── Perplexity calculation ─────────────────────────────────────────────────
 @torch.no_grad()
 def calculate_perplexity(
@@ -1415,6 +1500,8 @@ compute_perplexity_strict(
     device = _resolve_eval_device(model, device)
 
     model.eval()
+    model_vocab_size = _infer_model_vocab_size(model)
+    pad_token_id = _resolve_pad_token_id(model, model_vocab_size)
     nll_sum = 0.0
     tok_count = 0
 
@@ -1453,6 +1540,15 @@ compute_perplexity_strict(
         else:
             labels = labels.to(device)
 
+        if model_vocab_size is not None:
+            input_ids, attn, labels = _sanitize_token_ids_for_model(
+                input_ids,
+                attn,
+                labels,
+                vocab_size=model_vocab_size,
+                pad_token_id=pad_token_id,
+            )
+
         # Skip if sequence too short
         if input_ids.size(1) < 2:
             continue
@@ -1507,7 +1603,11 @@ compute_perplexity_strict(
             continue
 
         log_probs = shift_logits.log_softmax(dim=-1)  # [B,T-1,V]
-
+        vocab_size = int(shift_logits.size(-1))
+        valid = valid & (shift_labels >= 0) & (shift_labels < vocab_size)
+        if not valid.any():
+            continue
+        tgt = shift_labels.clamp(min=0, max=vocab_size - 1).unsqueeze(-1)  # [B,T-1,1]
         nll = -log_probs.gather(-1, tgt).squeeze(-1)  # [B,T-1]
 
         nll_sum += nll[valid].sum().item()
@@ -1552,6 +1652,8 @@ compute_perplexity(
     device = _resolve_eval_device(model, device)
 
     model.eval()
+    model_vocab_size = _infer_model_vocab_size(model)
+    pad_token_id = _resolve_pad_token_id(model, model_vocab_size)
     nll_sum = 0.0
     tok_count = 0
     batch_count = 0
@@ -1589,6 +1691,15 @@ compute_perplexity(
         else:
             labels = labels.to(device)
 
+        if model_vocab_size is not None:
+            input_ids, attn, labels = _sanitize_token_ids_for_model(
+                input_ids,
+                attn,
+                labels,
+                vocab_size=model_vocab_size,
+                pad_token_id=pad_token_id,
+            )
+
         # Skip if sequence too short
         if input_ids.size(1) < 2:
             continue
@@ -1620,7 +1731,11 @@ compute_perplexity(
 
         # Compute negative log-likelihood
        log_probs = shift_logits.log_softmax(dim=-1)  # [B,T-1,V]
-
+        vocab_size = int(shift_logits.size(-1))
+        valid = valid & (shift_labels >= 0) & (shift_labels < vocab_size)
+        if not valid.any():
+            continue
+        tgt = shift_labels.clamp(min=0, max=vocab_size - 1).unsqueeze(-1)  # [B,T-1,1]
 
         # MPS workaround: gather operation can fail on MPS, use CPU fallback
         if str(device).startswith("mps"):
@@ -1694,6 +1809,8 @@ compute_ppl(
     device = _resolve_eval_device(model, device)
 
     model.eval()
+    model_vocab_size = _infer_model_vocab_size(model)
+    pad_token_id = _resolve_pad_token_id(model, model_vocab_size)
    nll_sum = 0.0
     tok_count = 0
 
@@ -1712,6 +1829,15 @@ compute_ppl(
             torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0).to(device)
         )
 
+        if model_vocab_size is not None:
+            input_ids_tensor, attention_mask_tensor, _ = _sanitize_token_ids_for_model(
+                input_ids_tensor,
+                attention_mask_tensor,
+                labels=None,
+                vocab_size=model_vocab_size,
+                pad_token_id=pad_token_id,
+            )
+
         # Skip sequences that are too short
         if input_ids_tensor.size(1) < 2:
             continue
@@ -1747,7 +1873,11 @@ compute_ppl(
 
         # Compute negative log-likelihood
         log_probs = shift_logits.log_softmax(dim=-1)  # [B,T-1,V]
-
+        vocab_size = int(shift_logits.size(-1))
+        valid = valid & (shift_labels >= 0) & (shift_labels < vocab_size)
+        if not valid.any():
+            continue
+        tgt = shift_labels.clamp(min=0, max=vocab_size - 1).unsqueeze(-1)  # [B,T-1,1]
 
         # Handle MPS device issues with gather
         if str(device).startswith("mps"):