PyPI - spectralquant - Versions diffs - 0.3.0__py3-none-any.whl - Mend

spectralquant 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

spectralquant/__init__.py +75 -0
spectralquant/_water_fill.py +45 -0
spectralquant/api.py +357 -0
spectralquant/calibrate.py +891 -0
spectralquant/calibration_data.py +117 -0
spectralquant/engine.py +1185 -0
spectralquant/integrations/__init__.py +59 -0
spectralquant/integrations/alphafold.py +305 -0
spectralquant/integrations/dynamic_cache.py +373 -0
spectralquant/integrations/esmfold.py +338 -0
spectralquant/integrations/huggingface.py +283 -0
spectralquant/integrations/videomae.py +440 -0
spectralquant/integrations/vit.py +265 -0
spectralquant/kernels/__init__.py +15 -0
spectralquant/kernels/compress_keys.py +128 -0
spectralquant/kernels/compress_values.py +142 -0
spectralquant/kernels/fused_attention.py +124 -0
spectralquant/presets.py +135 -0
spectralquant-0.3.0.dist-info/METADATA +329 -0
spectralquant-0.3.0.dist-info/RECORD +23 -0
spectralquant-0.3.0.dist-info/WHEEL +5 -0
spectralquant-0.3.0.dist-info/licenses/LICENSE +21 -0
spectralquant-0.3.0.dist-info/top_level.txt +1 -0

spectralquant/__init__.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""
+SpectralQuant — Eigenspectral KV Cache Compression for Transformers.
+Achieves up to 6.55x KV cache compression with FP16-equivalent output
+quality on Mistral 7B and Qwen 2.5 7B. Pure PyTorch, no custom CUDA
+kernels, runs anywhere torch runs.
+Quick start (3 lines)::
+    import spectralquant as sq
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3",
+                                                 torch_dtype="float16").cuda()
+    tok   = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
+    engine = sq.SpectralQuant(compression="high")        # 6.55x preset
+    out    = engine.generate(model, tok, "Explain water-filling in one paragraph.")
+    print(out["text"])
+    print(f"{out['stats']['ratio']:.2f}x compression at "
+          f"{out['stats']['tokens_per_second']:.1f} tok/s")
+Compression presets:
+* ``"standard"`` -> 5.95x   (paper baseline, safest)
+* ``"high"``     -> 6.55x   (validated on Mistral & Qwen 7B; default)
+* ``"max"``      -> 6.68x   (edge of the cliff, mild repetition possible)
+Inspect or override::
+    print(sq.describe_presets())                         # full table
+    engine = sq.SpectralQuant(compression="high",
+                              d_eff_variance=0.94)       # power-user override
+The low-level :class:`SpectralQuantEngine` is also exported for users who
+want explicit control over calibration, bit allocation, or the legacy
+attention-level monkey-patch path.
+"""
+# High-level user-facing API
+from spectralquant.api import SpectralQuant
+from spectralquant.presets import (
+    PRESETS,
+    CompressionPreset,
+    describe as describe_presets,
+    resolve as resolve_preset,
+)
+from spectralquant.calibration_data import (
+    CALIBRATION_TEXTS as DEFAULT_CALIBRATION_TEXTS,
+    get_default_calibration_texts,
+)
+# Low-level engine (kept stable for power users + backward compatibility)
+from spectralquant.engine import SpectralQuantEngine, HeadEngine
+from spectralquant.calibrate import EigenspectralCalibrator, HeadCalibrationData
+# Package version string; matches the 0.3.0 distribution this module ships in.
+__version__ = "0.3.0"
+# Public surface of the package: the names exported by
+# ``from spectralquant import *``. Each entry is imported above, either from
+# the high-level api/presets/calibration_data modules or from the low-level
+# engine/calibrate modules.
+__all__ = [
+    # New high-level API
+    "SpectralQuant",
+    "PRESETS",
+    "CompressionPreset",
+    "describe_presets",
+    "resolve_preset",
+    "DEFAULT_CALIBRATION_TEXTS",
+    "get_default_calibration_texts",
+    # Low-level (unchanged)
+    "SpectralQuantEngine",
+    "HeadEngine",
+    "EigenspectralCalibrator",
+    "HeadCalibrationData",
+    "__version__",
+]

spectralquant/_water_fill.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""Greedy water-filling bit allocator — used by engine.py."""
+from typing import List
+import torch
+def water_fill_allocate(
+    eigenvalues: torch.Tensor,
+    d_eff: int,
+    semantic_budget: int,
+    min_bits: int = 1,
+    max_bits: int = 8,
+) -> List[int]:
+    """
+    Greedy water-filling across d_eff semantic dimensions.
+    Each step gives the next bit to whichever dimension has the highest
+    marginal distortion reduction: gain_i = λ_i / 4^b_i.
+    Args:
+        eigenvalues:     Full eigenvalue array (head_dim,), sorted descending.
+        d_eff:           Number of semantic dimensions to allocate across.
+        semantic_budget: Total bits to spend = d_eff × mse_bits_high.
+        min_bits:        Minimum bits per dimension (default 1).
+        max_bits:        Maximum bits per dimension (default 8).
+    Returns:
+        List[int] of length d_eff summing to semantic_budget (or less if capped).
+    """
+    d_eff = min(d_eff, len(eigenvalues))
+    lam   = eigenvalues[:d_eff].float().tolist()
+    bits  = [min_bits] * d_eff
+    spent = d_eff * min_bits
+    for _ in range(max(0, semantic_budget - spent)):
+        gains = [
+            lam[i] / (4.0 ** bits[i]) if bits[i] < max_bits else -1.0
+            for i in range(d_eff)
+        ]
+        best = max(range(d_eff), key=lambda i: gains[i])
+        if gains[best] <= 0.0:
+            break
+        bits[best] += 1
+    return bits

spectralquant/api.py ADDED Viewed

@@ -0,0 +1,357 @@
+"""
+spectralquant.api
+─────────────────
+Clean, opinionated top-level API.
+The :class:`SpectralQuant` class wraps the lower-level
+:class:`SpectralQuantEngine` so most users never need to think about bit
+budgets, water-filling, or eigenvariance shares. They pick a named
+``compression`` preset and call ``.generate(...)``.
+::
+    import spectralquant as sq
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    model = AutoModelForCausalLM.from_pretrained(
+        "mistralai/Mistral-7B-Instruct-v0.3", torch_dtype="float16"
+    ).cuda()
+    tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
+    engine = sq.SpectralQuant(compression="high", device="cuda")
+    out = engine.generate(model, tok, "Explain water-filling in one paragraph.")
+    print(out["text"])
+    print(f"{out['stats']['ratio']:.2f}x compression at "
+          f"{out['stats']['tokens_per_second']:.1f} tok/s")
+Compression presets:
+* ``"standard"`` — paper-grade 5.95x, fully safe.
+* ``"high"``     — 6.55x, validated clean on Mistral and Qwen 7B (default).
+* ``"max"``      — 6.68x, edge of the cliff. Light degradation on long output.
+Power users can override any individual dial with kwargs::
+    engine = sq.SpectralQuant(
+        compression="high",                # base preset
+        d_eff_variance=0.93,               # override one knob
+    )
+"""
+from __future__ import annotations
+import time
+from typing import Any, Dict, List, Optional
+import torch
+from torch import nn
+from spectralquant.calibration_data import get_default_calibration_texts
+from spectralquant.engine import SpectralQuantEngine
+from spectralquant.presets import PRESETS, PresetName, resolve
+def _pick_device() -> str:
+    if torch.cuda.is_available():
+        return "cuda"
+    if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
+        return "mps"
+    return "cpu"
+def _infer_head_dim(model: nn.Module) -> int:
+    """Best-effort head_dim inference from a HuggingFace model."""
+    cfg = getattr(model, "config", None)
+    if cfg is None:
+        return 128
+    if getattr(cfg, "head_dim", None):
+        return int(cfg.head_dim)
+    h = getattr(cfg, "hidden_size", None)
+    n = getattr(cfg, "num_attention_heads", None)
+    if h and n:
+        return int(h // n)
+    return 128
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+class SpectralQuant:
+    """Drop-in KV cache compression for transformer inference.
+    Parameters
+    ----------
+    compression : {"standard", "high", "max"}, default "high"
+        Named preset that selects the bit-allocation profile:
+        * ``"standard"``  -> 5.95x  (paper baseline, safest)
+        * ``"high"``      -> 6.55x  (validated clean on Mistral & Qwen 7B)
+        * ``"max"``       -> 6.68x  (edge of the cliff, may show light
+                                     repetition on long outputs)
+    device : str, optional
+        Torch device string. Defaults to ``"cuda"`` if available, else
+        ``"mps"``, else ``"cpu"``.
+    head_dim : int, optional
+        Per-head dimension. Inferred from the model on first use if not
+        specified. Pass explicitly to skip inference.
+    avg_bits, noise_bits, value_noise_bits, d_eff_variance : optional
+        Power-user overrides for the named preset.
+    Notes
+    -----
+    Calibration is *automatic*: the first time you call ``.generate()`` or
+    ``.compress_prefill(...)``, the engine runs the bundled 64-sentence
+    calibration set on your model. Subsequent calls reuse the calibration.
+    Pass your own corpus to ``.calibrate(...)`` for domain-specific
+    inference (e.g. code or biomedical text).
+    """
+    def __init__(
+        self,
+        *,
+        compression: PresetName = "high",
+        device: Optional[str] = None,
+        head_dim: Optional[int] = None,
+        avg_bits: Optional[int] = None,
+        noise_bits: Optional[int] = None,
+        value_noise_bits: Optional[int] = None,
+        d_eff_variance: Optional[float] = None,
+        use_qjl: bool = False,
+    ) -> None:
+        self._preset = resolve(compression)
+        self._device = device or _pick_device()
+        self._head_dim_user = head_dim
+        # Resolve dials: preset values, optionally overridden by kwargs.
+        self._cfg = {
+            "total_bits":       avg_bits         if avg_bits         is not None else self._preset.avg_bits,
+            "noise_bits":       noise_bits       if noise_bits       is not None else self._preset.noise_bits,
+            "value_noise_bits": value_noise_bits if value_noise_bits is not None else self._preset.value_noise_bits,
+            "d_eff_variance":   d_eff_variance   if d_eff_variance   is not None else self._preset.d_eff_variance,
+            "use_qjl":          use_qjl,
+        }
+        self._engine: Optional[SpectralQuantEngine] = None
+        self._calibrated_for: Optional[int] = None  # id(model) of last calibration
+    # ------------------------------------------------------------------
+    # Properties
+    # ------------------------------------------------------------------
+    @property
+    def preset_name(self) -> str:
+        return self._preset.name
+    @property
+    def expected_ratio(self) -> float:
+        """The headline compression ratio for this preset (e.g. 6.55)."""
+        return self._preset.ratio
+    @property
+    def device(self) -> str:
+        return self._device
+    @property
+    def engine(self) -> SpectralQuantEngine:
+        """The underlying low-level engine. Returns None until first
+        calibration; use ``ensure_engine(model)`` to force creation."""
+        if self._engine is None:
+            raise RuntimeError(
+                "SpectralQuant engine is not yet built. Call .calibrate(model, "
+                "tokenizer, ...) first, or just call .generate(...) which "
+                "auto-calibrates on first use."
+            )
+        return self._engine
+    # ------------------------------------------------------------------
+    # Calibration
+    # ------------------------------------------------------------------
+    def calibrate(
+        self,
+        model: nn.Module,
+        tokenizer: Any,
+        calibration_texts: Optional[List[str]] = None,
+        n_samples: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """Run eigenspectral calibration. Takes ~3-5 sec on H200 / 7B model.
+        Parameters
+        ----------
+        model, tokenizer : HuggingFace model and matching tokenizer.
+        calibration_texts : list of str, optional
+            Diverse texts representative of your inference workload. If
+            ``None`` (default), uses the bundled 64-sentence default corpus.
+        n_samples : int, optional
+            How many texts to actually pass through the model. Defaults to
+            ``len(calibration_texts)``.
+        """
+        head_dim = self._head_dim_user or _infer_head_dim(model)
+        if self._engine is None:
+            self._engine = SpectralQuantEngine(
+                head_dim=head_dim,
+                use_water_fill=True,
+                device=self._device,
+                **self._cfg,
+            )
+        texts = calibration_texts or get_default_calibration_texts()
+        n = n_samples or len(texts)
+        summary = self._engine.calibrate(model, tokenizer, texts, n_samples=n)
+        self._calibrated_for = id(model)
+        return summary
+    def _ensure_calibrated(self, model: nn.Module, tokenizer: Any) -> None:
+        if self._engine is not None and self._calibrated_for == id(model):
+            return
+        if self._engine is not None and self._calibrated_for is not None:
+            print(
+                "[SpectralQuant] Re-calibrating for new model "
+                "(previous calibration was on a different model id)."
+            )
+        else:
+            print(
+                f"[SpectralQuant] Auto-calibrating with bundled "
+                f"{len(get_default_calibration_texts())}-sentence corpus. "
+                "Pass your own to .calibrate(...) for domain-specific use."
+            )
+        self.calibrate(model, tokenizer)
+    # ------------------------------------------------------------------
+    # Generation
+    # ------------------------------------------------------------------
+    def generate(
+        self,
+        model: nn.Module,
+        tokenizer: Any,
+        prompt: str,
+        *,
+        max_new_tokens: int = 128,
+        do_sample: bool = False,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        repetition_penalty: float = 1.0,
+        add_special_tokens: bool = True,
+    ) -> Dict[str, Any]:
+        """Run end-to-end compressed generation.
+        Returns a dict with keys::
+            text                  : str, the generated text (no prompt)
+            tokens                : int, number of new tokens generated
+            stats : {
+                ratio             : float, prefix-cache compression
+                tokens_per_second : float, decode throughput
+                fp16_bytes        : int,
+                compressed_bytes  : int,
+                preset            : str, e.g. "high"
+            }
+        """
+        self._ensure_calibrated(model, tokenizer)
+        result = self._engine.generate_compressed(
+            model, tokenizer, prompt,
+            max_new_tokens=max_new_tokens,
+            do_sample=do_sample,
+            temperature=temperature,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            device=self._device,
+            add_special_tokens=add_special_tokens,
+        )
+        # Re-shape the engine return into the public schema.
+        stats = dict(result.get("stats", {}))
+        stats["preset"] = self._preset.name
+        return {
+            "text":   result.get("text", ""),
+            "tokens": stats.get("new_tokens", 0),
+            "stats":  stats,
+        }
+    # ------------------------------------------------------------------
+    # Cache compression (for users who want to manage decoding themselves)
+    # ------------------------------------------------------------------
+    def compress_prefill(
+        self,
+        model: nn.Module,
+        tokenizer: Any,
+        prompt: str,
+        *,
+        add_special_tokens: bool = True,
+    ) -> Dict[str, Any]:
+        """Run an FP16 prefill on ``prompt``, compress the resulting KV
+        cache, return a fresh ``DynamicCache`` ready to feed into
+        ``model.generate(past_key_values=cache, ...)``.
+        Returns::
+            cache         : DynamicCache, the compressed-then-reconstructed cache
+            input_ids     : LongTensor, the tokenised prompt
+            stats         : {ratio, fp16_bytes, compressed_bytes, ...}
+            prompt_length : int
+        Useful when you want to keep the compressed cache around (e.g. to
+        prepend it to many different completions of the same long prefix).
+        """
+        self._ensure_calibrated(model, tokenizer)
+        return self._engine.prefill_compress(
+            model, tokenizer, prompt,
+            device=self._device, add_special_tokens=add_special_tokens,
+        )
+    # ------------------------------------------------------------------
+    # Stats
+    # ------------------------------------------------------------------
+    def compression_stats(self) -> Dict[str, Any]:
+        """Return the engine's static byte budget for the chosen preset.
+        Available after calibration. Includes ``sq_ratio`` (the engine's
+        theoretical headline number), ``sq_key_bytes``, ``sq_val_bytes``,
+        ``d_eff``, and the comparison vs TurboQuant.
+        """
+        if self._engine is None:
+            return {
+                "preset":         self._preset.name,
+                "expected_ratio": self._preset.ratio,
+                "calibrated":     False,
+            }
+        stats = self._engine.compression_stats(None)
+        stats["preset"] = self._preset.name
+        stats["expected_ratio"] = self._preset.ratio
+        return stats
+    # ------------------------------------------------------------------
+    # Persistence
+    # ------------------------------------------------------------------
+    def save_calibration(self, path: str) -> None:
+        """Persist the per-head calibration to disk for instant reuse."""
+        if self._engine is None:
+            raise RuntimeError("Nothing to save. Call .calibrate(...) first.")
+        self._engine.save_calibration(path)
+    def load_calibration(self, path: str, head_dim: int = 128) -> None:
+        """Load a previously-saved calibration into a fresh engine."""
+        self._engine = SpectralQuantEngine(
+            head_dim=head_dim,
+            use_water_fill=True,
+            device=self._device,
+            **self._cfg,
+        )
+        self._engine.load_calibration(path)
+        self._calibrated_for = -1  # marker: calibrated, but to no specific model
+    # ------------------------------------------------------------------
+    # Repr
+    # ------------------------------------------------------------------
+    def __repr__(self) -> str:
+        return (
+            f"SpectralQuant(compression={self._preset.name!r}, "
+            f"expected_ratio={self._preset.ratio:.2f}x, device={self._device!r})"
+        )