spectralquant 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,75 @@
1
+ """
2
+ SpectralQuant — Eigenspectral KV Cache Compression for Transformers.
3
+
4
+ Achieves up to 6.55x KV cache compression with FP16-equivalent output
5
+ quality on Mistral 7B and Qwen 2.5 7B. Pure PyTorch, no custom CUDA
6
+ kernels, runs anywhere torch runs.
7
+
8
+ Quick start (3 lines)::
9
+
10
+ import spectralquant as sq
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer
12
+
13
+ model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3",
14
+ torch_dtype="float16").cuda()
15
+ tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
16
+
17
+ engine = sq.SpectralQuant(compression="high") # 6.55x preset
18
+ out = engine.generate(model, tok, "Explain water-filling in one paragraph.")
19
+
20
+ print(out["text"])
21
+ print(f"{out['stats']['ratio']:.2f}x compression at "
22
+ f"{out['stats']['tokens_per_second']:.1f} tok/s")
23
+
24
+ Compression presets:
25
+
26
+ * ``"standard"`` -> 5.95x (paper baseline, safest)
27
+ * ``"high"`` -> 6.55x (validated on Mistral & Qwen 7B; default)
28
+ * ``"max"`` -> 6.68x (edge of the cliff, mild repetition possible)
29
+
30
+ Inspect or override::
31
+
32
+ print(sq.describe_presets()) # full table
33
+ engine = sq.SpectralQuant(compression="high",
34
+ d_eff_variance=0.94) # power-user override
35
+
36
+ The low-level :class:`SpectralQuantEngine` is also exported for users who
37
+ want explicit control over calibration, bit allocation, or the legacy
38
+ attention-level monkey-patch path.
39
+ """
40
+
41
+ # High-level user-facing API
42
+ from spectralquant.api import SpectralQuant
43
+ from spectralquant.presets import (
44
+ PRESETS,
45
+ CompressionPreset,
46
+ describe as describe_presets,
47
+ resolve as resolve_preset,
48
+ )
49
+ from spectralquant.calibration_data import (
50
+ CALIBRATION_TEXTS as DEFAULT_CALIBRATION_TEXTS,
51
+ get_default_calibration_texts,
52
+ )
53
+
54
+ # Low-level engine (kept stable for power users + backward compatibility)
55
+ from spectralquant.engine import SpectralQuantEngine, HeadEngine
56
+ from spectralquant.calibrate import EigenspectralCalibrator, HeadCalibrationData
57
+
58
+ __version__ = "0.3.0"
59
+
60
+ __all__ = [
61
+ # New high-level API
62
+ "SpectralQuant",
63
+ "PRESETS",
64
+ "CompressionPreset",
65
+ "describe_presets",
66
+ "resolve_preset",
67
+ "DEFAULT_CALIBRATION_TEXTS",
68
+ "get_default_calibration_texts",
69
+ # Low-level (unchanged)
70
+ "SpectralQuantEngine",
71
+ "HeadEngine",
72
+ "EigenspectralCalibrator",
73
+ "HeadCalibrationData",
74
+ "__version__",
75
+ ]
@@ -0,0 +1,45 @@
1
+ """Greedy water-filling bit allocator — used by engine.py."""
2
+
3
+ from typing import List
4
+ import torch
5
+
6
+
7
def water_fill_allocate(
    eigenvalues: torch.Tensor,
    d_eff: int,
    semantic_budget: int,
    min_bits: int = 1,
    max_bits: int = 8,
) -> List[int]:
    """
    Greedy water-filling across d_eff semantic dimensions.

    Every dimension starts at ``min_bits``. Each subsequent step gives one
    more bit to whichever dimension has the highest marginal distortion
    reduction: gain_i = λ_i / 4^b_i. Ties go to the lowest index.

    Args:
        eigenvalues:     Full eigenvalue array (head_dim,), sorted descending.
        d_eff:           Number of semantic dimensions to allocate across.
                         Clamped to ``[0, len(eigenvalues)]``.
        semantic_budget: Total bits to spend = d_eff × mse_bits_high.
        min_bits:        Minimum bits per dimension (default 1).
        max_bits:        Maximum bits per dimension (default 8).

    Returns:
        List[int] of length d_eff (after clamping). The sum equals
        ``semantic_budget`` unless allocation stops early (every dimension
        is at ``max_bits`` or no dimension has a positive gain left), in
        which case it is smaller. If ``semantic_budget`` is below
        ``d_eff * min_bits`` the floor wins and the sum exceeds the budget.
        An empty list is returned when there is nothing to allocate across.
    """
    d_eff = max(0, min(d_eff, len(eigenvalues)))
    if d_eff == 0:
        # Nothing to allocate across; the greedy loop below would otherwise
        # call max() on an empty range and raise ValueError.
        return []

    lam = eigenvalues[:d_eff].float().tolist()
    bits = [min_bits] * d_eff

    def marginal_gain(i: int) -> float:
        # Saturated dimensions get a negative gain so they are never picked.
        if bits[i] >= max_bits:
            return -1.0
        return lam[i] / (4.0 ** bits[i])

    # Only the winning dimension's gain changes per step, so the table is
    # built once and patched in place instead of being rebuilt every step.
    gains = [marginal_gain(i) for i in range(d_eff)]
    remaining = max(0, semantic_budget - d_eff * min_bits)

    for _ in range(remaining):
        best = max(range(d_eff), key=gains.__getitem__)
        if gains[best] <= 0.0:
            # Everything is saturated or carries no variance: stop early.
            break
        bits[best] += 1
        gains[best] = marginal_gain(best)

    return bits
spectralquant/api.py ADDED
@@ -0,0 +1,357 @@
1
+ """
2
+ spectralquant.api
3
+ ─────────────────
4
+ Clean, opinionated top-level API.
5
+
6
+ The :class:`SpectralQuant` class wraps the lower-level
7
+ :class:`SpectralQuantEngine` so most users never need to think about bit
8
+ budgets, water-filling, or eigenvariance shares. They pick a named
9
+ ``compression`` preset and call ``.generate(...)``.
10
+
11
+ ::
12
+
13
+ import spectralquant as sq
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer
15
+
16
+ model = AutoModelForCausalLM.from_pretrained(
17
+ "mistralai/Mistral-7B-Instruct-v0.3", torch_dtype="float16"
18
+ ).cuda()
19
+ tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
20
+
21
+ engine = sq.SpectralQuant(compression="high", device="cuda")
22
+ out = engine.generate(model, tok, "Explain water-filling in one paragraph.")
23
+
24
+ print(out["text"])
25
+ print(f"{out['stats']['ratio']:.2f}x compression at "
26
+ f"{out['stats']['tokens_per_second']:.1f} tok/s")
27
+
28
+ Compression presets:
29
+
30
+ * ``"standard"`` — paper-grade 5.95x, fully safe.
31
+ * ``"high"`` — 6.55x, validated clean on Mistral and Qwen 7B (default).
32
+ * ``"max"`` — 6.68x, edge of the cliff. Light degradation on long output.
33
+
34
+ Power users can override any individual dial with kwargs::
35
+
36
+ engine = sq.SpectralQuant(
37
+ compression="high", # base preset
38
+ d_eff_variance=0.93, # override one knob
39
+ )
40
+ """
41
+ from __future__ import annotations
42
+
43
+ import time
44
+ from typing import Any, Dict, List, Optional
45
+
46
+ import torch
47
+ from torch import nn
48
+
49
+ from spectralquant.calibration_data import get_default_calibration_texts
50
+ from spectralquant.engine import SpectralQuantEngine
51
+ from spectralquant.presets import PRESETS, PresetName, resolve
52
+
53
+
54
def _pick_device() -> str:
    """Choose a torch device string, preferring "cuda", then "mps", then "cpu"."""
    if torch.cuda.is_available():
        return "cuda"

    # Older torch builds have no ``torch.backends.mps`` attribute at all.
    mps_backend = getattr(torch.backends, "mps", None)
    mps_usable = mps_backend and mps_backend.is_available()
    return "mps" if mps_usable else "cpu"
60
+
61
+
62
def _infer_head_dim(model: nn.Module) -> int:
    """Best-effort per-head dimension lookup from a model's ``config``.

    Resolution order: a truthy ``config.head_dim``; otherwise
    ``config.hidden_size // config.num_attention_heads`` when both are
    truthy; otherwise 128 (also used when the model has no ``config``).
    """
    fallback = 128

    config = getattr(model, "config", None)
    if config is None:
        return fallback

    explicit = getattr(config, "head_dim", None)
    if explicit:
        return int(explicit)

    hidden_size = getattr(config, "hidden_size", None)
    num_heads = getattr(config, "num_attention_heads", None)
    if not (hidden_size and num_heads):
        return fallback
    return int(hidden_size // num_heads)
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Public API
78
+ # ---------------------------------------------------------------------------
79
+
80
class SpectralQuant:
    """Drop-in KV cache compression for transformer inference.

    Parameters
    ----------
    compression : {"standard", "high", "max"}, default "high"
        Named preset that selects the bit-allocation profile:

        * ``"standard"`` -> 5.95x  (paper baseline, safest)
        * ``"high"``     -> 6.55x  (validated clean on Mistral & Qwen 7B)
        * ``"max"``      -> 6.68x  (edge of the cliff, may show light
          repetition on long outputs)
    device : str, optional
        Torch device string. Defaults to ``"cuda"`` if available, else
        ``"mps"``, else ``"cpu"``.
    head_dim : int, optional
        Per-head dimension. Inferred from the model on first use if not
        specified. Pass explicitly to skip inference.
    avg_bits, noise_bits, value_noise_bits, d_eff_variance : optional
        Power-user overrides for the named preset. ``avg_bits`` is passed
        to the low-level engine as ``total_bits``.
    use_qjl : bool, default False
        Forwarded unchanged to the low-level engine.

    Notes
    -----
    Calibration is *automatic*: the first time you call ``.generate()`` or
    ``.compress_prefill(...)``, the engine runs the bundled 64-sentence
    calibration set on your model. Subsequent calls reuse the calibration.
    Pass your own corpus to ``.calibrate(...)`` for domain-specific
    inference (e.g. code or biomedical text). A calibration restored with
    ``.load_calibration(...)`` is reused as-is for whichever model is
    passed afterwards.
    """

    # Value stored in ``_calibrated_for`` when the calibration came from disk
    # rather than from a specific in-memory model. ``id()`` is never negative,
    # so this cannot collide with a real model id.
    _ANY_MODEL = -1

    def __init__(
        self,
        *,
        compression: PresetName = "high",
        device: Optional[str] = None,
        head_dim: Optional[int] = None,
        avg_bits: Optional[int] = None,
        noise_bits: Optional[int] = None,
        value_noise_bits: Optional[int] = None,
        d_eff_variance: Optional[float] = None,
        use_qjl: bool = False,
    ) -> None:
        self._preset = resolve(compression)
        self._device = device or _pick_device()
        self._head_dim_user = head_dim

        # Resolve dials: preset values, optionally overridden by kwargs.
        # ``is not None`` (not truthiness) so an explicit 0 still overrides.
        self._cfg = {
            "total_bits": avg_bits if avg_bits is not None else self._preset.avg_bits,
            "noise_bits": noise_bits if noise_bits is not None else self._preset.noise_bits,
            "value_noise_bits": (
                value_noise_bits
                if value_noise_bits is not None
                else self._preset.value_noise_bits
            ),
            "d_eff_variance": (
                d_eff_variance
                if d_eff_variance is not None
                else self._preset.d_eff_variance
            ),
            "use_qjl": use_qjl,
        }

        self._engine: Optional[SpectralQuantEngine] = None
        # head_dim the current engine was constructed with (None: no engine).
        self._engine_head_dim: Optional[int] = None
        # id(model) of the last calibration, ``_ANY_MODEL`` after
        # load_calibration, or None when not calibrated.
        self._calibrated_for: Optional[int] = None

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def preset_name(self) -> str:
        """Name of the resolved compression preset."""
        return self._preset.name

    @property
    def expected_ratio(self) -> float:
        """The headline compression ratio for this preset (e.g. 6.55)."""
        return self._preset.ratio

    @property
    def device(self) -> str:
        """Torch device string this instance passes to the engine."""
        return self._device

    @property
    def engine(self) -> SpectralQuantEngine:
        """The underlying low-level engine.

        Raises
        ------
        RuntimeError
            If no engine has been built yet, i.e. before the first
            ``.calibrate(...)``, ``.generate(...)``,
            ``.compress_prefill(...)`` or ``.load_calibration(...)`` call.
        """
        if self._engine is None:
            raise RuntimeError(
                "SpectralQuant engine is not yet built. Call .calibrate(model, "
                "tokenizer, ...) first, or just call .generate(...) which "
                "auto-calibrates on first use."
            )
        return self._engine

    # ------------------------------------------------------------------
    # Calibration
    # ------------------------------------------------------------------

    def _build_engine(self, head_dim: int) -> SpectralQuantEngine:
        """Create and install a fresh low-level engine for ``head_dim``."""
        self._engine = SpectralQuantEngine(
            head_dim=head_dim,
            use_water_fill=True,
            device=self._device,
            **self._cfg,
        )
        self._engine_head_dim = head_dim
        return self._engine

    def calibrate(
        self,
        model: nn.Module,
        tokenizer: Any,
        calibration_texts: Optional[List[str]] = None,
        n_samples: Optional[int] = None,
    ) -> Dict[str, Any]:
        """Run eigenspectral calibration. Takes ~3-5 sec on H200 / 7B model.

        Parameters
        ----------
        model, tokenizer : HuggingFace model and matching tokenizer.
        calibration_texts : list of str, optional
            Diverse texts representative of your inference workload. If
            ``None`` or empty, uses the bundled default corpus.
        n_samples : int, optional
            How many texts to actually pass through the model. Defaults to
            ``len(calibration_texts)`` (also used when 0 is passed).

        Returns
        -------
        dict
            Whatever summary the low-level engine's ``calibrate`` returns.
        """
        head_dim = self._head_dim_user or _infer_head_dim(model)
        # Reuse the existing engine unless it was built for another head_dim
        # (e.g. re-calibrating on a different model architecture).
        if self._engine is None or self._engine_head_dim != head_dim:
            self._build_engine(head_dim)

        texts = calibration_texts or get_default_calibration_texts()
        n = n_samples or len(texts)
        summary = self._engine.calibrate(model, tokenizer, texts, n_samples=n)
        self._calibrated_for = id(model)
        return summary

    def _ensure_calibrated(self, model: nn.Module, tokenizer: Any) -> None:
        """Calibrate on ``model`` unless a usable calibration already exists.

        A calibration is usable when it was computed on this exact model
        object, or when it was restored via ``load_calibration`` (which is
        not tied to any particular model object).
        """
        if self._engine is not None and self._calibrated_for in (
            id(model),
            self._ANY_MODEL,
        ):
            return
        if self._engine is not None and self._calibrated_for is not None:
            print(
                "[SpectralQuant] Re-calibrating for new model "
                "(previous calibration was on a different model id)."
            )
        else:
            print(
                f"[SpectralQuant] Auto-calibrating with bundled "
                f"{len(get_default_calibration_texts())}-sentence corpus. "
                "Pass your own to .calibrate(...) for domain-specific use."
            )
        self.calibrate(model, tokenizer)

    # ------------------------------------------------------------------
    # Generation
    # ------------------------------------------------------------------

    def generate(
        self,
        model: nn.Module,
        tokenizer: Any,
        prompt: str,
        *,
        max_new_tokens: int = 128,
        do_sample: bool = False,
        temperature: float = 1.0,
        top_p: float = 1.0,
        repetition_penalty: float = 1.0,
        add_special_tokens: bool = True,
    ) -> Dict[str, Any]:
        """Run end-to-end compressed generation.

        Auto-calibrates first if no usable calibration exists.

        Returns a dict with keys::

            text   : str, the generated text (no prompt)
            tokens : int, number of new tokens generated
            stats  : {
                ratio             : float, prefix-cache compression
                tokens_per_second : float, decode throughput
                fp16_bytes        : int,
                compressed_bytes  : int,
                preset            : str, e.g. "high"
            }
        """
        self._ensure_calibrated(model, tokenizer)

        result = self._engine.generate_compressed(
            model, tokenizer, prompt,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            device=self._device,
            add_special_tokens=add_special_tokens,
        )

        # Re-shape the engine return into the public schema. The stats dict
        # is copied so the engine's own result is not mutated.
        stats = dict(result.get("stats", {}))
        stats["preset"] = self._preset.name
        return {
            "text": result.get("text", ""),
            "tokens": stats.get("new_tokens", 0),
            "stats": stats,
        }

    # ------------------------------------------------------------------
    # Cache compression (for users who want to manage decoding themselves)
    # ------------------------------------------------------------------

    def compress_prefill(
        self,
        model: nn.Module,
        tokenizer: Any,
        prompt: str,
        *,
        add_special_tokens: bool = True,
    ) -> Dict[str, Any]:
        """Run an FP16 prefill on ``prompt``, compress the resulting KV
        cache, return a fresh ``DynamicCache`` ready to feed into
        ``model.generate(past_key_values=cache, ...)``.

        Auto-calibrates first if no usable calibration exists.

        Returns::

            cache         : DynamicCache, the compressed-then-reconstructed cache
            input_ids     : LongTensor, the tokenised prompt
            stats         : {ratio, fp16_bytes, compressed_bytes, ...}
            prompt_length : int

        Useful when you want to keep the compressed cache around (e.g. to
        prepend it to many different completions of the same long prefix).
        """
        self._ensure_calibrated(model, tokenizer)
        return self._engine.prefill_compress(
            model, tokenizer, prompt,
            device=self._device, add_special_tokens=add_special_tokens,
        )

    # ------------------------------------------------------------------
    # Stats
    # ------------------------------------------------------------------

    def compression_stats(self) -> Dict[str, Any]:
        """Return the engine's static byte budget for the chosen preset.

        Before any engine exists, returns only ``preset``,
        ``expected_ratio`` and ``calibrated=False``. Afterwards returns the
        engine's own stats dict with ``preset`` and ``expected_ratio``
        added; per the original documentation that includes ``sq_ratio``
        (the engine's theoretical headline number), ``sq_key_bytes``,
        ``sq_val_bytes``, ``d_eff``, and the comparison vs TurboQuant.
        """
        if self._engine is None:
            return {
                "preset": self._preset.name,
                "expected_ratio": self._preset.ratio,
                "calibrated": False,
            }
        stats = self._engine.compression_stats(None)
        stats["preset"] = self._preset.name
        stats["expected_ratio"] = self._preset.ratio
        return stats

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save_calibration(self, path: str) -> None:
        """Persist the per-head calibration to disk for instant reuse.

        Raises
        ------
        RuntimeError
            If no engine has been built yet.
        """
        if self._engine is None:
            raise RuntimeError("Nothing to save. Call .calibrate(...) first.")
        self._engine.save_calibration(path)

    def load_calibration(self, path: str, head_dim: int = 128) -> None:
        """Load a previously-saved calibration into a fresh engine.

        Any existing engine is replaced. The loaded calibration is then
        reused by ``.generate(...)`` / ``.compress_prefill(...)`` for any
        model, until ``.calibrate(...)`` is called explicitly.
        """
        self._build_engine(head_dim)
        self._engine.load_calibration(path)
        # Calibrated, but not bound to a specific model object.
        self._calibrated_for = self._ANY_MODEL

    # ------------------------------------------------------------------
    # Repr
    # ------------------------------------------------------------------

    def __repr__(self) -> str:
        return (
            f"SpectralQuant(compression={self._preset.name!r}, "
            f"expected_ratio={self._preset.ratio:.2f}x, device={self._device!r})"
        )