spectralquant 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. spectralquant-0.3.0/LICENSE +21 -0
  2. spectralquant-0.3.0/MANIFEST.in +27 -0
  3. spectralquant-0.3.0/PKG-INFO +329 -0
  4. spectralquant-0.3.0/README.md +266 -0
  5. spectralquant-0.3.0/assets/spectralquant_banner.png +0 -0
  6. spectralquant-0.3.0/examples/README.md +76 -0
  7. spectralquant-0.3.0/examples/cache_compress_demo.py +321 -0
  8. spectralquant-0.3.0/examples/depth_anything_v2.py +209 -0
  9. spectralquant-0.3.0/examples/drop_in_any_model.py +332 -0
  10. spectralquant-0.3.0/examples/esmfold_protein.py +196 -0
  11. spectralquant-0.3.0/examples/monkey_patch_demo.py +239 -0
  12. spectralquant-0.3.0/examples/quickstart.py +118 -0
  13. spectralquant-0.3.0/examples/quickstart_llm.py +136 -0
  14. spectralquant-0.3.0/examples/run_perplexity.py +268 -0
  15. spectralquant-0.3.0/examples/sweep_compression.py +461 -0
  16. spectralquant-0.3.0/examples/videomae_kinetics.py +187 -0
  17. spectralquant-0.3.0/examples/vit_large_imagenet.py +156 -0
  18. spectralquant-0.3.0/notebooks/spectralquant_demo.ipynb +404 -0
  19. spectralquant-0.3.0/pyproject.toml +77 -0
  20. spectralquant-0.3.0/setup.cfg +4 -0
  21. spectralquant-0.3.0/src/spectralquant/__init__.py +75 -0
  22. spectralquant-0.3.0/src/spectralquant/_water_fill.py +45 -0
  23. spectralquant-0.3.0/src/spectralquant/api.py +357 -0
  24. spectralquant-0.3.0/src/spectralquant/calibrate.py +891 -0
  25. spectralquant-0.3.0/src/spectralquant/calibration_data.py +117 -0
  26. spectralquant-0.3.0/src/spectralquant/engine.py +1185 -0
  27. spectralquant-0.3.0/src/spectralquant/integrations/__init__.py +59 -0
  28. spectralquant-0.3.0/src/spectralquant/integrations/alphafold.py +305 -0
  29. spectralquant-0.3.0/src/spectralquant/integrations/dynamic_cache.py +373 -0
  30. spectralquant-0.3.0/src/spectralquant/integrations/esmfold.py +338 -0
  31. spectralquant-0.3.0/src/spectralquant/integrations/huggingface.py +283 -0
  32. spectralquant-0.3.0/src/spectralquant/integrations/videomae.py +440 -0
  33. spectralquant-0.3.0/src/spectralquant/integrations/vit.py +265 -0
  34. spectralquant-0.3.0/src/spectralquant/kernels/__init__.py +15 -0
  35. spectralquant-0.3.0/src/spectralquant/kernels/compress_keys.py +128 -0
  36. spectralquant-0.3.0/src/spectralquant/kernels/compress_values.py +142 -0
  37. spectralquant-0.3.0/src/spectralquant/kernels/fused_attention.py +124 -0
  38. spectralquant-0.3.0/src/spectralquant/presets.py +135 -0
  39. spectralquant-0.3.0/src/spectralquant.egg-info/SOURCES.txt +36 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anirudh Bharadwaj Vangara, Ashwin Gopinath
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,27 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+
5
+ graft src
6
+ graft examples
7
+ graft notebooks
8
+ graft assets
9
+
10
+ global-exclude __pycache__
11
+ global-exclude *.py[cod]
12
+ global-exclude *.so
13
+ global-exclude *.egg-info
14
+ global-exclude .DS_Store
15
+ global-exclude .ipynb_checkpoints
16
+ global-exclude .pytest_cache
17
+ global-exclude results
18
+ global-exclude *.mp4
19
+ global-exclude *.mov
20
+ global-exclude *.avi
21
+ global-exclude *.gif
22
+
23
+ prune **/__pycache__
24
+ prune **/.ipynb_checkpoints
25
+ prune **/*.egg-info
26
+ prune **/.pytest_cache
27
+ prune **/results
@@ -0,0 +1,329 @@
1
+ Metadata-Version: 2.4
2
+ Name: spectralquant
3
+ Version: 0.3.0
4
+ Summary: Eigenspectral KV cache compression for transformer inference. Up to 6.55x compression with FP16-equivalent quality, drop-in for HuggingFace LLMs and vision transformers.
5
+ Author-email: Anirudh Bharadwaj Vangara <anirudh@sentra.app>, Ashwin Gopinath <ashwin@sentra.app>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Dynamis-Labs/spectralquant
8
+ Project-URL: Repository, https://github.com/Dynamis-Labs/spectralquant
9
+ Project-URL: Documentation, https://github.com/Dynamis-Labs/spectralquant#readme
10
+ Project-URL: Issues, https://github.com/Dynamis-Labs/spectralquant/issues
11
+ Keywords: kv-cache,compression,quantization,llm,attention,transformer,inference,spectral,eigenspectral,water-filling,huggingface,vision-transformer
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: torch>=2.2.0
26
+ Requires-Dist: numpy>=1.24
27
+ Requires-Dist: scipy>=1.11
28
+ Requires-Dist: tqdm>=4.65
29
+ Provides-Extra: hf
30
+ Requires-Dist: transformers>=4.40.0; extra == "hf"
31
+ Requires-Dist: accelerate>=0.27.0; extra == "hf"
32
+ Provides-Extra: vit
33
+ Requires-Dist: transformers>=4.40.0; extra == "vit"
34
+ Requires-Dist: Pillow>=10.0; extra == "vit"
35
+ Provides-Extra: alphafold
36
+ Requires-Dist: transformers>=4.40.0; extra == "alphafold"
37
+ Provides-Extra: esmfold
38
+ Requires-Dist: transformers>=4.40.0; extra == "esmfold"
39
+ Provides-Extra: videomae
40
+ Requires-Dist: transformers>=4.40.0; extra == "videomae"
41
+ Requires-Dist: Pillow>=10.0; extra == "videomae"
42
+ Provides-Extra: video
43
+ Requires-Dist: transformers>=4.40.0; extra == "video"
44
+ Requires-Dist: Pillow>=10.0; extra == "video"
45
+ Requires-Dist: av>=10.0.0; extra == "video"
46
+ Provides-Extra: examples
47
+ Requires-Dist: transformers>=4.40.0; extra == "examples"
48
+ Requires-Dist: accelerate>=0.27.0; extra == "examples"
49
+ Requires-Dist: datasets>=2.14; extra == "examples"
50
+ Requires-Dist: Pillow>=10.0; extra == "examples"
51
+ Requires-Dist: requests>=2.28; extra == "examples"
52
+ Requires-Dist: av>=10.0.0; extra == "examples"
53
+ Requires-Dist: numpy>=1.24; extra == "examples"
54
+ Provides-Extra: dev
55
+ Requires-Dist: pytest; extra == "dev"
56
+ Requires-Dist: pytest-cov; extra == "dev"
57
+ Requires-Dist: build; extra == "dev"
58
+ Requires-Dist: twine; extra == "dev"
59
+ Requires-Dist: ruff; extra == "dev"
60
+ Provides-Extra: all
61
+ Requires-Dist: spectralquant[alphafold,dev,esmfold,examples,hf,videomae,vit]; extra == "all"
62
+ Dynamic: license-file
63
+
64
+ <p align="center">
65
+ <img src="https://raw.githubusercontent.com/Dynamis-Labs/spectralquant/main/assets/spectralquant_banner.png" alt="SpectralQuant" width="100%">
66
+ </p>
67
+
68
+ # SpectralQuant
69
+
70
+ Eigenspectral KV cache compression for transformer inference. Up to 6.55x
71
+ compression of the KV cache with FP16-equivalent output quality.
72
+
73
+ ```
74
+ pip install spectralquant
75
+ ```
76
+
77
+ ## What it does
78
+
79
+ Modern LLM inference is bottlenecked by the size of the KV cache. The cache
80
+ grows linearly with sequence length and consumes more memory than the model
81
+ weights themselves at long context. SpectralQuant compresses that cache by
82
+ exploiting the fact that, after a per-head spectral rotation, only a small
83
+ number of dimensions actually carry information.
84
+
85
+ A short calibration step measures the eigenstructure of each attention head.
86
+ Each head's keys and values are then split into a high-variance "semantic"
87
+ band and a low-variance "tail" band. The semantic band gets a generous bit
88
+ budget; the tail gets one or two bits. Total cache size shrinks by up to 6.55x with
89
+ output quality indistinguishable from FP16.
90
+
91
+ The package ships pure-PyTorch kernels and HuggingFace integrations. There
92
+ are no custom CUDA dependencies. It runs anywhere torch runs.
93
+
94
+ ## Quickstart
95
+
96
+ ```python
97
+ import torch
98
+ import spectralquant as sq
99
+ from transformers import AutoModelForCausalLM, AutoTokenizer
100
+
101
+ model = AutoModelForCausalLM.from_pretrained(
102
+ "mistralai/Mistral-7B-Instruct-v0.3",
103
+ torch_dtype=torch.float16,
104
+ device_map="auto",
105
+ )
106
+ tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
107
+
108
+ engine = sq.SpectralQuant(compression="high") # 6.55x preset
109
+
110
+ out = engine.generate(
111
+ model, tok,
112
+ "Explain water-filling bit allocation in two sentences.",
113
+ max_new_tokens=120,
114
+ )
115
+
116
+ print(out["text"])
117
+ print(f"{out['stats']['ratio']:.2f}x compression, "
118
+ f"{out['stats']['tokens_per_second']:.1f} tok/s")
119
+ ```
120
+
121
+ The first call to `engine.generate(...)` runs a one-time calibration with a
122
+ bundled 64-sentence corpus. Subsequent calls reuse it. You can also pass your
123
+ own domain-specific corpus.
124
+
125
+ ## Compression presets
126
+
127
+ ```python
128
+ print(sq.describe_presets())
129
+ ```
130
+
131
+ | preset | ratio | risk | notes |
132
+ |------------|--------|------------|---------------------------------------------------|
133
+ | `standard` | 5.95x | safe | Paper baseline. Production default. |
134
+ | `high` | 6.55x | safe | Validated on Mistral 7B and Qwen 2.5 7B. |
135
+ | `max` | 6.68x | edge | First paragraph clean. Light repetition possible. |
136
+
137
+ You can also override individual dials when you need them:
138
+
139
+ ```python
140
+ engine = sq.SpectralQuant(
141
+ compression="high",
142
+ d_eff_variance=0.93, # override one knob
143
+ )
144
+ ```
145
+
146
+ The dials are `avg_bits`, `noise_bits`, `value_noise_bits`, and
147
+ `d_eff_variance`. Anything unset falls back to the named preset.
148
+
149
+ ## Supported models
150
+
151
+ Tested and verified (rows marked `expected` should work but are not yet verified):
152
+
153
+ | family | example | works |
154
+ |-------------------|-----------------------------------------------|-----------|
155
+ | Mistral | `mistralai/Mistral-7B-Instruct-v0.3` | yes |
156
+ | Qwen 2.5 | `Qwen/Qwen2.5-7B-Instruct` | yes |
157
+ | Llama 3.x | `NousResearch/Meta-Llama-3.1-8B-Instruct` | yes |
158
+ | SmolLM2 | `HuggingFaceTB/SmolLM2-135M` | yes |
159
+ | Gemma 2 | `google/gemma-2-9b` | expected |
160
+
161
+ The cache-level integration works with any HuggingFace causal LM that uses
162
+ `DynamicCache` (transformers >= 4.40). RoPE-based architectures with grouped
163
+ query attention are the primary target.
164
+
165
+ For non-LLM transformers (ViT, ESMFold, VideoMAE, AlphaFold) see the modules
166
+ in `spectralquant.integrations`. Vision transformers can actually see a
167
+ quality *improvement* over FP16 because the eigenspectral filtering removes
168
+ noise in the low-variance directions.
169
+
170
+ ## Hardware
171
+
172
+ | GPU | memory | recommended for |
173
+ |-------------------------|---------|------------------------------------|
174
+ | H100 / H200 | 80–141 GB | 7B, 13B, 70B inference, batch decode |
175
+ | A100 80 GB | 80 GB | 7B and 13B inference |
176
+ | A100 40 GB / A6000 | 40–48 GB | 7B inference, short context |
177
+ | RTX 4090 / 4080 / 3090 | 24 GB | 7B inference at FP16, short context |
178
+ | T4 / RTX 3060 | 12–16 GB | smaller models, demo runs |
179
+ | CPU | n/a | works, but slow |
180
+
181
+ The compression ratios above were measured on H200 with Mistral 7B and Qwen
182
+ 2.5 7B at sequence length 512. Compression is sequence-length agnostic so
183
+ ratios hold at longer contexts; speed gains scale with context length because
184
+ the FP16 baseline gets slower while the SQ decode stays linear.
185
+
186
+ ## Generating with a pre-compressed prefix
187
+
188
+ Useful when you want to keep one compressed cache and reuse it across many
189
+ completions of the same long prefix.
190
+
191
+ ```python
192
+ result = engine.compress_prefill(model, tok, long_prefix)
193
+ cache = result["cache"] # a fresh DynamicCache, FP16 surface
194
+ print(f"prefix compression: {result['stats']['ratio']:.2f}x")
195
+
196
+ # Use cache as past_key_values for any number of follow-ups:
197
+ inputs = tok(question, return_tensors="pt").to(model.device)
198
+ ids = model.generate(
199
+ **inputs,
200
+ past_key_values=cache,
201
+ max_new_tokens=200,
202
+ )
203
+ ```
204
+
205
+ ## Custom calibration
206
+
207
+ The bundled corpus works for general English. For domain-specific workloads
208
+ (code, biomedical text, legal filings), pass your own:
209
+
210
+ ```python
211
+ my_corpus = [...] # 32–128 representative samples
212
+ engine = sq.SpectralQuant(compression="high")
213
+ engine.calibrate(model, tok, my_corpus)
214
+ ```
215
+
216
+ Calibration takes a few seconds on H200. You can persist it once and reload
217
+ in any future process:
218
+
219
+ ```python
220
+ engine.save_calibration("/path/to/calib")
221
+ fresh = sq.SpectralQuant(compression="high")
222
+ fresh.load_calibration("/path/to/calib", head_dim=128)
223
+ ```
224
+
225
+ ## How it works (one paragraph)
226
+
227
+ For each attention head, calibration accumulates the key and value covariance
228
+ matrices and eigendecomposes them. The eigenvectors define a per-head
229
+ rotation that aligns coordinates with directions of decreasing variance.
230
+ After rotation, a *water-filling* allocator distributes bits across
231
+ coordinates so that high-variance dimensions get more bits and tail
232
+ dimensions get fewer. Two bit budgets are used: a "semantic" budget
233
+ (`avg_bits`) for the high-variance band and a "tail" budget (`noise_bits`,
234
+ `value_noise_bits`) for the rest. Each coordinate is quantized with a
235
+ Lloyd-Max scalar codebook fit to a Gaussian whose variance equals that
236
+ coordinate's eigenvalue. Decode rotates back, dequantizes, and the rest of
237
+ attention proceeds at full FP16. The math is in
238
+ [`engine.py`](src/spectralquant/engine.py).
239
+
240
+ ## Demo notebook
241
+
242
+ A full end-to-end notebook is included at
243
+ [`notebooks/spectralquant_demo.ipynb`](notebooks/spectralquant_demo.ipynb).
244
+ It walks through:
245
+
246
+ 1. Install + GPU sanity check
247
+ 2. The three presets
248
+ 3. Loading Mistral 7B
249
+ 4. Side-by-side FP16 vs SpectralQuant on four diverse prompts, for each preset
250
+ 5. Power-user override
251
+ 6. Custom calibration
252
+ 7. Final summary table
253
+ 8. Save / load round-trip
254
+
255
+ To run it on a fresh GPU instance:
256
+
257
+ ```bash
258
+ unzip -oq spectralquant.zip -d spectralquant
259
+ pip install -e ./spectralquant
260
+ jupyter notebook spectralquant/notebooks/spectralquant_demo.ipynb
261
+ ```
262
+
263
+ ## API surface
264
+
265
+ ```python
266
+ sq.SpectralQuant(
267
+ compression="standard" | "high" | "max",
268
+ device=None, # "cuda" | "mps" | "cpu" | None (auto)
269
+ head_dim=None, # inferred from model
270
+ avg_bits=None, noise_bits=None,
271
+ value_noise_bits=None,
272
+ d_eff_variance=None,
273
+ )
274
+
275
+ engine.generate(model, tokenizer, prompt, *, max_new_tokens=128, ...)
276
+ engine.compress_prefill(model, tokenizer, prompt)
277
+ engine.calibrate(model, tokenizer, calibration_texts=None)
278
+ engine.compression_stats()
279
+ engine.save_calibration(path)
280
+ engine.load_calibration(path, head_dim=128)
281
+ ```
282
+
283
+ The lower-level `sq.SpectralQuantEngine` is also exported for users who want
284
+ direct access to per-head bit allocations or to use the legacy
285
+ attention-level monkey-patch path.
286
+
287
+ ## Measuring quality
288
+
289
+ The package reports the following metrics in `engine.compression_stats()` and in the
290
+ `stats` field returned by `.generate(...)`:
291
+
292
+ * `ratio` — observed prefix-cache compression vs FP16 (FP16 bytes / compressed bytes)
293
+ * `tokens_per_second` — measured decode throughput
294
+ * `seconds` — wall clock for the decode step
295
+ * `compressed_bytes`, `fp16_bytes` — raw byte counts
296
+
297
+ For independent quality validation you can run perplexity on WikiText:
298
+
299
+ ```bash
300
+ python examples/run_perplexity.py --model mistralai/Mistral-7B-Instruct-v0.3
301
+ ```
302
+
303
+ Or sweep parameters to find the sweet spot for a model not in our test set:
304
+
305
+ ```bash
306
+ python examples/sweep_compression.py --model <hf_repo>
307
+ ```
308
+
309
+ ## Authors
310
+
311
+ - Anirudh Bharadwaj Vangara — <anirudh@sentra.app>
312
+ - Ashwin Gopinath — <ashwin@sentra.app>
313
+
314
+ Bug reports, feature requests, and pull requests are welcome on
315
+ [GitHub](https://github.com/Dynamis-Labs/spectralquant).
316
+
317
+ ## License
318
+
319
+ MIT.
320
+
321
+ ## Citation
322
+
323
+ ```bibtex
324
+ @misc{spectralquant2026,
325
+ title = {SpectralQuant: Eigenspectral KV Cache Compression},
326
+ author = {Vangara, Anirudh Bharadwaj and Gopinath, Ashwin},
327
+ year = {2026},
328
+ }
329
+ ```
@@ -0,0 +1,266 @@
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/Dynamis-Labs/spectralquant/main/assets/spectralquant_banner.png" alt="SpectralQuant" width="100%">
3
+ </p>
4
+
5
+ # SpectralQuant
6
+
7
+ Eigenspectral KV cache compression for transformer inference. Up to 6.55x
8
+ compression of the KV cache with FP16-equivalent output quality.
9
+
10
+ ```
11
+ pip install spectralquant
12
+ ```
13
+
14
+ ## What it does
15
+
16
+ Modern LLM inference is bottlenecked by the size of the KV cache. The cache
17
+ grows linearly with sequence length and consumes more memory than the model
18
+ weights themselves at long context. SpectralQuant compresses that cache by
19
+ exploiting the fact that, after a per-head spectral rotation, only a small
20
+ number of dimensions actually carry information.
21
+
22
+ A short calibration step measures the eigenstructure of each attention head.
23
+ Each head's keys and values are then split into a high-variance "semantic"
24
+ band and a low-variance "tail" band. The semantic band gets a generous bit
25
+ budget; the tail gets one or two bits. Total cache size shrinks by up to 6.55x with
26
+ output quality indistinguishable from FP16.
27
+
28
+ The package ships pure-PyTorch kernels and HuggingFace integrations. There
29
+ are no custom CUDA dependencies. It runs anywhere torch runs.
30
+
31
+ ## Quickstart
32
+
33
+ ```python
34
+ import torch
35
+ import spectralquant as sq
36
+ from transformers import AutoModelForCausalLM, AutoTokenizer
37
+
38
+ model = AutoModelForCausalLM.from_pretrained(
39
+ "mistralai/Mistral-7B-Instruct-v0.3",
40
+ torch_dtype=torch.float16,
41
+ device_map="auto",
42
+ )
43
+ tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
44
+
45
+ engine = sq.SpectralQuant(compression="high") # 6.55x preset
46
+
47
+ out = engine.generate(
48
+ model, tok,
49
+ "Explain water-filling bit allocation in two sentences.",
50
+ max_new_tokens=120,
51
+ )
52
+
53
+ print(out["text"])
54
+ print(f"{out['stats']['ratio']:.2f}x compression, "
55
+ f"{out['stats']['tokens_per_second']:.1f} tok/s")
56
+ ```
57
+
58
+ The first call to `engine.generate(...)` runs a one-time calibration with a
59
+ bundled 64-sentence corpus. Subsequent calls reuse it. You can also pass your
60
+ own domain-specific corpus.
61
+
62
+ ## Compression presets
63
+
64
+ ```python
65
+ print(sq.describe_presets())
66
+ ```
67
+
68
+ | preset | ratio | risk | notes |
69
+ |------------|--------|------------|---------------------------------------------------|
70
+ | `standard` | 5.95x | safe | Paper baseline. Production default. |
71
+ | `high` | 6.55x | safe | Validated on Mistral 7B and Qwen 2.5 7B. |
72
+ | `max` | 6.68x | edge | First paragraph clean. Light repetition possible. |
73
+
74
+ You can also override individual dials when you need them:
75
+
76
+ ```python
77
+ engine = sq.SpectralQuant(
78
+ compression="high",
79
+ d_eff_variance=0.93, # override one knob
80
+ )
81
+ ```
82
+
83
+ The dials are `avg_bits`, `noise_bits`, `value_noise_bits`, and
84
+ `d_eff_variance`. Anything unset falls back to the named preset.
85
+
86
+ ## Supported models
87
+
88
+ Tested and verified (rows marked `expected` should work but are not yet verified):
89
+
90
+ | family | example | works |
91
+ |-------------------|-----------------------------------------------|-----------|
92
+ | Mistral | `mistralai/Mistral-7B-Instruct-v0.3` | yes |
93
+ | Qwen 2.5 | `Qwen/Qwen2.5-7B-Instruct` | yes |
94
+ | Llama 3.x | `NousResearch/Meta-Llama-3.1-8B-Instruct` | yes |
95
+ | SmolLM2 | `HuggingFaceTB/SmolLM2-135M` | yes |
96
+ | Gemma 2 | `google/gemma-2-9b` | expected |
97
+
98
+ The cache-level integration works with any HuggingFace causal LM that uses
99
+ `DynamicCache` (transformers >= 4.40). RoPE-based architectures with grouped
100
+ query attention are the primary target.
101
+
102
+ For non-LLM transformers (ViT, ESMFold, VideoMAE, AlphaFold) see the modules
103
+ in `spectralquant.integrations`. Vision transformers can actually see a
104
+ quality *improvement* over FP16 because the eigenspectral filtering removes
105
+ noise in the low-variance directions.
106
+
107
+ ## Hardware
108
+
109
+ | GPU | memory | recommended for |
110
+ |-------------------------|---------|------------------------------------|
111
+ | H100 / H200 | 80–141 GB | 7B, 13B, 70B inference, batch decode |
112
+ | A100 80 GB | 80 GB | 7B and 13B inference |
113
+ | A100 40 GB / A6000 | 40–48 GB | 7B inference, short context |
114
+ | RTX 4090 / 4080 / 3090 | 24 GB | 7B inference at FP16, short context |
115
+ | T4 / RTX 3060 | 12–16 GB | smaller models, demo runs |
116
+ | CPU | n/a | works, but slow |
117
+
118
+ The compression ratios above were measured on H200 with Mistral 7B and Qwen
119
+ 2.5 7B at sequence length 512. Compression is sequence-length agnostic so
120
+ ratios hold at longer contexts; speed gains scale with context length because
121
+ the FP16 baseline gets slower while the SQ decode stays linear.
122
+
123
+ ## Generating with a pre-compressed prefix
124
+
125
+ Useful when you want to keep one compressed cache and reuse it across many
126
+ completions of the same long prefix.
127
+
128
+ ```python
129
+ result = engine.compress_prefill(model, tok, long_prefix)
130
+ cache = result["cache"] # a fresh DynamicCache, FP16 surface
131
+ print(f"prefix compression: {result['stats']['ratio']:.2f}x")
132
+
133
+ # Use cache as past_key_values for any number of follow-ups:
134
+ inputs = tok(question, return_tensors="pt").to(model.device)
135
+ ids = model.generate(
136
+ **inputs,
137
+ past_key_values=cache,
138
+ max_new_tokens=200,
139
+ )
140
+ ```
141
+
142
+ ## Custom calibration
143
+
144
+ The bundled corpus works for general English. For domain-specific workloads
145
+ (code, biomedical text, legal filings), pass your own:
146
+
147
+ ```python
148
+ my_corpus = [...] # 32–128 representative samples
149
+ engine = sq.SpectralQuant(compression="high")
150
+ engine.calibrate(model, tok, my_corpus)
151
+ ```
152
+
153
+ Calibration takes a few seconds on H200. You can persist it once and reload
154
+ in any future process:
155
+
156
+ ```python
157
+ engine.save_calibration("/path/to/calib")
158
+ fresh = sq.SpectralQuant(compression="high")
159
+ fresh.load_calibration("/path/to/calib", head_dim=128)
160
+ ```
161
+
162
+ ## How it works (one paragraph)
163
+
164
+ For each attention head, calibration accumulates the key and value covariance
165
+ matrices and eigendecomposes them. The eigenvectors define a per-head
166
+ rotation that aligns coordinates with directions of decreasing variance.
167
+ After rotation, a *water-filling* allocator distributes bits across
168
+ coordinates so that high-variance dimensions get more bits and tail
169
+ dimensions get fewer. Two bit budgets are used: a "semantic" budget
170
+ (`avg_bits`) for the high-variance band and a "tail" budget (`noise_bits`,
171
+ `value_noise_bits`) for the rest. Each coordinate is quantized with a
172
+ Lloyd-Max scalar codebook fit to a Gaussian whose variance equals that
173
+ coordinate's eigenvalue. Decode rotates back, dequantizes, and the rest of
174
+ attention proceeds at full FP16. The math is in
175
+ [`engine.py`](src/spectralquant/engine.py).
176
+
177
+ ## Demo notebook
178
+
179
+ A full end-to-end notebook is included at
180
+ [`notebooks/spectralquant_demo.ipynb`](notebooks/spectralquant_demo.ipynb).
181
+ It walks through:
182
+
183
+ 1. Install + GPU sanity check
184
+ 2. The three presets
185
+ 3. Loading Mistral 7B
186
+ 4. Side-by-side FP16 vs SpectralQuant on four diverse prompts, for each preset
187
+ 5. Power-user override
188
+ 6. Custom calibration
189
+ 7. Final summary table
190
+ 8. Save / load round-trip
191
+
192
+ To run it on a fresh GPU instance:
193
+
194
+ ```bash
195
+ unzip -oq spectralquant.zip -d spectralquant
196
+ pip install -e ./spectralquant
197
+ jupyter notebook spectralquant/notebooks/spectralquant_demo.ipynb
198
+ ```
199
+
200
+ ## API surface
201
+
202
+ ```python
203
+ sq.SpectralQuant(
204
+ compression="standard" | "high" | "max",
205
+ device=None, # "cuda" | "mps" | "cpu" | None (auto)
206
+ head_dim=None, # inferred from model
207
+ avg_bits=None, noise_bits=None,
208
+ value_noise_bits=None,
209
+ d_eff_variance=None,
210
+ )
211
+
212
+ engine.generate(model, tokenizer, prompt, *, max_new_tokens=128, ...)
213
+ engine.compress_prefill(model, tokenizer, prompt)
214
+ engine.calibrate(model, tokenizer, calibration_texts=None)
215
+ engine.compression_stats()
216
+ engine.save_calibration(path)
217
+ engine.load_calibration(path, head_dim=128)
218
+ ```
219
+
220
+ The lower-level `sq.SpectralQuantEngine` is also exported for users who want
221
+ direct access to per-head bit allocations or to use the legacy
222
+ attention-level monkey-patch path.
223
+
224
+ ## Measuring quality
225
+
226
+ The package reports the following metrics in `engine.compression_stats()` and in the
227
+ `stats` field returned by `.generate(...)`:
228
+
229
+ * `ratio` — observed prefix-cache compression vs FP16 (FP16 bytes / compressed bytes)
230
+ * `tokens_per_second` — measured decode throughput
231
+ * `seconds` — wall clock for the decode step
232
+ * `compressed_bytes`, `fp16_bytes` — raw byte counts
233
+
234
+ For independent quality validation you can run perplexity on WikiText:
235
+
236
+ ```bash
237
+ python examples/run_perplexity.py --model mistralai/Mistral-7B-Instruct-v0.3
238
+ ```
239
+
240
+ Or sweep parameters to find the sweet spot for a model not in our test set:
241
+
242
+ ```bash
243
+ python examples/sweep_compression.py --model <hf_repo>
244
+ ```
245
+
246
+ ## Authors
247
+
248
+ - Anirudh Bharadwaj Vangara — <anirudh@sentra.app>
249
+ - Ashwin Gopinath — <ashwin@sentra.app>
250
+
251
+ Bug reports, feature requests, and pull requests are welcome on
252
+ [GitHub](https://github.com/Dynamis-Labs/spectralquant).
253
+
254
+ ## License
255
+
256
+ MIT.
257
+
258
+ ## Citation
259
+
260
+ ```bibtex
261
+ @misc{spectralquant2026,
262
+ title = {SpectralQuant: Eigenspectral KV Cache Compression},
263
+ author = {Vangara, Anirudh Bharadwaj and Gopinath, Ashwin},
264
+ year = {2026},
265
+ }
266
+ ```