spectralquant 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spectralquant-0.3.0/LICENSE +21 -0
- spectralquant-0.3.0/MANIFEST.in +27 -0
- spectralquant-0.3.0/PKG-INFO +329 -0
- spectralquant-0.3.0/README.md +266 -0
- spectralquant-0.3.0/assets/spectralquant_banner.png +0 -0
- spectralquant-0.3.0/examples/README.md +76 -0
- spectralquant-0.3.0/examples/cache_compress_demo.py +321 -0
- spectralquant-0.3.0/examples/depth_anything_v2.py +209 -0
- spectralquant-0.3.0/examples/drop_in_any_model.py +332 -0
- spectralquant-0.3.0/examples/esmfold_protein.py +196 -0
- spectralquant-0.3.0/examples/monkey_patch_demo.py +239 -0
- spectralquant-0.3.0/examples/quickstart.py +118 -0
- spectralquant-0.3.0/examples/quickstart_llm.py +136 -0
- spectralquant-0.3.0/examples/run_perplexity.py +268 -0
- spectralquant-0.3.0/examples/sweep_compression.py +461 -0
- spectralquant-0.3.0/examples/videomae_kinetics.py +187 -0
- spectralquant-0.3.0/examples/vit_large_imagenet.py +156 -0
- spectralquant-0.3.0/notebooks/spectralquant_demo.ipynb +404 -0
- spectralquant-0.3.0/pyproject.toml +77 -0
- spectralquant-0.3.0/setup.cfg +4 -0
- spectralquant-0.3.0/src/spectralquant/__init__.py +75 -0
- spectralquant-0.3.0/src/spectralquant/_water_fill.py +45 -0
- spectralquant-0.3.0/src/spectralquant/api.py +357 -0
- spectralquant-0.3.0/src/spectralquant/calibrate.py +891 -0
- spectralquant-0.3.0/src/spectralquant/calibration_data.py +117 -0
- spectralquant-0.3.0/src/spectralquant/engine.py +1185 -0
- spectralquant-0.3.0/src/spectralquant/integrations/__init__.py +59 -0
- spectralquant-0.3.0/src/spectralquant/integrations/alphafold.py +305 -0
- spectralquant-0.3.0/src/spectralquant/integrations/dynamic_cache.py +373 -0
- spectralquant-0.3.0/src/spectralquant/integrations/esmfold.py +338 -0
- spectralquant-0.3.0/src/spectralquant/integrations/huggingface.py +283 -0
- spectralquant-0.3.0/src/spectralquant/integrations/videomae.py +440 -0
- spectralquant-0.3.0/src/spectralquant/integrations/vit.py +265 -0
- spectralquant-0.3.0/src/spectralquant/kernels/__init__.py +15 -0
- spectralquant-0.3.0/src/spectralquant/kernels/compress_keys.py +128 -0
- spectralquant-0.3.0/src/spectralquant/kernels/compress_values.py +142 -0
- spectralquant-0.3.0/src/spectralquant/kernels/fused_attention.py +124 -0
- spectralquant-0.3.0/src/spectralquant/presets.py +135 -0
- spectralquant-0.3.0/src/spectralquant.egg-info/SOURCES.txt +36 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anirudh Bharadwaj Vangara, Ashwin Gopinath
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE
|
|
3
|
+
include pyproject.toml
|
|
4
|
+
|
|
5
|
+
graft src
|
|
6
|
+
graft examples
|
|
7
|
+
graft notebooks
|
|
8
|
+
graft assets
|
|
9
|
+
|
|
10
|
+
global-exclude __pycache__
|
|
11
|
+
global-exclude *.py[cod]
|
|
12
|
+
global-exclude *.so
|
|
13
|
+
global-exclude *.egg-info
|
|
14
|
+
global-exclude .DS_Store
|
|
15
|
+
global-exclude .ipynb_checkpoints
|
|
16
|
+
global-exclude .pytest_cache
|
|
17
|
+
global-exclude results
|
|
18
|
+
global-exclude *.mp4
|
|
19
|
+
global-exclude *.mov
|
|
20
|
+
global-exclude *.avi
|
|
21
|
+
global-exclude *.gif
|
|
22
|
+
|
|
23
|
+
prune **/__pycache__
|
|
24
|
+
prune **/.ipynb_checkpoints
|
|
25
|
+
prune **/*.egg-info
|
|
26
|
+
prune **/.pytest_cache
|
|
27
|
+
prune **/results
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spectralquant
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Eigenspectral KV cache compression for transformer inference. Up to 6.55x compression with FP16-equivalent quality, drop-in for HuggingFace LLMs and vision transformers.
|
|
5
|
+
Author-email: Anirudh Bharadwaj Vangara <anirudh@sentra.app>, Ashwin Gopinath <ashwin@sentra.app>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Dynamis-Labs/spectralquant
|
|
8
|
+
Project-URL: Repository, https://github.com/Dynamis-Labs/spectralquant
|
|
9
|
+
Project-URL: Documentation, https://github.com/Dynamis-Labs/spectralquant#readme
|
|
10
|
+
Project-URL: Issues, https://github.com/Dynamis-Labs/spectralquant/issues
|
|
11
|
+
Keywords: kv-cache,compression,quantization,llm,attention,transformer,inference,spectral,eigenspectral,water-filling,huggingface,vision-transformer
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: torch>=2.2.0
|
|
26
|
+
Requires-Dist: numpy>=1.24
|
|
27
|
+
Requires-Dist: scipy>=1.11
|
|
28
|
+
Requires-Dist: tqdm>=4.65
|
|
29
|
+
Provides-Extra: hf
|
|
30
|
+
Requires-Dist: transformers>=4.40.0; extra == "hf"
|
|
31
|
+
Requires-Dist: accelerate>=0.27.0; extra == "hf"
|
|
32
|
+
Provides-Extra: vit
|
|
33
|
+
Requires-Dist: transformers>=4.40.0; extra == "vit"
|
|
34
|
+
Requires-Dist: Pillow>=10.0; extra == "vit"
|
|
35
|
+
Provides-Extra: alphafold
|
|
36
|
+
Requires-Dist: transformers>=4.40.0; extra == "alphafold"
|
|
37
|
+
Provides-Extra: esmfold
|
|
38
|
+
Requires-Dist: transformers>=4.40.0; extra == "esmfold"
|
|
39
|
+
Provides-Extra: videomae
|
|
40
|
+
Requires-Dist: transformers>=4.40.0; extra == "videomae"
|
|
41
|
+
Requires-Dist: Pillow>=10.0; extra == "videomae"
|
|
42
|
+
Provides-Extra: video
|
|
43
|
+
Requires-Dist: transformers>=4.40.0; extra == "video"
|
|
44
|
+
Requires-Dist: Pillow>=10.0; extra == "video"
|
|
45
|
+
Requires-Dist: av>=10.0.0; extra == "video"
|
|
46
|
+
Provides-Extra: examples
|
|
47
|
+
Requires-Dist: transformers>=4.40.0; extra == "examples"
|
|
48
|
+
Requires-Dist: accelerate>=0.27.0; extra == "examples"
|
|
49
|
+
Requires-Dist: datasets>=2.14; extra == "examples"
|
|
50
|
+
Requires-Dist: Pillow>=10.0; extra == "examples"
|
|
51
|
+
Requires-Dist: requests>=2.28; extra == "examples"
|
|
52
|
+
Requires-Dist: av>=10.0.0; extra == "examples"
|
|
53
|
+
Requires-Dist: numpy>=1.24; extra == "examples"
|
|
54
|
+
Provides-Extra: dev
|
|
55
|
+
Requires-Dist: pytest; extra == "dev"
|
|
56
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
57
|
+
Requires-Dist: build; extra == "dev"
|
|
58
|
+
Requires-Dist: twine; extra == "dev"
|
|
59
|
+
Requires-Dist: ruff; extra == "dev"
|
|
60
|
+
Provides-Extra: all
|
|
61
|
+
Requires-Dist: spectralquant[alphafold,dev,esmfold,examples,hf,videomae,vit]; extra == "all"
|
|
62
|
+
Dynamic: license-file
|
|
63
|
+
|
|
64
|
+
<p align="center">
|
|
65
|
+
<img src="https://raw.githubusercontent.com/Dynamis-Labs/spectralquant/main/assets/spectralquant_banner.png" alt="SpectralQuant" width="100%">
|
|
66
|
+
</p>
|
|
67
|
+
|
|
68
|
+
# SpectralQuant
|
|
69
|
+
|
|
70
|
+
Eigenspectral KV cache compression for transformer inference. Up to 6.55x
|
|
71
|
+
compression of the KV cache with FP16-equivalent output quality.
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
pip install spectralquant
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## What it does
|
|
78
|
+
|
|
79
|
+
Modern LLM inference is bottlenecked by the size of the KV cache. The cache
|
|
80
|
+
grows linearly with sequence length and consumes more memory than the model
|
|
81
|
+
weights themselves at long context. SpectralQuant compresses that cache by
|
|
82
|
+
exploiting the fact that, after a per-head spectral rotation, only a small
|
|
83
|
+
number of dimensions actually carry information.
|
|
84
|
+
|
|
85
|
+
A short calibration step measures the eigenstructure of each attention head.
|
|
86
|
+
Each head's keys and values are then split into a high-variance "semantic"
|
|
87
|
+
band and a low-variance "tail" band. The semantic band gets a generous bit
|
|
88
|
+
budget; the tail gets one or two bits. Total cache size shrinks by up to 6.55x with
|
|
89
|
+
output quality indistinguishable from FP16.
|
|
90
|
+
|
|
91
|
+
The package ships pure-PyTorch kernels and HuggingFace integrations. There
|
|
92
|
+
are no custom CUDA dependencies. It runs anywhere torch runs.
|
|
93
|
+
|
|
94
|
+
## Quickstart
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
import torch
|
|
98
|
+
import spectralquant as sq
|
|
99
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
100
|
+
|
|
101
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
102
|
+
"mistralai/Mistral-7B-Instruct-v0.3",
|
|
103
|
+
torch_dtype=torch.float16,
|
|
104
|
+
device_map="auto",
|
|
105
|
+
)
|
|
106
|
+
tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
|
|
107
|
+
|
|
108
|
+
engine = sq.SpectralQuant(compression="high") # 6.55x preset
|
|
109
|
+
|
|
110
|
+
out = engine.generate(
|
|
111
|
+
model, tok,
|
|
112
|
+
"Explain water-filling bit allocation in two sentences.",
|
|
113
|
+
max_new_tokens=120,
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
print(out["text"])
|
|
117
|
+
print(f"{out['stats']['ratio']:.2f}x compression, "
|
|
118
|
+
f"{out['stats']['tokens_per_second']:.1f} tok/s")
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
The first call to `engine.generate(...)` runs a one-time calibration with a
|
|
122
|
+
bundled 64-sentence corpus. Subsequent calls reuse it. You can also pass your
|
|
123
|
+
own domain-specific corpus.
|
|
124
|
+
|
|
125
|
+
## Compression presets
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
print(sq.describe_presets())
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
| preset | ratio | risk | notes |
|
|
132
|
+
|------------|--------|------------|---------------------------------------------------|
|
|
133
|
+
| `standard` | 5.95x | safe | Paper baseline. Production default. |
|
|
134
|
+
| `high` | 6.55x | safe | Validated on Mistral 7B and Qwen 2.5 7B. |
|
|
135
|
+
| `max` | 6.68x | edge | First paragraph clean. Light repetition possible. |
|
|
136
|
+
|
|
137
|
+
You can also override individual dials when you need them:
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
engine = sq.SpectralQuant(
|
|
141
|
+
compression="high",
|
|
142
|
+
d_eff_variance=0.93, # override one knob
|
|
143
|
+
)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
The dials are `avg_bits`, `noise_bits`, `value_noise_bits`, and
|
|
147
|
+
`d_eff_variance`. Anything unset falls back to the named preset.
|
|
148
|
+
|
|
149
|
+
## Supported models
|
|
150
|
+
|
|
151
|
+
Tested and verified (rows marked `expected` should work but have not been verified yet):
|
|
152
|
+
|
|
153
|
+
| family | example | works |
|
|
154
|
+
|-------------------|-----------------------------------------------|-----------|
|
|
155
|
+
| Mistral | `mistralai/Mistral-7B-Instruct-v0.3` | yes |
|
|
156
|
+
| Qwen 2.5 | `Qwen/Qwen2.5-7B-Instruct` | yes |
|
|
157
|
+
| Llama 3.x | `NousResearch/Meta-Llama-3.1-8B-Instruct` | yes |
|
|
158
|
+
| SmolLM2 | `HuggingFaceTB/SmolLM2-135M` | yes |
|
|
159
|
+
| Gemma 2 | `google/gemma-2-9b` | expected |
|
|
160
|
+
|
|
161
|
+
The cache-level integration works with any HuggingFace causal LM that uses
|
|
162
|
+
`DynamicCache` (transformers >= 4.40). RoPE-based architectures with grouped
|
|
163
|
+
query attention are the primary target.
|
|
164
|
+
|
|
165
|
+
For non-LLM transformers (ViT, ESMFold, VideoMAE, AlphaFold) see the modules
|
|
166
|
+
in `spectralquant.integrations`. Vision transformers can actually see a
|
|
167
|
+
quality *improvement* over FP16 because the eigenspectral filtering removes
|
|
168
|
+
noise in the low-variance directions.
|
|
169
|
+
|
|
170
|
+
## Hardware
|
|
171
|
+
|
|
172
|
+
| GPU | memory | recommended for |
|
|
173
|
+
|-------------------------|---------|------------------------------------|
|
|
174
|
+
| H100 / H200 | 80–141 GB | 7B, 13B, 70B inference, batch decode |
|
|
175
|
+
| A100 80 GB | 80 GB | 7B and 13B inference |
|
|
176
|
+
| A100 40 GB / A6000 | 40–48 GB | 7B inference, short context |
|
|
177
|
+
| RTX 4090 / 4080 / 3090 | 16–24 GB | 7B inference at FP16, short context |
|
|
178
|
+
| T4 / RTX 3060 | 12–16 GB | smaller models, demo runs |
|
|
179
|
+
| CPU | n/a | works, but slow |
|
|
180
|
+
|
|
181
|
+
The compression ratios above were measured on H200 with Mistral 7B and Qwen
|
|
182
|
+
2.5 7B at sequence length 512. Compression is sequence-length agnostic so
|
|
183
|
+
ratios hold at longer contexts; speed gains scale with context length because
|
|
184
|
+
the FP16 baseline gets slower while the SQ decode stays linear.
|
|
185
|
+
|
|
186
|
+
## Generating with a pre-compressed prefix
|
|
187
|
+
|
|
188
|
+
Useful when you want to keep one compressed cache and reuse it across many
|
|
189
|
+
completions of the same long prefix.
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
result = engine.compress_prefill(model, tok, long_prefix)
|
|
193
|
+
cache = result["cache"] # a fresh DynamicCache, FP16 surface
|
|
194
|
+
print(f"prefix compression: {result['stats']['ratio']:.2f}x")
|
|
195
|
+
|
|
196
|
+
# Use cache as past_key_values for any number of follow-ups:
|
|
197
|
+
inputs = tok(question, return_tensors="pt").to(model.device)
|
|
198
|
+
ids = model.generate(
|
|
199
|
+
**inputs,
|
|
200
|
+
past_key_values=cache,
|
|
201
|
+
max_new_tokens=200,
|
|
202
|
+
)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Custom calibration
|
|
206
|
+
|
|
207
|
+
The bundled corpus works for general English. For domain-specific workloads
|
|
208
|
+
(code, biomedical text, legal filings), pass your own:
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
my_corpus = [...] # 32–128 representative samples
|
|
212
|
+
engine = sq.SpectralQuant(compression="high")
|
|
213
|
+
engine.calibrate(model, tok, my_corpus)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Calibration takes a few seconds on H200. You can persist it once and reload
|
|
217
|
+
in any future process:
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
engine.save_calibration("/path/to/calib")
|
|
221
|
+
fresh = sq.SpectralQuant(compression="high")
|
|
222
|
+
fresh.load_calibration("/path/to/calib", head_dim=128)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## How it works (one paragraph)
|
|
226
|
+
|
|
227
|
+
For each attention head, calibration accumulates the key and value covariance
|
|
228
|
+
matrices and eigendecomposes them. The eigenvectors define a per-head
|
|
229
|
+
rotation that aligns coordinates with directions of decreasing variance.
|
|
230
|
+
After rotation, a *water-filling* allocator distributes bits across
|
|
231
|
+
coordinates so that high-variance dimensions get more bits and tail
|
|
232
|
+
dimensions get fewer. Two bit budgets are used: a "semantic" budget
|
|
233
|
+
(`avg_bits`) for the high-variance band and a "tail" budget (`noise_bits`,
|
|
234
|
+
`value_noise_bits`) for the rest. Each coordinate is quantized with a
|
|
235
|
+
Lloyd-Max scalar codebook fit to a Gaussian whose variance equals that
|
|
236
|
+
coordinate's eigenvalue. Decode dequantizes, rotates back, and the rest of
|
|
237
|
+
attention proceeds at full FP16. The math is in
|
|
238
|
+
[`engine.py`](src/spectralquant/engine.py).
|
|
239
|
+
|
|
240
|
+
## Demo notebook
|
|
241
|
+
|
|
242
|
+
A full end-to-end notebook is included at
|
|
243
|
+
[`notebooks/spectralquant_demo.ipynb`](notebooks/spectralquant_demo.ipynb).
|
|
244
|
+
It walks through:
|
|
245
|
+
|
|
246
|
+
1. Install + GPU sanity check
|
|
247
|
+
2. The three presets
|
|
248
|
+
3. Loading Mistral 7B
|
|
249
|
+
4. Side-by-side FP16 vs SpectralQuant on four diverse prompts, for each preset
|
|
250
|
+
5. Power-user override
|
|
251
|
+
6. Custom calibration
|
|
252
|
+
7. Final summary table
|
|
253
|
+
8. Save / load round-trip
|
|
254
|
+
|
|
255
|
+
To run it on a fresh GPU instance:
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
unzip -oq spectralquant.zip -d spectralquant
|
|
259
|
+
pip install -e ./spectralquant
|
|
260
|
+
jupyter notebook spectralquant/notebooks/spectralquant_demo.ipynb
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## API surface
|
|
264
|
+
|
|
265
|
+
```python
|
|
266
|
+
sq.SpectralQuant(
|
|
267
|
+
compression="standard" | "high" | "max",
|
|
268
|
+
device=None, # "cuda" | "mps" | "cpu" | None (auto)
|
|
269
|
+
head_dim=None, # inferred from model
|
|
270
|
+
avg_bits=None, noise_bits=None,
|
|
271
|
+
value_noise_bits=None,
|
|
272
|
+
d_eff_variance=None,
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
engine.generate(model, tokenizer, prompt, *, max_new_tokens=128, ...)
|
|
276
|
+
engine.compress_prefill(model, tokenizer, prompt)
|
|
277
|
+
engine.calibrate(model, tokenizer, calibration_texts=None)
|
|
278
|
+
engine.compression_stats()
|
|
279
|
+
engine.save_calibration(path)
|
|
280
|
+
engine.load_calibration(path, head_dim=128)
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
The lower-level `sq.SpectralQuantEngine` is also exported for users who want
|
|
284
|
+
direct access to per-head bit allocations or to use the legacy
|
|
285
|
+
attention-level monkey-patch path.
|
|
286
|
+
|
|
287
|
+
## Measuring quality
|
|
288
|
+
|
|
289
|
+
The package reports the following metrics in `engine.compression_stats()` and in the
|
|
290
|
+
`stats` field returned by `.generate(...)`:
|
|
291
|
+
|
|
292
|
+
* `ratio` — observed prefix-cache compression vs FP16 (bytes / bytes)
|
|
293
|
+
* `tokens_per_second` — measured decode throughput
|
|
294
|
+
* `seconds` — wall clock for the decode step
|
|
295
|
+
* `compressed_bytes`, `fp16_bytes` — raw byte counts
|
|
296
|
+
|
|
297
|
+
For independent quality validation you can run perplexity on WikiText:
|
|
298
|
+
|
|
299
|
+
```bash
|
|
300
|
+
python examples/run_perplexity.py --model mistralai/Mistral-7B-Instruct-v0.3
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
Or sweep parameters to find the sweet spot for a model not in our test set:
|
|
304
|
+
|
|
305
|
+
```bash
|
|
306
|
+
python examples/sweep_compression.py --model <hf_repo>
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
## Authors
|
|
310
|
+
|
|
311
|
+
- Anirudh Bharadwaj Vangara — <anirudh@sentra.app>
|
|
312
|
+
- Ashwin Gopinath — <ashwin@sentra.app>
|
|
313
|
+
|
|
314
|
+
Bug reports, feature requests, and pull requests are welcome on
|
|
315
|
+
[GitHub](https://github.com/Dynamis-Labs/spectralquant).
|
|
316
|
+
|
|
317
|
+
## License
|
|
318
|
+
|
|
319
|
+
MIT.
|
|
320
|
+
|
|
321
|
+
## Citation
|
|
322
|
+
|
|
323
|
+
```bibtex
|
|
324
|
+
@misc{spectralquant2026,
|
|
325
|
+
title = {SpectralQuant: Eigenspectral KV Cache Compression},
|
|
326
|
+
author = {Vangara, Anirudh Bharadwaj and Gopinath, Ashwin},
|
|
327
|
+
year = {2026},
|
|
328
|
+
}
|
|
329
|
+
```
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/Dynamis-Labs/spectralquant/main/assets/spectralquant_banner.png" alt="SpectralQuant" width="100%">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
# SpectralQuant
|
|
6
|
+
|
|
7
|
+
Eigenspectral KV cache compression for transformer inference. Up to 6.55x
|
|
8
|
+
compression of the KV cache with FP16-equivalent output quality.
|
|
9
|
+
|
|
10
|
+
```
|
|
11
|
+
pip install spectralquant
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## What it does
|
|
15
|
+
|
|
16
|
+
Modern LLM inference is bottlenecked by the size of the KV cache. The cache
|
|
17
|
+
grows linearly with sequence length and consumes more memory than the model
|
|
18
|
+
weights themselves at long context. SpectralQuant compresses that cache by
|
|
19
|
+
exploiting the fact that, after a per-head spectral rotation, only a small
|
|
20
|
+
number of dimensions actually carry information.
|
|
21
|
+
|
|
22
|
+
A short calibration step measures the eigenstructure of each attention head.
|
|
23
|
+
Each head's keys and values are then split into a high-variance "semantic"
|
|
24
|
+
band and a low-variance "tail" band. The semantic band gets a generous bit
|
|
25
|
+
budget; the tail gets one or two bits. Total cache size shrinks by up to 6.55x with
|
|
26
|
+
output quality indistinguishable from FP16.
|
|
27
|
+
|
|
28
|
+
The package ships pure-PyTorch kernels and HuggingFace integrations. There
|
|
29
|
+
are no custom CUDA dependencies. It runs anywhere torch runs.
|
|
30
|
+
|
|
31
|
+
## Quickstart
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import torch
|
|
35
|
+
import spectralquant as sq
|
|
36
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
37
|
+
|
|
38
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
39
|
+
"mistralai/Mistral-7B-Instruct-v0.3",
|
|
40
|
+
torch_dtype=torch.float16,
|
|
41
|
+
device_map="auto",
|
|
42
|
+
)
|
|
43
|
+
tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
|
|
44
|
+
|
|
45
|
+
engine = sq.SpectralQuant(compression="high") # 6.55x preset
|
|
46
|
+
|
|
47
|
+
out = engine.generate(
|
|
48
|
+
model, tok,
|
|
49
|
+
"Explain water-filling bit allocation in two sentences.",
|
|
50
|
+
max_new_tokens=120,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
print(out["text"])
|
|
54
|
+
print(f"{out['stats']['ratio']:.2f}x compression, "
|
|
55
|
+
f"{out['stats']['tokens_per_second']:.1f} tok/s")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The first call to `engine.generate(...)` runs a one-time calibration with a
|
|
59
|
+
bundled 64-sentence corpus. Subsequent calls reuse it. You can also pass your
|
|
60
|
+
own domain-specific corpus.
|
|
61
|
+
|
|
62
|
+
## Compression presets
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
print(sq.describe_presets())
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
| preset | ratio | risk | notes |
|
|
69
|
+
|------------|--------|------------|---------------------------------------------------|
|
|
70
|
+
| `standard` | 5.95x | safe | Paper baseline. Production default. |
|
|
71
|
+
| `high` | 6.55x | safe | Validated on Mistral 7B and Qwen 2.5 7B. |
|
|
72
|
+
| `max` | 6.68x | edge | First paragraph clean. Light repetition possible. |
|
|
73
|
+
|
|
74
|
+
You can also override individual dials when you need them:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
engine = sq.SpectralQuant(
|
|
78
|
+
compression="high",
|
|
79
|
+
d_eff_variance=0.93, # override one knob
|
|
80
|
+
)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
The dials are `avg_bits`, `noise_bits`, `value_noise_bits`, and
|
|
84
|
+
`d_eff_variance`. Anything unset falls back to the named preset.
|
|
85
|
+
|
|
86
|
+
## Supported models
|
|
87
|
+
|
|
88
|
+
Tested and verified (rows marked `expected` should work but have not been verified yet):
|
|
89
|
+
|
|
90
|
+
| family | example | works |
|
|
91
|
+
|-------------------|-----------------------------------------------|-----------|
|
|
92
|
+
| Mistral | `mistralai/Mistral-7B-Instruct-v0.3` | yes |
|
|
93
|
+
| Qwen 2.5 | `Qwen/Qwen2.5-7B-Instruct` | yes |
|
|
94
|
+
| Llama 3.x | `NousResearch/Meta-Llama-3.1-8B-Instruct` | yes |
|
|
95
|
+
| SmolLM2 | `HuggingFaceTB/SmolLM2-135M` | yes |
|
|
96
|
+
| Gemma 2 | `google/gemma-2-9b` | expected |
|
|
97
|
+
|
|
98
|
+
The cache-level integration works with any HuggingFace causal LM that uses
|
|
99
|
+
`DynamicCache` (transformers >= 4.40). RoPE-based architectures with grouped
|
|
100
|
+
query attention are the primary target.
|
|
101
|
+
|
|
102
|
+
For non-LLM transformers (ViT, ESMFold, VideoMAE, AlphaFold) see the modules
|
|
103
|
+
in `spectralquant.integrations`. Vision transformers can actually see a
|
|
104
|
+
quality *improvement* over FP16 because the eigenspectral filtering removes
|
|
105
|
+
noise in the low-variance directions.
|
|
106
|
+
|
|
107
|
+
## Hardware
|
|
108
|
+
|
|
109
|
+
| GPU | memory | recommended for |
|
|
110
|
+
|-------------------------|---------|------------------------------------|
|
|
111
|
+
| H100 / H200 | 80–141 GB | 7B, 13B, 70B inference, batch decode |
|
|
112
|
+
| A100 80 GB | 80 GB | 7B and 13B inference |
|
|
113
|
+
| A100 40 GB / A6000 | 40–48 GB | 7B inference, short context |
|
|
114
|
+
| RTX 4090 / 4080 / 3090 | 16–24 GB | 7B inference at FP16, short context |
|
|
115
|
+
| T4 / RTX 3060 | 12–16 GB | smaller models, demo runs |
|
|
116
|
+
| CPU | n/a | works, but slow |
|
|
117
|
+
|
|
118
|
+
The compression ratios above were measured on H200 with Mistral 7B and Qwen
|
|
119
|
+
2.5 7B at sequence length 512. Compression is sequence-length agnostic so
|
|
120
|
+
ratios hold at longer contexts; speed gains scale with context length because
|
|
121
|
+
the FP16 baseline gets slower while the SQ decode stays linear.
|
|
122
|
+
|
|
123
|
+
## Generating with a pre-compressed prefix
|
|
124
|
+
|
|
125
|
+
Useful when you want to keep one compressed cache and reuse it across many
|
|
126
|
+
completions of the same long prefix.
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
result = engine.compress_prefill(model, tok, long_prefix)
|
|
130
|
+
cache = result["cache"] # a fresh DynamicCache, FP16 surface
|
|
131
|
+
print(f"prefix compression: {result['stats']['ratio']:.2f}x")
|
|
132
|
+
|
|
133
|
+
# Use cache as past_key_values for any number of follow-ups:
|
|
134
|
+
inputs = tok(question, return_tensors="pt").to(model.device)
|
|
135
|
+
ids = model.generate(
|
|
136
|
+
**inputs,
|
|
137
|
+
past_key_values=cache,
|
|
138
|
+
max_new_tokens=200,
|
|
139
|
+
)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Custom calibration
|
|
143
|
+
|
|
144
|
+
The bundled corpus works for general English. For domain-specific workloads
|
|
145
|
+
(code, biomedical text, legal filings), pass your own:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
my_corpus = [...] # 32–128 representative samples
|
|
149
|
+
engine = sq.SpectralQuant(compression="high")
|
|
150
|
+
engine.calibrate(model, tok, my_corpus)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Calibration takes a few seconds on H200. You can persist it once and reload
|
|
154
|
+
in any future process:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
engine.save_calibration("/path/to/calib")
|
|
158
|
+
fresh = sq.SpectralQuant(compression="high")
|
|
159
|
+
fresh.load_calibration("/path/to/calib", head_dim=128)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## How it works (one paragraph)
|
|
163
|
+
|
|
164
|
+
For each attention head, calibration accumulates the key and value covariance
|
|
165
|
+
matrices and eigendecomposes them. The eigenvectors define a per-head
|
|
166
|
+
rotation that aligns coordinates with directions of decreasing variance.
|
|
167
|
+
After rotation, a *water-filling* allocator distributes bits across
|
|
168
|
+
coordinates so that high-variance dimensions get more bits and tail
|
|
169
|
+
dimensions get fewer. Two bit budgets are used: a "semantic" budget
|
|
170
|
+
(`avg_bits`) for the high-variance band and a "tail" budget (`noise_bits`,
|
|
171
|
+
`value_noise_bits`) for the rest. Each coordinate is quantized with a
|
|
172
|
+
Lloyd-Max scalar codebook fit to a Gaussian whose variance equals that
|
|
173
|
+
coordinate's eigenvalue. Decode dequantizes, rotates back, and the rest of
|
|
174
|
+
attention proceeds at full FP16. The math is in
|
|
175
|
+
[`engine.py`](src/spectralquant/engine.py).
|
|
176
|
+
|
|
177
|
+
## Demo notebook
|
|
178
|
+
|
|
179
|
+
A full end-to-end notebook is included at
|
|
180
|
+
[`notebooks/spectralquant_demo.ipynb`](notebooks/spectralquant_demo.ipynb).
|
|
181
|
+
It walks through:
|
|
182
|
+
|
|
183
|
+
1. Install + GPU sanity check
|
|
184
|
+
2. The three presets
|
|
185
|
+
3. Loading Mistral 7B
|
|
186
|
+
4. Side-by-side FP16 vs SpectralQuant on four diverse prompts, for each preset
|
|
187
|
+
5. Power-user override
|
|
188
|
+
6. Custom calibration
|
|
189
|
+
7. Final summary table
|
|
190
|
+
8. Save / load round-trip
|
|
191
|
+
|
|
192
|
+
To run it on a fresh GPU instance:
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
unzip -oq spectralquant.zip -d spectralquant
|
|
196
|
+
pip install -e ./spectralquant
|
|
197
|
+
jupyter notebook spectralquant/notebooks/spectralquant_demo.ipynb
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
## API surface
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
sq.SpectralQuant(
|
|
204
|
+
compression="standard" | "high" | "max",
|
|
205
|
+
device=None, # "cuda" | "mps" | "cpu" | None (auto)
|
|
206
|
+
head_dim=None, # inferred from model
|
|
207
|
+
avg_bits=None, noise_bits=None,
|
|
208
|
+
value_noise_bits=None,
|
|
209
|
+
d_eff_variance=None,
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
engine.generate(model, tokenizer, prompt, *, max_new_tokens=128, ...)
|
|
213
|
+
engine.compress_prefill(model, tokenizer, prompt)
|
|
214
|
+
engine.calibrate(model, tokenizer, calibration_texts=None)
|
|
215
|
+
engine.compression_stats()
|
|
216
|
+
engine.save_calibration(path)
|
|
217
|
+
engine.load_calibration(path, head_dim=128)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
The lower-level `sq.SpectralQuantEngine` is also exported for users who want
|
|
221
|
+
direct access to per-head bit allocations or to use the legacy
|
|
222
|
+
attention-level monkey-patch path.
|
|
223
|
+
|
|
224
|
+
## Measuring quality
|
|
225
|
+
|
|
226
|
+
The package reports the following metrics in `engine.compression_stats()` and in the
|
|
227
|
+
`stats` field returned by `.generate(...)`:
|
|
228
|
+
|
|
229
|
+
* `ratio` — observed prefix-cache compression vs FP16 (bytes / bytes)
|
|
230
|
+
* `tokens_per_second` — measured decode throughput
|
|
231
|
+
* `seconds` — wall clock for the decode step
|
|
232
|
+
* `compressed_bytes`, `fp16_bytes` — raw byte counts
|
|
233
|
+
|
|
234
|
+
For independent quality validation you can run perplexity on WikiText:
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
python examples/run_perplexity.py --model mistralai/Mistral-7B-Instruct-v0.3
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
Or sweep parameters to find the sweet spot for a model not in our test set:
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
python examples/sweep_compression.py --model <hf_repo>
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Authors
|
|
247
|
+
|
|
248
|
+
- Anirudh Bharadwaj Vangara — <anirudh@sentra.app>
|
|
249
|
+
- Ashwin Gopinath — <ashwin@sentra.app>
|
|
250
|
+
|
|
251
|
+
Bug reports, feature requests, and pull requests are welcome on
|
|
252
|
+
[GitHub](https://github.com/Dynamis-Labs/spectralquant).
|
|
253
|
+
|
|
254
|
+
## License
|
|
255
|
+
|
|
256
|
+
MIT.
|
|
257
|
+
|
|
258
|
+
## Citation
|
|
259
|
+
|
|
260
|
+
```bibtex
|
|
261
|
+
@misc{spectralquant2026,
|
|
262
|
+
title = {SpectralQuant: Eigenspectral KV Cache Compression},
|
|
263
|
+
author = {Vangara, Anirudh Bharadwaj and Gopinath, Ashwin},
|
|
264
|
+
year = {2026},
|
|
265
|
+
}
|
|
266
|
+
```
|
|
Binary file
|