aither-kvcache 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aither_kvcache-0.1.0/LICENSE +26 -0
- aither_kvcache-0.1.0/PKG-INFO +244 -0
- aither_kvcache-0.1.0/README.md +212 -0
- aither_kvcache-0.1.0/aither_kvcache.egg-info/PKG-INFO +244 -0
- aither_kvcache-0.1.0/aither_kvcache.egg-info/SOURCES.txt +18 -0
- aither_kvcache-0.1.0/aither_kvcache.egg-info/dependency_links.txt +1 -0
- aither_kvcache-0.1.0/aither_kvcache.egg-info/requires.txt +16 -0
- aither_kvcache-0.1.0/aither_kvcache.egg-info/top_level.txt +1 -0
- aither_kvcache-0.1.0/pyproject.toml +43 -0
- aither_kvcache-0.1.0/setup.cfg +4 -0
- aither_kvcache-0.1.0/tests/test_core.py +472 -0
- aither_kvcache-0.1.0/tests/test_fused.py +419 -0
- aither_kvcache-0.1.0/turboquant/__init__.py +27 -0
- aither_kvcache-0.1.0/turboquant/bench.py +137 -0
- aither_kvcache-0.1.0/turboquant/codebook.py +166 -0
- aither_kvcache-0.1.0/turboquant/fused_attention.py +371 -0
- aither_kvcache-0.1.0/turboquant/packing.py +131 -0
- aither_kvcache-0.1.0/turboquant/quantizer.py +363 -0
- aither_kvcache-0.1.0/turboquant/rotation.py +108 -0
- aither_kvcache-0.1.0/turboquant/triton_ops.py +242 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Creative Commons Attribution 4.0 International License (CC BY 4.0)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Aitherium
|
|
4
|
+
|
|
5
|
+
You are free to:
|
|
6
|
+
|
|
7
|
+
Share - copy and redistribute the material in any medium or format for any
|
|
8
|
+
purpose, even commercially.
|
|
9
|
+
|
|
10
|
+
Adapt - remix, transform, and build upon the material for any purpose, even
|
|
11
|
+
commercially.
|
|
12
|
+
|
|
13
|
+
Under the following terms:
|
|
14
|
+
|
|
15
|
+
Attribution - You must give appropriate credit, provide a link to the license,
|
|
16
|
+
and indicate if changes were made. You may do so in any reasonable manner, but
|
|
17
|
+
not in any way that suggests the licensor endorses you or your use.
|
|
18
|
+
|
|
19
|
+
No additional restrictions - You may not apply legal terms or technological
|
|
20
|
+
measures that legally restrict others from doing anything the license permits.
|
|
21
|
+
|
|
22
|
+
Full license text: https://creativecommons.org/licenses/by/4.0/legalcode
|
|
23
|
+
|
|
24
|
+
This implementation is based on:
|
|
25
|
+
Zandieh et al., "TurboQuant: Online Vector Quantization with Near-optimal
|
|
26
|
+
Distortion Rate", arXiv:2504.19874, April 2025.
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aither-kvcache
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Near-optimal KV cache quantization for LLM inference (arXiv:2504.19874)
|
|
5
|
+
Author: Aitherium
|
|
6
|
+
License-Expression: CC-BY-4.0
|
|
7
|
+
Project-URL: Paper, https://arxiv.org/abs/2504.19874
|
|
8
|
+
Project-URL: Repository, https://github.com/Aitherium/aitherkvcache
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: torch>=2.0
|
|
20
|
+
Requires-Dist: numpy
|
|
21
|
+
Provides-Extra: triton
|
|
22
|
+
Requires-Dist: triton>=2.0; extra == "triton"
|
|
23
|
+
Provides-Extra: scipy
|
|
24
|
+
Requires-Dist: scipy; extra == "scipy"
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest; extra == "dev"
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Requires-Dist: triton>=2.0; extra == "all"
|
|
29
|
+
Requires-Dist: scipy; extra == "all"
|
|
30
|
+
Requires-Dist: pytest; extra == "all"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# TurboQuant
|
|
34
|
+
|
|
35
|
+
Near-optimal KV cache quantization for LLM inference. Implements the algorithm
|
|
36
|
+
from [Zandieh et al., "TurboQuant: Online Vector Quantization with Near-optimal
|
|
37
|
+
Distortion Rate" (arXiv:2504.19874)](https://arxiv.org/abs/2504.19874).
|
|
38
|
+
|
|
39
|
+
Compresses KV cache vectors to 2-4 bits per value with MSE within 2.7x of
|
|
40
|
+
the information-theoretic lower bound. No calibration data. No retraining.
|
|
41
|
+
Works online (one vector at a time).
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install aither-kvcache
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Optional extras:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install aither-kvcache[triton]   # GPU-fused quantize/dequantize kernels
|
|
54
|
+
pip install aither-kvcache[scipy]    # Custom codebook computation via Lloyd-Max
|
|
55
|
+
pip install aither-kvcache[dev]      # pytest for running tests
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
## Quick Start
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from turboquant import TurboQuant
|
|
63
|
+
|
|
64
|
+
tq = TurboQuant(head_dim=128, bits=4, device="cuda")
|
|
65
|
+
|
|
66
|
+
# Encode: FP16 vectors -> packed uint8 + norms
|
|
67
|
+
packed, norms = tq.encode(kv_vectors) # kv_vectors: [..., 128] float16
|
|
68
|
+
|
|
69
|
+
# Decode: packed representation -> reconstructed vectors
|
|
70
|
+
decoded = tq.decode(packed, norms) # decoded: [..., 128] float16
|
|
71
|
+
|
|
72
|
+
# Validate MSE against theory
|
|
73
|
+
print(tq.validate())
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
## Algorithm
|
|
78
|
+
|
|
79
|
+
TurboQuant applies three steps to each KV cache vector:
|
|
80
|
+
|
|
81
|
+
1. **Normalize** -- extract L2 norm, project onto the unit sphere S^{d-1}.
|
|
82
|
+
2. **Random rotation** -- multiply by a fixed orthogonal matrix Pi. This makes
|
|
83
|
+
each coordinate approximately Gaussian N(0, 1/d), regardless of the input
|
|
84
|
+
distribution. The rotation is data-oblivious (generated once from a seed).
|
|
85
|
+
3. **Optimal scalar quantization** -- quantize each coordinate independently
|
|
86
|
+
using a precomputed Lloyd-Max codebook for N(0, 1/d). Pack indices into
|
|
87
|
+
uint8 bytes.
|
|
88
|
+
|
|
89
|
+
Storage per vector: `ceil(d * bits / 8)` bytes for indices + 4 bytes for the
|
|
90
|
+
float32 norm.
|
|
91
|
+
|
|
92
|
+
Decoding reverses the process: unpack indices, look up codebook centroids,
|
|
93
|
+
apply inverse rotation Pi^T, rescale by the stored norm.
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
## Compression Ratios
|
|
97
|
+
|
|
98
|
+
Ratios for head_dim=128 (256 bytes at FP16, 128 bytes at FP8):
|
|
99
|
+
|
|
100
|
+
| Bits | Packed Size | Ratio vs FP16 | Ratio vs FP8 |
|
|
101
|
+
|------|-------------|---------------|--------------|
|
|
102
|
+
| 4 | 68 bytes | 3.8x | 1.9x |
|
|
103
|
+
| 3 | 52 bytes | 4.9x | 2.5x |
|
|
104
|
+
| 2 | 36 bytes | 7.1x | 3.6x |
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
## Validated MSE
|
|
108
|
+
|
|
109
|
+
MSE for unit vectors on S^{127} (d=128). Theory bounds from the paper:
|
|
110
|
+
|
|
111
|
+
| Bits | MSE (measured) | Theory Lower | Theory Upper | Ratio to LB |
|
|
112
|
+
|------|---------------|--------------|--------------|-------------|
|
|
113
|
+
| 4 | 0.0095 | 0.0039 | 0.0184 | 2.4x |
|
|
114
|
+
| 3 | 0.0345 | 0.0156 | 0.0736 | 2.2x |
|
|
115
|
+
| 2 | 0.1175 | 0.0625 | 0.2945 | 1.9x |
|
|
116
|
+
|
|
117
|
+
All measured values are within the paper's upper bound and well below the
|
|
118
|
+
worst-case ratio of 3*pi/2 = 4.71x.
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
## Fused Attention (TQPagedAttention)
|
|
122
|
+
|
|
123
|
+
The key optimization for inference: compute attention scores and accumulate
|
|
124
|
+
values **in the rotated domain** without ever materializing a decompression
|
|
125
|
+
buffer.
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from turboquant.fused_attention import TQPagedAttention
|
|
129
|
+
|
|
130
|
+
attn = TQPagedAttention(tq, num_query_heads=32)
|
|
131
|
+
output = attn.forward(
|
|
132
|
+
query, # [num_seqs, num_query_heads, head_dim]
|
|
133
|
+
k_packed, # [num_blocks, block_size, num_kv_heads, packed_dim]
|
|
134
|
+
k_norms, # [num_blocks, block_size, num_kv_heads]
|
|
135
|
+
v_packed, # same layout as k_packed
|
|
136
|
+
v_norms, # same layout as k_norms
|
|
137
|
+
block_tables, # [num_seqs, max_blocks_per_seq]
|
|
138
|
+
context_lens, # [num_seqs]
|
|
139
|
+
)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
The math:
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
q_rot = Pi @ q # rotate query once
|
|
146
|
+
score_i = ||k_i|| * dot(q_rot, y_hat_k_i) / sqrt(d) # score in rotated domain
|
|
147
|
+
acc += softmax_weight_i * ||v_i|| * y_hat_v_i # accumulate rotated V
|
|
148
|
+
output = Pi^T @ normalize(acc) # rotate back once
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
This reads packed uint8 indices directly, avoids the O(seq_len * head_dim)
|
|
152
|
+
decompression buffer, and uses the even/odd nibble split to compute dot
|
|
153
|
+
products without interleaving after 4-bit unpacking.
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
## API Reference
|
|
157
|
+
|
|
158
|
+
### TurboQuant
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
class TurboQuant:
|
|
162
|
+
def __init__(self, config=None, *, head_dim=128, bits=4, seed=42,
|
|
163
|
+
use_hadamard=False, device="cuda", dtype=torch.float16,
|
|
164
|
+
use_triton=True): ...
|
|
165
|
+
|
|
166
|
+
def encode(self, x: Tensor) -> Tuple[Tensor, Tensor]: ...
|
|
167
|
+
def decode(self, packed: Tensor, norms: Tensor) -> Tensor: ...
|
|
168
|
+
def validate(self, num_vectors=10000, device=None) -> dict: ...
|
|
169
|
+
def benchmark(self, num_vectors=32768, warmup=10, iters=100) -> dict: ...
|
|
170
|
+
def compression_ratio(self) -> float: ...
|
|
171
|
+
def memory_report(self, seq_len, num_layers=32, num_kv_heads=8) -> dict: ...
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### TurboQuantConfig
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
@dataclass
|
|
178
|
+
class TurboQuantConfig:
|
|
179
|
+
head_dim: int = 128 # Must be power of 2
|
|
180
|
+
bits: int = 4 # 2, 3, or 4
|
|
181
|
+
seed: int = 42 # RNG seed for rotation matrix
|
|
182
|
+
use_hadamard: bool = False # True = Randomized Hadamard Transform
|
|
183
|
+
hadamard_rounds: int = 3 # RHT rounds (>= 3 for near-Haar)
|
|
184
|
+
device: str = "cuda"
|
|
185
|
+
dtype: torch.dtype = torch.float16
|
|
186
|
+
use_triton: bool = True # Try fused Triton kernels on GPU
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### TQPagedAttention
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
class TQPagedAttention:
|
|
193
|
+
def __init__(self, tq: TurboQuant, num_query_heads: int): ...
|
|
194
|
+
|
|
195
|
+
def forward(self, query, k_packed, k_norms, v_packed, v_norms,
|
|
196
|
+
block_tables, context_lens, block_size=16,
|
|
197
|
+
num_kv_heads=None) -> Tensor: ...
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
## Benchmark
|
|
202
|
+
|
|
203
|
+
Run the built-in benchmark to validate correctness and measure throughput:
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
python -m turboquant.bench
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
This reports:
|
|
210
|
+
- MSE vs theoretical bounds for each bit-width
|
|
211
|
+
- Encode/decode throughput (vectors/second)
|
|
212
|
+
- KV cache memory usage for common model configurations
|
|
213
|
+
- Maximum context length estimates for given GPU memory
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
## Codebook Computation
|
|
217
|
+
|
|
218
|
+
The package includes hardcoded Lloyd-Max codebooks for 1-4 bit quantization
|
|
219
|
+
of N(0,1). For custom configurations, compute codebooks from scratch:
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
from turboquant.codebook import compute_codebook_scipy
|
|
223
|
+
|
|
224
|
+
centroids, boundaries, mse = compute_codebook_scipy(d=128, bits=3)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Requires `scipy` (install with `pip install aither-kvcache[scipy]`).
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
## Reference
|
|
231
|
+
|
|
232
|
+
```
|
|
233
|
+
@article{zandieh2025turboquant,
|
|
234
|
+
title={TurboQuant: Online Vector Quantization with Near-optimal Distortion Rate},
|
|
235
|
+
author={Zandieh, Amir and Han, Insu and Daliri, Majid and Karbasi, Amin},
|
|
236
|
+
journal={arXiv preprint arXiv:2504.19874},
|
|
237
|
+
year={2025}
|
|
238
|
+
}
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
CC BY 4.0 -- see LICENSE file.
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# TurboQuant
|
|
2
|
+
|
|
3
|
+
Near-optimal KV cache quantization for LLM inference. Implements the algorithm
|
|
4
|
+
from [Zandieh et al., "TurboQuant: Online Vector Quantization with Near-optimal
|
|
5
|
+
Distortion Rate" (arXiv:2504.19874)](https://arxiv.org/abs/2504.19874).
|
|
6
|
+
|
|
7
|
+
Compresses KV cache vectors to 2-4 bits per value with MSE within 2.7x of
|
|
8
|
+
the information-theoretic lower bound. No calibration data. No retraining.
|
|
9
|
+
Works online (one vector at a time).
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install aither-kvcache
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Optional extras:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install aither-kvcache[triton]   # GPU-fused quantize/dequantize kernels
|
|
22
|
+
pip install aither-kvcache[scipy]    # Custom codebook computation via Lloyd-Max
|
|
23
|
+
pip install aither-kvcache[dev]      # pytest for running tests
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from turboquant import TurboQuant
|
|
31
|
+
|
|
32
|
+
tq = TurboQuant(head_dim=128, bits=4, device="cuda")
|
|
33
|
+
|
|
34
|
+
# Encode: FP16 vectors -> packed uint8 + norms
|
|
35
|
+
packed, norms = tq.encode(kv_vectors) # kv_vectors: [..., 128] float16
|
|
36
|
+
|
|
37
|
+
# Decode: packed representation -> reconstructed vectors
|
|
38
|
+
decoded = tq.decode(packed, norms) # decoded: [..., 128] float16
|
|
39
|
+
|
|
40
|
+
# Validate MSE against theory
|
|
41
|
+
print(tq.validate())
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
## Algorithm
|
|
46
|
+
|
|
47
|
+
TurboQuant applies three steps to each KV cache vector:
|
|
48
|
+
|
|
49
|
+
1. **Normalize** -- extract L2 norm, project onto the unit sphere S^{d-1}.
|
|
50
|
+
2. **Random rotation** -- multiply by a fixed orthogonal matrix Pi. This makes
|
|
51
|
+
each coordinate approximately Gaussian N(0, 1/d), regardless of the input
|
|
52
|
+
distribution. The rotation is data-oblivious (generated once from a seed).
|
|
53
|
+
3. **Optimal scalar quantization** -- quantize each coordinate independently
|
|
54
|
+
using a precomputed Lloyd-Max codebook for N(0, 1/d). Pack indices into
|
|
55
|
+
uint8 bytes.
|
|
56
|
+
|
|
57
|
+
Storage per vector: `ceil(d * bits / 8)` bytes for indices + 4 bytes for the
|
|
58
|
+
float32 norm.
|
|
59
|
+
|
|
60
|
+
Decoding reverses the process: unpack indices, look up codebook centroids,
|
|
61
|
+
apply inverse rotation Pi^T, rescale by the stored norm.
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
## Compression Ratios
|
|
65
|
+
|
|
66
|
+
Ratios for head_dim=128 (256 bytes at FP16, 128 bytes at FP8):
|
|
67
|
+
|
|
68
|
+
| Bits | Packed Size | Ratio vs FP16 | Ratio vs FP8 |
|
|
69
|
+
|------|-------------|---------------|--------------|
|
|
70
|
+
| 4 | 68 bytes | 3.8x | 1.9x |
|
|
71
|
+
| 3 | 52 bytes | 4.9x | 2.5x |
|
|
72
|
+
| 2 | 36 bytes | 7.1x | 3.6x |
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
## Validated MSE
|
|
76
|
+
|
|
77
|
+
MSE for unit vectors on S^{127} (d=128). Theory bounds from the paper:
|
|
78
|
+
|
|
79
|
+
| Bits | MSE (measured) | Theory Lower | Theory Upper | Ratio to LB |
|
|
80
|
+
|------|---------------|--------------|--------------|-------------|
|
|
81
|
+
| 4 | 0.0095 | 0.0039 | 0.0184 | 2.4x |
|
|
82
|
+
| 3 | 0.0345 | 0.0156 | 0.0736 | 2.2x |
|
|
83
|
+
| 2 | 0.1175 | 0.0625 | 0.2945 | 1.9x |
|
|
84
|
+
|
|
85
|
+
All measured values are within the paper's upper bound and well below the
|
|
86
|
+
worst-case ratio of 3*pi/2 = 4.71x.
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
## Fused Attention (TQPagedAttention)
|
|
90
|
+
|
|
91
|
+
The key optimization for inference: compute attention scores and accumulate
|
|
92
|
+
values **in the rotated domain** without ever materializing a decompression
|
|
93
|
+
buffer.
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from turboquant.fused_attention import TQPagedAttention
|
|
97
|
+
|
|
98
|
+
attn = TQPagedAttention(tq, num_query_heads=32)
|
|
99
|
+
output = attn.forward(
|
|
100
|
+
query, # [num_seqs, num_query_heads, head_dim]
|
|
101
|
+
k_packed, # [num_blocks, block_size, num_kv_heads, packed_dim]
|
|
102
|
+
k_norms, # [num_blocks, block_size, num_kv_heads]
|
|
103
|
+
v_packed, # same layout as k_packed
|
|
104
|
+
v_norms, # same layout as k_norms
|
|
105
|
+
block_tables, # [num_seqs, max_blocks_per_seq]
|
|
106
|
+
context_lens, # [num_seqs]
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
The math:
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
q_rot = Pi @ q # rotate query once
|
|
114
|
+
score_i = ||k_i|| * dot(q_rot, y_hat_k_i) / sqrt(d) # score in rotated domain
|
|
115
|
+
acc += softmax_weight_i * ||v_i|| * y_hat_v_i # accumulate rotated V
|
|
116
|
+
output = Pi^T @ normalize(acc) # rotate back once
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
This reads packed uint8 indices directly, avoids the O(seq_len * head_dim)
|
|
120
|
+
decompression buffer, and uses the even/odd nibble split to compute dot
|
|
121
|
+
products without interleaving after 4-bit unpacking.
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
## API Reference
|
|
125
|
+
|
|
126
|
+
### TurboQuant
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
class TurboQuant:
|
|
130
|
+
def __init__(self, config=None, *, head_dim=128, bits=4, seed=42,
|
|
131
|
+
use_hadamard=False, device="cuda", dtype=torch.float16,
|
|
132
|
+
use_triton=True): ...
|
|
133
|
+
|
|
134
|
+
def encode(self, x: Tensor) -> Tuple[Tensor, Tensor]: ...
|
|
135
|
+
def decode(self, packed: Tensor, norms: Tensor) -> Tensor: ...
|
|
136
|
+
def validate(self, num_vectors=10000, device=None) -> dict: ...
|
|
137
|
+
def benchmark(self, num_vectors=32768, warmup=10, iters=100) -> dict: ...
|
|
138
|
+
def compression_ratio(self) -> float: ...
|
|
139
|
+
def memory_report(self, seq_len, num_layers=32, num_kv_heads=8) -> dict: ...
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### TurboQuantConfig
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
@dataclass
|
|
146
|
+
class TurboQuantConfig:
|
|
147
|
+
head_dim: int = 128 # Must be power of 2
|
|
148
|
+
bits: int = 4 # 2, 3, or 4
|
|
149
|
+
seed: int = 42 # RNG seed for rotation matrix
|
|
150
|
+
use_hadamard: bool = False # True = Randomized Hadamard Transform
|
|
151
|
+
hadamard_rounds: int = 3 # RHT rounds (>= 3 for near-Haar)
|
|
152
|
+
device: str = "cuda"
|
|
153
|
+
dtype: torch.dtype = torch.float16
|
|
154
|
+
use_triton: bool = True # Try fused Triton kernels on GPU
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### TQPagedAttention
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
class TQPagedAttention:
|
|
161
|
+
def __init__(self, tq: TurboQuant, num_query_heads: int): ...
|
|
162
|
+
|
|
163
|
+
def forward(self, query, k_packed, k_norms, v_packed, v_norms,
|
|
164
|
+
block_tables, context_lens, block_size=16,
|
|
165
|
+
num_kv_heads=None) -> Tensor: ...
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
## Benchmark
|
|
170
|
+
|
|
171
|
+
Run the built-in benchmark to validate correctness and measure throughput:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
python -m turboquant.bench
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
This reports:
|
|
178
|
+
- MSE vs theoretical bounds for each bit-width
|
|
179
|
+
- Encode/decode throughput (vectors/second)
|
|
180
|
+
- KV cache memory usage for common model configurations
|
|
181
|
+
- Maximum context length estimates for given GPU memory
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
## Codebook Computation
|
|
185
|
+
|
|
186
|
+
The package includes hardcoded Lloyd-Max codebooks for 1-4 bit quantization
|
|
187
|
+
of N(0,1). For custom configurations, compute codebooks from scratch:
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
from turboquant.codebook import compute_codebook_scipy
|
|
191
|
+
|
|
192
|
+
centroids, boundaries, mse = compute_codebook_scipy(d=128, bits=3)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Requires `scipy` (install with `pip install aither-kvcache[scipy]`).
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
## Reference
|
|
199
|
+
|
|
200
|
+
```
|
|
201
|
+
@article{zandieh2025turboquant,
|
|
202
|
+
title={TurboQuant: Online Vector Quantization with Near-optimal Distortion Rate},
|
|
203
|
+
author={Zandieh, Amir and Han, Insu and Daliri, Majid and Karbasi, Amin},
|
|
204
|
+
journal={arXiv preprint arXiv:2504.19874},
|
|
205
|
+
year={2025}
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
## License
|
|
211
|
+
|
|
212
|
+
CC BY 4.0 -- see LICENSE file.
|