turboquant-mlx-full 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- turboquant_mlx_full-0.1.0/LICENSE +21 -0
- turboquant_mlx_full-0.1.0/MANIFEST.in +13 -0
- turboquant_mlx_full-0.1.0/PKG-INFO +473 -0
- turboquant_mlx_full-0.1.0/README.md +438 -0
- turboquant_mlx_full-0.1.0/__init__.py +9 -0
- turboquant_mlx_full-0.1.0/__main__.py +5 -0
- turboquant_mlx_full-0.1.0/config.py +69 -0
- turboquant_mlx_full-0.1.0/convert.py +177 -0
- turboquant_mlx_full-0.1.0/core/__init__.py +13 -0
- turboquant_mlx_full-0.1.0/core/codebook.py +150 -0
- turboquant_mlx_full-0.1.0/core/packing.py +82 -0
- turboquant_mlx_full-0.1.0/core/polar_quantize.py +141 -0
- turboquant_mlx_full-0.1.0/core/qjl.py +202 -0
- turboquant_mlx_full-0.1.0/core/rotation.py +162 -0
- turboquant_mlx_full-0.1.0/csrc/CMakeLists.txt +73 -0
- turboquant_mlx_full-0.1.0/csrc/bindings.cpp +106 -0
- turboquant_mlx_full-0.1.0/csrc/polar_kernels.metal +277 -0
- turboquant_mlx_full-0.1.0/csrc/polar_ops.cpp +300 -0
- turboquant_mlx_full-0.1.0/csrc/polar_ops.h +107 -0
- turboquant_mlx_full-0.1.0/demo_kv.py +163 -0
- turboquant_mlx_full-0.1.0/evaluate.py +431 -0
- turboquant_mlx_full-0.1.0/generate.py +233 -0
- turboquant_mlx_full-0.1.0/integration/__init__.py +0 -0
- turboquant_mlx_full-0.1.0/integration/rotation_configs.py +184 -0
- turboquant_mlx_full-0.1.0/kernels/__init__.py +3 -0
- turboquant_mlx_full-0.1.0/kernels/polar_gather_qmv.py +175 -0
- turboquant_mlx_full-0.1.0/kernels/polar_multi_gather_qmv.py +147 -0
- turboquant_mlx_full-0.1.0/kernels/polar_qmv.py +164 -0
- turboquant_mlx_full-0.1.0/layers/__init__.py +7 -0
- turboquant_mlx_full-0.1.0/layers/polar_kv_cache.py +315 -0
- turboquant_mlx_full-0.1.0/layers/polar_linear.py +242 -0
- turboquant_mlx_full-0.1.0/layers/polar_switch_linear.py +276 -0
- turboquant_mlx_full-0.1.0/pyproject.toml +86 -0
- turboquant_mlx_full-0.1.0/quantize_model.py +305 -0
- turboquant_mlx_full-0.1.0/setup.cfg +4 -0
- turboquant_mlx_full-0.1.0/setup.py +16 -0
- turboquant_mlx_full-0.1.0/test_kv_cache.py +267 -0
- turboquant_mlx_full-0.1.0/tests/test_core.py +451 -0
- turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/PKG-INFO +473 -0
- turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/SOURCES.txt +57 -0
- turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/dependency_links.txt +1 -0
- turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/entry_points.txt +2 -0
- turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/requires.txt +12 -0
- turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Manjunath Shiva
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
include README.md
|
|
2
|
+
include LICENSE
|
|
3
|
+
include pyproject.toml
|
|
4
|
+
include setup.py
|
|
5
|
+
|
|
6
|
+
recursive-include csrc *.cpp *.h *.metal CMakeLists.txt
|
|
7
|
+
|
|
8
|
+
recursive-exclude * __pycache__
|
|
9
|
+
recursive-exclude * *.pyc
|
|
10
|
+
recursive-exclude * *.pyo
|
|
11
|
+
recursive-exclude * *.so
|
|
12
|
+
recursive-exclude * *.dylib
|
|
13
|
+
recursive-exclude * *.metallib
|
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: turboquant-mlx-full
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extreme weight and KV cache compression for LLMs on Apple Silicon (MLX implementation of Google's TurboQuant)
|
|
5
|
+
Author: Manjunath Shiva
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/manjunathshiva/turboquant-mlx
|
|
8
|
+
Project-URL: Repository, https://github.com/manjunathshiva/turboquant-mlx
|
|
9
|
+
Project-URL: Issues, https://github.com/manjunathshiva/turboquant-mlx/issues
|
|
10
|
+
Keywords: mlx,quantization,llm,kv-cache,apple-silicon,turboquant,polarquant,moe,long-context
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: MacOS
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: mlx>=0.20.0
|
|
25
|
+
Requires-Dist: mlx-lm>=0.10.0
|
|
26
|
+
Requires-Dist: numpy
|
|
27
|
+
Provides-Extra: eval
|
|
28
|
+
Requires-Dist: datasets; extra == "eval"
|
|
29
|
+
Requires-Dist: transformers; extra == "eval"
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest; extra == "dev"
|
|
32
|
+
Requires-Dist: build; extra == "dev"
|
|
33
|
+
Requires-Dist: twine; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# TurboQuant-MLX
|
|
37
|
+
|
|
38
|
+
Extreme **weight** and **KV cache** compression for LLMs on Apple Silicon. MLX implementation of Google's [TurboQuant](https://arxiv.org/abs/2504.19874) (Zandieh et al., 2025) — Hadamard rotation + Lloyd-Max codebooks applied both to weights (compile time) and the KV cache (run time).
|
|
39
|
+
|
|
40
|
+
Supports dense models (LLaMA, Qwen, Mistral) and **Mixture-of-Experts** (Qwen-MoE, GPT-OSS, Qwen3.5-MoE). Compatible with hybrid attention architectures, attention sinks, sliding-window attention, and linear attention layers.
|
|
41
|
+
|
|
42
|
+
**With both weight and KV cache compression at 3-bit, GPT-OSS-120B fits its full 131K context window in 50 GB on a 64 GB MacBook — and KV cache compression actually makes generation *faster* on the 120B (8.7 vs 6.4 tok/s) because the smaller cache cuts memory bandwidth more than dequant costs.**
|
|
43
|
+
|
|
44
|
+
## Key Results — Weight Compression
|
|
45
|
+
|
|
46
|
+
| Model | Method | Bits | PPL | Size | Gen Speed (M4 Max) |
|
|
47
|
+
|-------|--------|------|-----|------|---------------------|
|
|
48
|
+
| Qwen2.5-7B | TurboQuant | 3 | 8.92 | 3.5 GB | — |
|
|
49
|
+
| Qwen2.5-7B | Affine | 3 | 13.37 | 3.3 GB | — |
|
|
50
|
+
| GPT-OSS-20B | Affine (mlx-lm) | 4 | — | 11.2 GB | 148 tok/s |
|
|
51
|
+
| GPT-OSS-20B | MXFP4 (original) | 4 | 83.04 | 12.8 GB | — |
|
|
52
|
+
| GPT-OSS-20B | TurboQuant | 4 | 72.63 | 11.2 GB | — |
|
|
53
|
+
| GPT-OSS-20B | TurboQuant | 3 | 78.60 | 9.3 GB | **73 tok/s** |
|
|
54
|
+
| GPT-OSS-120B | [Affine 4-bit (mlx-community)](https://huggingface.co/mlx-community/gpt-oss-120b-4bit) | 4 | — | 65.8 GB | *Doesn't fit 64GB* |
|
|
55
|
+
| GPT-OSS-120B | MXFP4 (original) | 4 | — | 63.5 GB | *Doesn't fit 64GB* |
|
|
56
|
+
| GPT-OSS-120B | TurboQuant | 3 | — | 48 GB | **44 tok/s** |
|
|
57
|
+
| GPT-OSS-120B | TurboQuant | 2 | — | 32 GB | 51 tok/s (poor quality) |
|
|
58
|
+
| Qwen3.5-122B-A10B | BF16 (original) | 16 | — | ~240 GB | *Doesn't fit 64GB* |
|
|
59
|
+
| **Qwen3.5-122B-A10B** | **TurboQuant** | **3** | **—** | **~50 GB** | **26.5 tok/s** |
|
|
60
|
+
|
|
61
|
+
## Key Results — KV Cache Compression
|
|
62
|
+
|
|
63
|
+
| Model | KV cache config | KV size | Speed | Notes |
|
|
64
|
+
|-------|----------------|---------|-------|-------|
|
|
65
|
+
| GPT-OSS-20B (FP16 weights) | FP16 KV | 27.0 MB | 90.6 tok/s | baseline |
|
|
66
|
+
| GPT-OSS-20B (FP16 weights) | TQ 3-bit KV | 7.79 MB | 29.9 tok/s | **3.5x cache savings** |
|
|
67
|
+
| GPT-OSS-120B (TQ 3-bit weights) | FP16 KV | 45.0 MB | 6.4 tok/s | baseline |
|
|
68
|
+
| **GPT-OSS-120B (TQ 3-bit weights)** | **TQ 3-bit KV** | **11.83 MB** | **8.7 tok/s** | **3.8x cache savings — and *faster* than FP16** |
|
|
69
|
+
| GPT-OSS-120B (TQ 3-bit weights) | TQ 4-bit KV | 12.21 MB | 16.0 tok/s | also clean |
|
|
70
|
+
| Qwen3.5-122B (TQ 3-bit weights) | FP16 KV | 161.06 MB | 5.4 tok/s | baseline |
|
|
71
|
+
| **Qwen3.5-122B (TQ 3-bit weights)** | **TQ 3-bit KV** | **150.17 MB** | **5.7 tok/s** | output identical to FP16 |
|
|
72
|
+
|
|
73
|
+
KV cache compression is projected to save ~7 GB of RAM at 131K context on GPT-OSS-120B and ~5 GB at 262K context on Qwen3.5-122B. Roundtrip cosine similarity vs FP16: 0.983 at 3-bit, 0.995 at 4-bit.
|
|
74
|
+
|
|
75
|
+
## Install
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install turboquant-mlx-full
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
The package is published as `turboquant-mlx-full` on PyPI, but importable as
|
|
82
|
+
`turboquant_mlx` (without the `-full` suffix) — this matches the original
|
|
83
|
+
project name and the examples in the Medium articles.
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
import turboquant_mlx
|
|
87
|
+
from turboquant_mlx.layers import TurboQuantKVCache, convert_cache_to_turboquant
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Requirements
|
|
91
|
+
|
|
92
|
+
- macOS with Apple Silicon (M1/M2/M3/M4)
|
|
93
|
+
- Python 3.10+
|
|
94
|
+
- 64 GB unified memory recommended for 20B+ models
|
|
95
|
+
- Xcode Command Line Tools and CMake 3.27+ (the package builds a small Metal
|
|
96
|
+
extension on install)
|
|
97
|
+
|
|
98
|
+
### Install from source (for development)
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
git clone https://github.com/manjunathshiva/turboquant-mlx.git
|
|
102
|
+
cd turboquant-mlx
|
|
103
|
+
pip install -e .
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
For evaluation utilities (perplexity benchmarking), also install the optional
|
|
107
|
+
dependencies:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
pip install "turboquant-mlx-full[eval]"
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Quick Start
|
|
114
|
+
|
|
115
|
+
### 1. Convert a model to TurboQuant format
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# Dense model (e.g., LLaMA 3.2 1B at 3-bit)
|
|
119
|
+
python -m turboquant_mlx.convert \
|
|
120
|
+
--hf-path meta-llama/Llama-3.2-1B \
|
|
121
|
+
--mlx-path ./llama-3.2-1b-tq3 \
|
|
122
|
+
--bits 3 --group-size 64
|
|
123
|
+
|
|
124
|
+
# MoE model (e.g., GPT-OSS-20B at 2-bit)
|
|
125
|
+
python -m turboquant_mlx.convert \
|
|
126
|
+
--hf-path openai/gpt-oss-20b \
|
|
127
|
+
--mlx-path ./gpt-oss-20b-tq2 \
|
|
128
|
+
--bits 2 --group-size 64
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### 2. Generate text
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
python -m turboquant_mlx.generate \
|
|
135
|
+
--model ./gpt-oss-20b-tq2 \
|
|
136
|
+
--prompt "Why is the sky blue? Explain in simple terms." \
|
|
137
|
+
--max-tokens 200
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### 3. Evaluate perplexity
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
python -m turboquant_mlx.evaluate \
|
|
144
|
+
--hf-path openai/gpt-oss-20b \
|
|
145
|
+
--bits 2 3 4 \
|
|
146
|
+
--num-samples 256 --seq-len 512
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### 4. Generate with KV cache compression
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
# Standard model + KV cache compression
|
|
153
|
+
python -m turboquant_mlx.demo_kv \
|
|
154
|
+
--model openai/gpt-oss-20b \
|
|
155
|
+
--prompt "Why is the sky blue?" \
|
|
156
|
+
--max-tokens 200 --tq-bits 3
|
|
157
|
+
|
|
158
|
+
# TQ-compressed model + KV cache compression (full stack)
|
|
159
|
+
python -m turboquant_mlx.demo_kv \
|
|
160
|
+
--model ./gpt-oss-120b-tq3 \
|
|
161
|
+
--prompt "Why is the sky blue?" \
|
|
162
|
+
--max-tokens 200 --tq-bits 3
|
|
163
|
+
|
|
164
|
+
# Side-by-side comparison: FP16 KV vs TurboQuant KV
|
|
165
|
+
python -m turboquant_mlx.demo_kv \
|
|
166
|
+
--model ./gpt-oss-120b-tq3 \
|
|
167
|
+
--prompt "Why is the sky blue?" \
|
|
168
|
+
--max-tokens 200 --compare
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## KV Cache Compression
|
|
174
|
+
|
|
175
|
+
TurboQuant KV cache compression applies the same Hadamard rotation + Lloyd-Max codebook pipeline to KV vectors at runtime. The compressed cache is dequantized to float16 only when attention needs it, so it routes through MLX's standard `scaled_dot_product_attention` and is compatible with attention sinks, sliding windows, and linear attention layers.
|
|
176
|
+
|
|
177
|
+
### Programmatic usage
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
from turboquant_mlx.layers import convert_cache_to_turboquant
|
|
181
|
+
from mlx_lm.models.cache import make_prompt_cache
|
|
182
|
+
|
|
183
|
+
# 1. Process the prompt with FP16 KV cache (exact)
|
|
184
|
+
cache = make_prompt_cache(model)
|
|
185
|
+
model(prompt_tokens, cache=cache)
|
|
186
|
+
|
|
187
|
+
# 2. Convert to TurboQuant KV cache for generation
|
|
188
|
+
cache = convert_cache_to_turboquant(cache, tq_bits=3, group_size=64)
|
|
189
|
+
|
|
190
|
+
# 3. Continue generation — cache is now compressed
|
|
191
|
+
for token in generate_loop(model, cache):
|
|
192
|
+
...
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Choosing a bit-width
|
|
196
|
+
|
|
197
|
+
| Weights | KV cache | Recommendation |
|
|
198
|
+
|---------|----------|----------------|
|
|
199
|
+
| FP16 / BF16 | TQ 3-bit | Default sweet spot at every model size |
|
|
200
|
+
| TQ-compressed (~20B) | TQ 4-bit | Use 4-bit when stacking on TQ weights — small models have a tighter noise budget |
|
|
201
|
+
| **TQ-compressed (100B+)** | **TQ 3-bit** | **3-bit on 3-bit works cleanly on GPT-OSS-120B and Qwen3.5-122B — 100B+ models have enough redundancy to absorb the stacked noise** |
|
|
202
|
+
|
|
203
|
+
### The speed flip
|
|
204
|
+
|
|
205
|
+
On small fast models (~20B), KV cache compression is a memory-vs-speed tradeoff: the dequant overhead dominates because the model is fast to begin with. On large slow models (100B+), the roughly 4x smaller KV cache saves more memory-bandwidth time than dequantization adds — generation is *faster* than the FP16 baseline:
|
|
206
|
+
|
|
207
|
+
| Model | FP16 KV | TQ 3-bit KV | Direction |
|
|
208
|
+
|-------|---------|-------------|-----------|
|
|
209
|
+
| GPT-OSS-20B | 90.6 tok/s | 29.9 tok/s | TQ is 3x **slower** |
|
|
210
|
+
| GPT-OSS-120B | 6.4 tok/s | 8.7 tok/s | TQ is 1.4x **faster** |
|
|
211
|
+
|
|
212
|
+
### Compatibility
|
|
213
|
+
|
|
214
|
+
| Feature | Supported | Notes |
|
|
215
|
+
|---------|-----------|-------|
|
|
216
|
+
| Attention sinks | Yes | GPT-OSS sink vectors flow through standard SDPA |
|
|
217
|
+
| Sliding window attention | Yes | `RotatingKVCache` layers are left untouched |
|
|
218
|
+
| Linear attention | Yes | `ArraysCache` (Qwen3.5 GatedDeltaNet) is left untouched |
|
|
219
|
+
| Hybrid architectures | Yes | Per-layer cache type is preserved |
|
|
220
|
+
| Prompt-first conversion | Yes | Process prompt with FP16, convert before generation |
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## Running GPT-OSS MoE Models on Apple Silicon
|
|
225
|
+
|
|
226
|
+
### GPT-OSS-20B (21B total, 32 experts, 3.6B active)
|
|
227
|
+
|
|
228
|
+
**Hardware:** Apple M4 Max 64GB (or any Apple Silicon with 16GB+ unified memory at 3-bit)
|
|
229
|
+
|
|
230
|
+
#### Step 1: Convert to TurboQuant 3-bit (recommended)
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
python -m turboquant_mlx.convert \
|
|
234
|
+
--hf-path openai/gpt-oss-20b \
|
|
235
|
+
--mlx-path ./gpt-oss-20b-tq3 \
|
|
236
|
+
--bits 3 --group-size 32
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
**Model size:** 9.3 GB (vs 12.8 GB MXFP4 original — 28% smaller, lower perplexity)
|
|
240
|
+
|
|
241
|
+
The converter automatically:
|
|
242
|
+
- Detects MoE architecture (SwitchLinear / QuantizedSwitchLinear layers)
|
|
243
|
+
- Dequantizes MXFP4 expert weights to float
|
|
244
|
+
- Applies Hadamard rotation + Lloyd-Max codebook quantization
|
|
245
|
+
- Keeps router weights and attention at full precision
|
|
246
|
+
- Handles blockwise Hadamard for 2880-dim experts (2880 = 9 x 320)
|
|
247
|
+
|
|
248
|
+
#### Step 2: Generate text
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
python -m turboquant_mlx.generate \
|
|
252
|
+
--model ./gpt-oss-20b-tq3 \
|
|
253
|
+
--prompt "Explain quantum entanglement to a 10-year-old." \
|
|
254
|
+
--max-tokens 256
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
**Expected:** ~73 tok/s generation, ~85 tok/s prefill on M4 Max
|
|
258
|
+
|
|
259
|
+
#### Step 3: Run a quick quality check
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
python -m turboquant_mlx.evaluate \
|
|
263
|
+
--hf-path openai/gpt-oss-20b \
|
|
264
|
+
--bits 3 \
|
|
265
|
+
--no-affine --no-qjl \
|
|
266
|
+
--num-samples 64 --seq-len 512
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
#### All bit-widths for GPT-OSS-20B
|
|
270
|
+
|
|
271
|
+
| Method | Bits | Size | Peak RAM | Gen Speed | Quality |
|
|
272
|
+
|--------|------|------|----------|-----------|---------|
|
|
273
|
+
| Affine (mlx-lm) | 4 | 11.2 GB | ~14 GB | 148 tok/s | Coherent (but see note below) |
|
|
274
|
+
| TurboQuant | 4 | 11.2 GB | ~14 GB | — | Best (PPL 72.63, beats MXFP4) |
|
|
275
|
+
| **TurboQuant** | **3** | **9.3 GB** | **~12 GB** | **73 tok/s** | **Recommended (PPL 78.60, beats MXFP4, coherent)** |
|
|
276
|
+
| TurboQuant | 2 | 7.5 GB | ~10 GB | — | Poor (incoherent generation on pre-quantized models) |
|
|
277
|
+
|
|
278
|
+
> **Speed vs quality tradeoff:** Affine 4-bit is ~2x faster on the 20B model due to simpler dequantization, but TurboQuant 3-bit is 28% smaller with lower perplexity than both affine 4-bit and OpenAI's own MXFP4. Crucially, affine 4-bit **cannot scale to 120B** on 64GB hardware — TurboQuant 3-bit is the only option there.
|
|
279
|
+
|
|
280
|
+
```bash
|
|
281
|
+
# 4-bit (best quality, beats OpenAI's MXFP4)
|
|
282
|
+
python -m turboquant_mlx.convert \
|
|
283
|
+
--hf-path openai/gpt-oss-20b \
|
|
284
|
+
--mlx-path ./gpt-oss-20b-tq4 \
|
|
285
|
+
--bits 4 --group-size 32
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
### GPT-OSS-120B (117B total, 128 experts, ~5.1B active)
|
|
291
|
+
|
|
292
|
+
**Hardware:** Apple M4 Max 64GB — neither the original MXFP4 (63.5 GB) nor the [mlx-community 4-bit affine](https://huggingface.co/mlx-community/gpt-oss-120b-4bit) (65.8 GB) fits on a 64GB machine. TurboQuant 3-bit is the only way to run this model on consumer hardware.
|
|
293
|
+
|
|
294
|
+
#### Step 1: Convert to TurboQuant 3-bit (recommended)
|
|
295
|
+
|
|
296
|
+
```bash
|
|
297
|
+
python -m turboquant_mlx.convert \
|
|
298
|
+
--hf-path openai/gpt-oss-120b \
|
|
299
|
+
--mlx-path ./gpt-oss-120b-tq3 \
|
|
300
|
+
--bits 3 --group-size 64
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
**Model size:** 48 GB
|
|
304
|
+
|
|
305
|
+
> **Note:** Conversion requires temporarily loading the full model. With 120B parameters, peak memory during conversion may reach ~50-55 GB. On a 64 GB machine this is tight — close all other applications before running. The converter processes layers sequentially and frees memory after each expert is quantized.
|
|
306
|
+
|
|
307
|
+
#### Step 2: Generate text
|
|
308
|
+
|
|
309
|
+
```bash
|
|
310
|
+
python -m turboquant_mlx.generate \
|
|
311
|
+
--model ./gpt-oss-120b-tq3 \
|
|
312
|
+
--prompt "Explain quantum computing in simple terms." \
|
|
313
|
+
--max-tokens 200
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
**Expected:** ~44 tok/s generation, ~9.5 tok/s prefill, 52 GB peak memory on M4 Max 64GB
|
|
317
|
+
|
|
318
|
+
#### Step 3: Quick quality check
|
|
319
|
+
|
|
320
|
+
```bash
|
|
321
|
+
python -m turboquant_mlx.evaluate \
|
|
322
|
+
--hf-path openai/gpt-oss-120b \
|
|
323
|
+
--bits 3 \
|
|
324
|
+
--no-affine --no-qjl \
|
|
325
|
+
--num-samples 32 --seq-len 512
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
#### All bit-widths for GPT-OSS-120B
|
|
329
|
+
|
|
330
|
+
| Method | Bits | Size | Peak RAM | Gen Speed | Fits 64 GB? | Quality |
|
|
331
|
+
|--------|------|------|----------|-----------|-------------|---------|
|
|
332
|
+
| [mlx-community 4-bit](https://huggingface.co/mlx-community/gpt-oss-120b-4bit) | 4 (affine) | 65.8 GB | — | — | **No** | — |
|
|
333
|
+
| MXFP4 (original) | 4 (mxfp) | 63.5 GB | ~70 GB | — | **No** | — |
|
|
334
|
+
| **TurboQuant** | **3** | **48 GB** | **52.3 GB** | **44 tok/s** | **Yes** | **Coherent, well-structured** |
|
|
335
|
+
| TurboQuant | 2 | 32 GB | 34.9 GB | 51 tok/s | Yes | Incoherent after ~20 tokens |
|
|
336
|
+
|
|
337
|
+
> Neither the original MXFP4 format (63.5 GB) nor the mlx-community affine 4-bit re-quantization (65.8 GB) fits on a 64GB Mac. TurboQuant 3-bit (48 GB) is the **only** way to run GPT-OSS-120B on consumer hardware — and at 44 tok/s, it runs at interactive speed. At 2-bit, the model fits easily but generation quality degrades rapidly — **3-bit is the minimum for coherent output on pre-quantized MoE models.**
|
|
338
|
+
|
|
339
|
+
---
|
|
340
|
+
|
|
341
|
+
### Qwen3.5-122B-A10B (122B total, 256 experts with 8 active per token, ~10B active parameters)
|
|
342
|
+
|
|
343
|
+
**Hardware:** Apple M4 Max 64GB — the original BF16 model is ~240 GB. TurboQuant 3-bit compresses it to ~50 GB, fitting on a 64GB machine.
|
|
344
|
+
|
|
345
|
+
This is a brand-new architecture featuring **256 MoE experts** (the most of any model we've tested), **hybrid attention** (GatedDeltaNet linear attention + standard softmax attention), and **thinking/reasoning** capability. The model also has a shared expert per layer alongside the routed experts.
|
|
346
|
+
|
|
347
|
+
#### Step 1: Convert to TurboQuant 3-bit
|
|
348
|
+
|
|
349
|
+
```bash
|
|
350
|
+
python -m turboquant_mlx.convert \
|
|
351
|
+
--hf-path Qwen/Qwen3.5-122B-A10B \
|
|
352
|
+
--mlx-path ./qwen3.5-122b-tq3 \
|
|
353
|
+
--bits 3 --group-size 64
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
**Model size:** ~50 GB | **Conversion time:** ~90 seconds
|
|
357
|
+
|
|
358
|
+
> **Note:** Conversion requires ~55 GB peak memory. Close all other applications before running. The converter uses memory-efficient processing — each expert layer is replaced immediately after quantization with aggressive garbage collection to handle the 256 experts per layer.
|
|
359
|
+
|
|
360
|
+
#### Step 2: Generate text
|
|
361
|
+
|
|
362
|
+
```bash
|
|
363
|
+
python -m turboquant_mlx.generate \
|
|
364
|
+
--model ./qwen3.5-122b-tq3 \
|
|
365
|
+
--prompt "Why is the sky blue? Explain in simple terms." \
|
|
366
|
+
--max-tokens 200
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
**Expected:** ~26.5 tok/s generation, 55 GB peak memory on M4 Max 64GB
|
|
370
|
+
|
|
371
|
+
#### Benchmark
|
|
372
|
+
|
|
373
|
+
| Method | Bits | Size | Peak RAM | Gen Speed | Fits 64 GB? | Quality |
|
|
374
|
+
|--------|------|------|----------|-----------|-------------|---------|
|
|
375
|
+
| BF16 (original) | 16 | ~240 GB | — | — | **No** | — |
|
|
376
|
+
| **TurboQuant** | **3** | **~50 GB** | **54.9 GB** | **26.5 tok/s** | **Yes** | **Coherent reasoning with structured thinking** |
|
|
377
|
+
|
|
378
|
+
> Qwen3.5-122B-A10B is the largest and most complex model TurboQuant has been tested on: 122B parameters, 256 experts (8 active per token), hybrid GatedDeltaNet + softmax attention, and a shared expert per MoE layer. At 3-bit, the model produces structured reasoning with proper analysis steps — demonstrating that TurboQuant preserves thinking capability at extreme compression.
|
|
379
|
+
|
|
380
|
+
---
|
|
381
|
+
|
|
382
|
+
## How It Works
|
|
383
|
+
|
|
384
|
+
TurboQuant is a two-stage, **calibration-free** quantization pipeline:
|
|
385
|
+
|
|
386
|
+
1. **Hadamard Rotation** — Multiply weights by a randomized Hadamard matrix, transforming any weight distribution into a near-Gaussian shape. This is data-oblivious (no calibration data needed).
|
|
387
|
+
|
|
388
|
+
2. **Lloyd-Max Codebook** — Apply information-theoretically optimal quantization for Gaussian distributions. The codebook is a mathematical constant, precomputed once.
|
|
389
|
+
|
|
390
|
+
The result: near-zero quality loss at 3-bit, and usable 2-bit quantization where standard affine quantization completely breaks down.
|
|
391
|
+
|
|
392
|
+
For MoE models, all experts within a layer share the same rotation signs and codebook, keeping storage efficient.
|
|
393
|
+
|
|
394
|
+
## CLI Options
|
|
395
|
+
|
|
396
|
+
```
|
|
397
|
+
python -m turboquant_mlx.convert --help
|
|
398
|
+
|
|
399
|
+
Options:
|
|
400
|
+
--hf-path TEXT HuggingFace model path or local path (required)
|
|
401
|
+
--mlx-path TEXT Output directory (default: mlx_model)
|
|
402
|
+
--bits {2,3,4} Quantization bit-width (default: 3)
|
|
403
|
+
--group-size {32,64,128} Elements per quantization group (default: 64)
|
|
404
|
+
--rotation TEXT Rotation method: hadamard, blockwise_hadamard, none
|
|
405
|
+
--use-qjl Enable 1-bit QJL residual correction (+1 bit overhead)
|
|
406
|
+
--dtype TEXT Model dtype before quantization: float16, bfloat16
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
## Supported Architectures
|
|
410
|
+
|
|
411
|
+
| Architecture | Model Type | MoE | Status |
|
|
412
|
+
|-------------|-----------|-----|--------|
|
|
413
|
+
| LLaMA / Llama 3 | `llama` | No | Tested |
|
|
414
|
+
| Qwen2 / Qwen2.5 | `qwen2` | No | Tested |
|
|
415
|
+
| Qwen3.5 | `qwen3_5` | No | Tested |
|
|
416
|
+
| Mistral | `mistral` | No | Tested |
|
|
417
|
+
| Qwen1.5-MoE | `qwen2_moe` | Yes | Tested |
|
|
418
|
+
| GPT-OSS | `gpt_oss` | Yes | Tested |
|
|
419
|
+
| Qwen3.5-MoE | `qwen3_5_moe` | Yes (256 experts) | Tested (122B) |
|
|
420
|
+
|
|
421
|
+
## Project Structure
|
|
422
|
+
|
|
423
|
+
```
|
|
424
|
+
turboquant_mlx/
|
|
425
|
+
config.py # TurboQuantConfig
|
|
426
|
+
convert.py # CLI: HF model -> TurboQuant MLX
|
|
427
|
+
generate.py # Text generation with TurboQuant models
|
|
428
|
+
evaluate.py # Perplexity evaluation
|
|
429
|
+
quantize_model.py # Model traversal & layer replacement
|
|
430
|
+
demo_kv.py # Streaming generation demo with KV cache compression
|
|
431
|
+
test_kv_cache.py # KV cache roundtrip + integration tests
|
|
432
|
+
core/
|
|
433
|
+
codebook.py # Lloyd-Max codebooks for Gaussian
|
|
434
|
+
rotation.py # Randomized Hadamard rotation
|
|
435
|
+
polar_quantize.py # Rotate + codebook quantize
|
|
436
|
+
packing.py # Bit-packing into uint32
|
|
437
|
+
qjl.py # QJL residual correction
|
|
438
|
+
layers/
|
|
439
|
+
polar_linear.py # PolarQuantizedLinear (dense)
|
|
440
|
+
polar_switch_linear.py # PolarQuantizedSwitchLinear (MoE)
|
|
441
|
+
polar_kv_cache.py # TurboQuantKVCache (runtime KV compression)
|
|
442
|
+
kernels/
|
|
443
|
+
polar_qmv.py # Fused Metal kernel (dense decode)
|
|
444
|
+
polar_gather_qmv.py # Fused Metal kernel (MoE shared input)
|
|
445
|
+
polar_multi_gather_qmv.py # Fused Metal kernel (MoE per-expert input)
|
|
446
|
+
csrc/
|
|
447
|
+
polar_kernels.metal # Native Metal shaders (SIMD group reduction)
|
|
448
|
+
polar_ops.h/cpp # C++ MLX Primitive classes
|
|
449
|
+
bindings.cpp # nanobind Python bindings
|
|
450
|
+
CMakeLists.txt # Build system
|
|
451
|
+
integration/
|
|
452
|
+
rotation_configs.py # Per-architecture rotation configs
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
## Citation
|
|
456
|
+
|
|
457
|
+
```bibtex
|
|
458
|
+
@misc{turboquant_mlx,
|
|
459
|
+
title={TurboQuant-MLX: Extreme Weight and KV Cache Compression for Apple Silicon},
|
|
460
|
+
year={2025},
|
|
461
|
+
note={MLX implementation of TurboQuant (Zandieh et al., 2025) for both weight quantization and runtime KV cache compression}
|
|
462
|
+
}
|
|
463
|
+
```
|
|
464
|
+
|
|
465
|
+
## License
|
|
466
|
+
|
|
467
|
+
MIT
|
|
468
|
+
|
|
469
|
+
## Acknowledgments
|
|
470
|
+
|
|
471
|
+
- [TurboQuant](https://arxiv.org/abs/2504.19874) — Zandieh, Daliri, Hadian, Mirrokni (2025)
|
|
472
|
+
- [MLX](https://github.com/ml-explore/mlx) — Apple Machine Learning Research
|
|
473
|
+
- [mlx-lm](https://github.com/ml-explore/mlx-lm) — MLX language model utilities
|