turboquant-mlx-full 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. turboquant_mlx_full-0.1.0/LICENSE +21 -0
  2. turboquant_mlx_full-0.1.0/MANIFEST.in +13 -0
  3. turboquant_mlx_full-0.1.0/PKG-INFO +473 -0
  4. turboquant_mlx_full-0.1.0/README.md +438 -0
  5. turboquant_mlx_full-0.1.0/__init__.py +9 -0
  6. turboquant_mlx_full-0.1.0/__main__.py +5 -0
  7. turboquant_mlx_full-0.1.0/config.py +69 -0
  8. turboquant_mlx_full-0.1.0/convert.py +177 -0
  9. turboquant_mlx_full-0.1.0/core/__init__.py +13 -0
  10. turboquant_mlx_full-0.1.0/core/codebook.py +150 -0
  11. turboquant_mlx_full-0.1.0/core/packing.py +82 -0
  12. turboquant_mlx_full-0.1.0/core/polar_quantize.py +141 -0
  13. turboquant_mlx_full-0.1.0/core/qjl.py +202 -0
  14. turboquant_mlx_full-0.1.0/core/rotation.py +162 -0
  15. turboquant_mlx_full-0.1.0/csrc/CMakeLists.txt +73 -0
  16. turboquant_mlx_full-0.1.0/csrc/bindings.cpp +106 -0
  17. turboquant_mlx_full-0.1.0/csrc/polar_kernels.metal +277 -0
  18. turboquant_mlx_full-0.1.0/csrc/polar_ops.cpp +300 -0
  19. turboquant_mlx_full-0.1.0/csrc/polar_ops.h +107 -0
  20. turboquant_mlx_full-0.1.0/demo_kv.py +163 -0
  21. turboquant_mlx_full-0.1.0/evaluate.py +431 -0
  22. turboquant_mlx_full-0.1.0/generate.py +233 -0
  23. turboquant_mlx_full-0.1.0/integration/__init__.py +0 -0
  24. turboquant_mlx_full-0.1.0/integration/rotation_configs.py +184 -0
  25. turboquant_mlx_full-0.1.0/kernels/__init__.py +3 -0
  26. turboquant_mlx_full-0.1.0/kernels/polar_gather_qmv.py +175 -0
  27. turboquant_mlx_full-0.1.0/kernels/polar_multi_gather_qmv.py +147 -0
  28. turboquant_mlx_full-0.1.0/kernels/polar_qmv.py +164 -0
  29. turboquant_mlx_full-0.1.0/layers/__init__.py +7 -0
  30. turboquant_mlx_full-0.1.0/layers/polar_kv_cache.py +315 -0
  31. turboquant_mlx_full-0.1.0/layers/polar_linear.py +242 -0
  32. turboquant_mlx_full-0.1.0/layers/polar_switch_linear.py +276 -0
  33. turboquant_mlx_full-0.1.0/pyproject.toml +86 -0
  34. turboquant_mlx_full-0.1.0/quantize_model.py +305 -0
  35. turboquant_mlx_full-0.1.0/setup.cfg +4 -0
  36. turboquant_mlx_full-0.1.0/setup.py +16 -0
  37. turboquant_mlx_full-0.1.0/test_kv_cache.py +267 -0
  38. turboquant_mlx_full-0.1.0/tests/test_core.py +451 -0
  39. turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/PKG-INFO +473 -0
  40. turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/SOURCES.txt +57 -0
  41. turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/dependency_links.txt +1 -0
  42. turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/entry_points.txt +2 -0
  43. turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/requires.txt +12 -0
  44. turboquant_mlx_full-0.1.0/turboquant_mlx_full.egg-info/top_level.txt +1 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Manjunath Shiva
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,13 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ include setup.py
5
+
6
+ recursive-include csrc *.cpp *.h *.metal CMakeLists.txt
7
+
8
+ recursive-exclude * __pycache__
9
+ recursive-exclude * *.pyc
10
+ recursive-exclude * *.pyo
11
+ recursive-exclude * *.so
12
+ recursive-exclude * *.dylib
13
+ recursive-exclude * *.metallib
@@ -0,0 +1,473 @@
1
+ Metadata-Version: 2.4
2
+ Name: turboquant-mlx-full
3
+ Version: 0.1.0
4
+ Summary: Extreme weight and KV cache compression for LLMs on Apple Silicon (MLX implementation of Google's TurboQuant)
5
+ Author: Manjunath Shiva
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/manjunathshiva/turboquant-mlx
8
+ Project-URL: Repository, https://github.com/manjunathshiva/turboquant-mlx
9
+ Project-URL: Issues, https://github.com/manjunathshiva/turboquant-mlx/issues
10
+ Keywords: mlx,quantization,llm,kv-cache,apple-silicon,turboquant,polarquant,moe,long-context
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: MacOS
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: mlx>=0.20.0
25
+ Requires-Dist: mlx-lm>=0.10.0
26
+ Requires-Dist: numpy
27
+ Provides-Extra: eval
28
+ Requires-Dist: datasets; extra == "eval"
29
+ Requires-Dist: transformers; extra == "eval"
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest; extra == "dev"
32
+ Requires-Dist: build; extra == "dev"
33
+ Requires-Dist: twine; extra == "dev"
34
+ Dynamic: license-file
35
+
36
+ # TurboQuant-MLX
37
+
38
+ Extreme **weight** and **KV cache** compression for LLMs on Apple Silicon. MLX implementation of Google's [TurboQuant](https://arxiv.org/abs/2504.19874) (Zandieh et al., 2025) — Hadamard rotation + Lloyd-Max codebooks applied both to weights (compile time) and the KV cache (run time).
39
+
40
+ Supports dense models (LLaMA, Qwen, Mistral) and **Mixture-of-Experts** (Qwen-MoE, GPT-OSS, Qwen3.5-MoE). Compatible with hybrid attention architectures, attention sinks, sliding-window attention, and linear attention layers.
41
+
42
+ **With both weight and KV cache compression at 3-bit, GPT-OSS-120B fits its full 131K context window in 50 GB on a 64 GB MacBook — and KV cache compression actually makes generation *faster* on the 120B (8.7 vs 6.4 tok/s) because the smaller cache cuts memory bandwidth more than dequant costs.**
43
+
44
+ ## Key Results — Weight Compression
45
+
46
+ | Model | Method | Bits | PPL | Size | Gen Speed (M4 Max) |
47
+ |-------|--------|------|-----|------|---------------------|
48
+ | Qwen2.5-7B | TurboQuant | 3 | 8.92 | 3.5 GB | — |
49
+ | Qwen2.5-7B | Affine | 3 | 13.37 | 3.3 GB | — |
50
+ | GPT-OSS-20B | Affine (mlx-lm) | 4 | — | 11.2 GB | 148 tok/s |
51
+ | GPT-OSS-20B | MXFP4 (original) | 4 | 83.04 | 12.8 GB | — |
52
+ | GPT-OSS-20B | TurboQuant | 4 | 72.63 | 11.2 GB | — |
53
+ | GPT-OSS-20B | TurboQuant | 3 | 78.60 | 9.3 GB | **73 tok/s** |
54
+ | GPT-OSS-120B | [Affine 4-bit (mlx-community)](https://huggingface.co/mlx-community/gpt-oss-120b-4bit) | 4 | — | 65.8 GB | *Doesn't fit 64GB* |
55
+ | GPT-OSS-120B | MXFP4 (original) | 4 | — | 63.5 GB | *Doesn't fit 64GB* |
56
+ | GPT-OSS-120B | TurboQuant | 3 | — | 48 GB | **44 tok/s** |
57
+ | GPT-OSS-120B | TurboQuant | 2 | — | 32 GB | 51 tok/s (poor quality) |
58
+ | Qwen3.5-122B-A10B | BF16 (original) | 16 | — | ~240 GB | *Doesn't fit 64GB* |
59
+ | **Qwen3.5-122B-A10B** | **TurboQuant** | **3** | **—** | **~50 GB** | **26.5 tok/s** |
60
+
61
+ ## Key Results — KV Cache Compression
62
+
63
+ | Model | KV cache config | KV size | Speed | Notes |
64
+ |-------|----------------|---------|-------|-------|
65
+ | GPT-OSS-20B (FP16 weights) | FP16 KV | 27.0 MB | 90.6 tok/s | baseline |
66
+ | GPT-OSS-20B (FP16 weights) | TQ 3-bit KV | 7.79 MB | 29.9 tok/s | **3.5x cache savings** |
67
+ | GPT-OSS-120B (TQ 3-bit weights) | FP16 KV | 45.0 MB | 6.4 tok/s | baseline |
68
+ | **GPT-OSS-120B (TQ 3-bit weights)** | **TQ 3-bit KV** | **11.83 MB** | **8.7 tok/s** | **3.8x cache savings — and *faster* than FP16** |
69
+ | GPT-OSS-120B (TQ 3-bit weights) | TQ 4-bit KV | 12.21 MB | 16.0 tok/s | also clean |
70
+ | Qwen3.5-122B (TQ 3-bit weights) | FP16 KV | 161.06 MB | 5.4 tok/s | baseline |
71
+ | **Qwen3.5-122B (TQ 3-bit weights)** | **TQ 3-bit KV** | **150.17 MB** | **5.7 tok/s** | output identical to FP16 |
72
+
73
+ KV cache compression is projected to save ~7 GB of RAM at 131K context on GPT-OSS-120B and ~5 GB at 262K context on Qwen3.5-122B. Roundtrip cosine similarity vs FP16: 0.983 at 3-bit, 0.995 at 4-bit.
74
+
75
+ ## Install
76
+
77
+ ```bash
78
+ pip install turboquant-mlx-full
79
+ ```
80
+
81
+ The package is published as `turboquant-mlx-full` on PyPI, but is imported as
82
+ `turboquant_mlx` (without the `-full` suffix) — this matches the original
83
+ project name and the examples in the Medium articles.
84
+
85
+ ```python
86
+ import turboquant_mlx
87
+ from turboquant_mlx.layers import TurboQuantKVCache, convert_cache_to_turboquant
88
+ ```
89
+
90
+ ### Requirements
91
+
92
+ - macOS with Apple Silicon (M1/M2/M3/M4)
93
+ - Python 3.10+
94
+ - 64 GB unified memory recommended for 20B+ models
95
+ - Xcode Command Line Tools and CMake 3.27+ (the package builds a small Metal
96
+ extension on install)
97
+
98
+ ### Install from source (for development)
99
+
100
+ ```bash
101
+ git clone https://github.com/manjunathshiva/turboquant-mlx.git
102
+ cd turboquant-mlx
103
+ pip install -e .
104
+ ```
105
+
106
+ For evaluation utilities (perplexity benchmarking), also install the optional
107
+ dependencies:
108
+
109
+ ```bash
110
+ pip install "turboquant-mlx-full[eval]"
111
+ ```
112
+
113
+ ## Quick Start
114
+
115
+ ### 1. Convert a model to TurboQuant format
116
+
117
+ ```bash
118
+ # Dense model (e.g., LLaMA 3.2 1B at 3-bit)
119
+ python -m turboquant_mlx.convert \
120
+ --hf-path meta-llama/Llama-3.2-1B \
121
+ --mlx-path ./llama-3.2-1b-tq3 \
122
+ --bits 3 --group-size 64
123
+
124
+ # MoE model (e.g., GPT-OSS-20B at 2-bit; smallest size, but use --bits 3 for coherent output)
125
+ python -m turboquant_mlx.convert \
126
+ --hf-path openai/gpt-oss-20b \
127
+ --mlx-path ./gpt-oss-20b-tq2 \
128
+ --bits 2 --group-size 64
129
+ ```
130
+
131
+ ### 2. Generate text
132
+
133
+ ```bash
134
+ python -m turboquant_mlx.generate \
135
+ --model ./gpt-oss-20b-tq2 \
136
+ --prompt "Why is the sky blue? Explain in simple terms." \
137
+ --max-tokens 200
138
+ ```
139
+
140
+ ### 3. Evaluate perplexity
141
+
142
+ ```bash
143
+ python -m turboquant_mlx.evaluate \
144
+ --hf-path openai/gpt-oss-20b \
145
+ --bits 2 3 4 \
146
+ --num-samples 256 --seq-len 512
147
+ ```
148
+
149
+ ### 4. Generate with KV cache compression
150
+
151
+ ```bash
152
+ # Standard model + KV cache compression
153
+ python -m turboquant_mlx.demo_kv \
154
+ --model openai/gpt-oss-20b \
155
+ --prompt "Why is the sky blue?" \
156
+ --max-tokens 200 --tq-bits 3
157
+
158
+ # TQ-compressed model + KV cache compression (full stack)
159
+ python -m turboquant_mlx.demo_kv \
160
+ --model ./gpt-oss-120b-tq3 \
161
+ --prompt "Why is the sky blue?" \
162
+ --max-tokens 200 --tq-bits 3
163
+
164
+ # Side-by-side comparison: FP16 KV vs TurboQuant KV
165
+ python -m turboquant_mlx.demo_kv \
166
+ --model ./gpt-oss-120b-tq3 \
167
+ --prompt "Why is the sky blue?" \
168
+ --max-tokens 200 --compare
169
+ ```
170
+
171
+ ---
172
+
173
+ ## KV Cache Compression
174
+
175
+ TurboQuant KV cache compression applies the same Hadamard rotation + Lloyd-Max codebook pipeline to KV vectors at runtime. The compressed cache is dequantized to float16 only when attention needs it, so it routes through MLX's standard `scaled_dot_product_attention` and is compatible with attention sinks, sliding windows, and linear attention layers.
176
+
177
+ ### Programmatic usage
178
+
179
+ ```python
180
+ from turboquant_mlx.layers import convert_cache_to_turboquant
181
+ from mlx_lm.models.cache import make_prompt_cache
182
+
183
+ # 1. Process the prompt with FP16 KV cache (exact)
184
+ cache = make_prompt_cache(model)
185
+ model(prompt_tokens, cache=cache)
186
+
187
+ # 2. Convert to TurboQuant KV cache for generation
188
+ cache = convert_cache_to_turboquant(cache, tq_bits=3, group_size=64)
189
+
190
+ # 3. Continue generation — cache is now compressed
191
+ for token in generate_loop(model, cache):
192
+ ...
193
+ ```
194
+
195
+ ### Choosing a bit-width
196
+
197
+ | Weights | KV cache | Recommendation |
198
+ |---------|----------|----------------|
199
+ | FP16 / BF16 | TQ 3-bit | Default sweet spot at every model size |
200
+ | TQ-compressed (~20B) | TQ 4-bit | Use 4-bit when stacking on TQ weights — small models have a tighter noise budget |
201
+ | **TQ-compressed (100B+)** | **TQ 3-bit** | **3-bit on 3-bit works cleanly on GPT-OSS-120B and Qwen3.5-122B — 100B+ models have enough redundancy to absorb the stacked noise** |
202
+
203
+ ### The speed flip
204
+
205
+ On small, fast models (~20B), KV cache compression is a quality-vs-speed tradeoff: the dequantization overhead dominates because the model is fast to begin with. On large, slow models (100B+), the ~3.8x smaller KV cache saves more memory bandwidth than dequantization costs, so generation is *faster* than the FP16 baseline:
206
+
207
+ | Model | FP16 KV | TQ 3-bit KV | Direction |
208
+ |-------|---------|-------------|-----------|
209
+ | GPT-OSS-20B | 90.6 tok/s | 29.9 tok/s | TQ is 3x **slower** |
210
+ | GPT-OSS-120B | 6.4 tok/s | 8.7 tok/s | TQ is 1.4x **faster** |
211
+
212
+ ### Compatibility
213
+
214
+ | Feature | Supported | Notes |
215
+ |---------|-----------|-------|
216
+ | Attention sinks | Yes | GPT-OSS sink vectors flow through standard SDPA |
217
+ | Sliding window attention | Yes | `RotatingKVCache` layers are left untouched |
218
+ | Linear attention | Yes | `ArraysCache` (Qwen3.5 GatedDeltaNet) is left untouched |
219
+ | Hybrid architectures | Yes | Per-layer cache type is preserved |
220
+ | Prompt-first conversion | Yes | Process prompt with FP16, convert before generation |
221
+
222
+ ---
223
+
224
+ ## Running GPT-OSS MoE Models on Apple Silicon
225
+
226
+ ### GPT-OSS-20B (21B total, 32 experts, 3.6B active)
227
+
228
+ **Hardware:** Apple M4 Max 64GB (or any Apple Silicon with 16GB+ unified memory at 3-bit)
229
+
230
+ #### Step 1: Convert to TurboQuant 3-bit (recommended)
231
+
232
+ ```bash
233
+ python -m turboquant_mlx.convert \
234
+ --hf-path openai/gpt-oss-20b \
235
+ --mlx-path ./gpt-oss-20b-tq3 \
236
+ --bits 3 --group-size 32
237
+ ```
238
+
239
+ **Model size:** 9.3 GB (vs 12.8 GB MXFP4 original — 28% smaller, lower perplexity)
240
+
241
+ The converter automatically:
242
+ - Detects MoE architecture (SwitchLinear / QuantizedSwitchLinear layers)
243
+ - Dequantizes MXFP4 expert weights to float
244
+ - Applies Hadamard rotation + Lloyd-Max codebook quantization
245
+ - Keeps router weights and attention at full precision
246
+ - Handles blockwise Hadamard for 2880-dim experts (2880 = 9 x 320)
247
+
248
+ #### Step 2: Generate text
249
+
250
+ ```bash
251
+ python -m turboquant_mlx.generate \
252
+ --model ./gpt-oss-20b-tq3 \
253
+ --prompt "Explain quantum entanglement to a 10-year-old." \
254
+ --max-tokens 256
255
+ ```
256
+
257
+ **Expected:** ~73 tok/s generation, ~85 tok/s prefill on M4 Max
258
+
259
+ #### Step 3: Run a quick quality check
260
+
261
+ ```bash
262
+ python -m turboquant_mlx.evaluate \
263
+ --hf-path openai/gpt-oss-20b \
264
+ --bits 3 \
265
+ --no-affine --no-qjl \
266
+ --num-samples 64 --seq-len 512
267
+ ```
268
+
269
+ #### All bit-widths for GPT-OSS-20B
270
+
271
+ | Method | Bits | Size | Peak RAM | Gen Speed | Quality |
272
+ |--------|------|------|----------|-----------|---------|
273
+ | Affine (mlx-lm) | 4 | 11.2 GB | ~14 GB | 148 tok/s | Coherent (but see note below) |
274
+ | TurboQuant | 4 | 11.2 GB | ~14 GB | — | Best (PPL 72.63, beats MXFP4) |
275
+ | **TurboQuant** | **3** | **9.3 GB** | **~12 GB** | **73 tok/s** | **Recommended (PPL 78.60, beats MXFP4, coherent)** |
276
+ | TurboQuant | 2 | 7.5 GB | ~10 GB | — | Poor (incoherent generation on pre-quantized models) |
277
+
278
+ > **Speed vs quality tradeoff:** Affine 4-bit is ~2x faster on the 20B model due to simpler dequantization, but TurboQuant 3-bit is 28% smaller than OpenAI's own MXFP4 (and ~17% smaller than affine 4-bit), with lower perplexity than both. Crucially, affine 4-bit **cannot scale to 120B** on 64GB hardware — TurboQuant 3-bit is the only option there.
279
+
280
+ ```bash
281
+ # 4-bit (best quality, beats OpenAI's MXFP4)
282
+ python -m turboquant_mlx.convert \
283
+ --hf-path openai/gpt-oss-20b \
284
+ --mlx-path ./gpt-oss-20b-tq4 \
285
+ --bits 4 --group-size 32
286
+ ```
287
+
288
+ ---
289
+
290
+ ### GPT-OSS-120B (117B total, 128 experts, ~5.1B active)
291
+
292
+ **Hardware:** Apple M4 Max 64GB — neither the original MXFP4 (63.5 GB) nor the [mlx-community 4-bit affine](https://huggingface.co/mlx-community/gpt-oss-120b-4bit) (65.8 GB) fits on a 64GB machine. TurboQuant 3-bit is the only way to run this model on consumer hardware.
293
+
294
+ #### Step 1: Convert to TurboQuant 3-bit (recommended)
295
+
296
+ ```bash
297
+ python -m turboquant_mlx.convert \
298
+ --hf-path openai/gpt-oss-120b \
299
+ --mlx-path ./gpt-oss-120b-tq3 \
300
+ --bits 3 --group-size 64
301
+ ```
302
+
303
+ **Model size:** 48 GB
304
+
305
+ > **Note:** Conversion requires temporarily loading the full model. With 120B parameters, peak memory during conversion may reach ~50-55 GB. On a 64 GB machine this is tight — close all other applications before running. The converter processes layers sequentially and frees memory after each expert is quantized.
306
+
307
+ #### Step 2: Generate text
308
+
309
+ ```bash
310
+ python -m turboquant_mlx.generate \
311
+ --model ./gpt-oss-120b-tq3 \
312
+ --prompt "Explain quantum computing in simple terms." \
313
+ --max-tokens 200
314
+ ```
315
+
316
+ **Expected:** ~44 tok/s generation, ~9.5 tok/s prefill, 52 GB peak memory on M4 Max 64GB
317
+
318
+ #### Step 3: Quick quality check
319
+
320
+ ```bash
321
+ python -m turboquant_mlx.evaluate \
322
+ --hf-path openai/gpt-oss-120b \
323
+ --bits 3 \
324
+ --no-affine --no-qjl \
325
+ --num-samples 32 --seq-len 512
326
+ ```
327
+
328
+ #### All bit-widths for GPT-OSS-120B
329
+
330
+ | Method | Bits | Size | Peak RAM | Gen Speed | Fits 64 GB? | Quality |
331
+ |--------|------|------|----------|-----------|-------------|---------|
332
+ | [mlx-community 4-bit](https://huggingface.co/mlx-community/gpt-oss-120b-4bit) | 4 (affine) | 65.8 GB | — | — | **No** | — |
333
+ | MXFP4 (original) | 4 (mxfp) | 63.5 GB | ~70 GB | — | **No** | — |
334
+ | **TurboQuant** | **3** | **48 GB** | **52.3 GB** | **44 tok/s** | **Yes** | **Coherent, well-structured** |
335
+ | TurboQuant | 2 | 32 GB | 34.9 GB | 51 tok/s | Yes | Incoherent after ~20 tokens |
336
+
337
+ > Neither the original MXFP4 format (63.5 GB) nor the mlx-community affine 4-bit re-quantization (65.8 GB) fits on a 64GB Mac. TurboQuant 3-bit (48 GB) is the **only** way to run GPT-OSS-120B on consumer hardware — and at 44 tok/s, it runs at interactive speed. At 2-bit, the model fits easily but generation quality degrades rapidly — **3-bit is the minimum for coherent output on pre-quantized MoE models.**
338
+
339
+ ---
340
+
341
+ ### Qwen3.5-122B-A10B (122B total, 256 experts, 8 active per token, ~10B active parameters)
342
+
343
+ **Hardware:** Apple M4 Max 64GB — the original BF16 model is ~240 GB. TurboQuant 3-bit compresses it to ~50 GB, fitting on a 64GB machine.
344
+
345
+ This is a brand-new architecture featuring **256 MoE experts** (the most of any model we've tested), **hybrid attention** (GatedDeltaNet linear attention + standard softmax attention), and **thinking/reasoning** capability. The model also has a shared expert per layer alongside the routed experts.
346
+
347
+ #### Step 1: Convert to TurboQuant 3-bit
348
+
349
+ ```bash
350
+ python -m turboquant_mlx.convert \
351
+ --hf-path Qwen/Qwen3.5-122B-A10B \
352
+ --mlx-path ./qwen3.5-122b-tq3 \
353
+ --bits 3 --group-size 64
354
+ ```
355
+
356
+ **Model size:** ~50 GB | **Conversion time:** ~90 seconds
357
+
358
+ > **Note:** Conversion requires ~55 GB peak memory. Close all other applications before running. The converter uses memory-efficient processing — each expert layer is replaced immediately after quantization with aggressive garbage collection to handle the 256 experts per layer.
359
+
360
+ #### Step 2: Generate text
361
+
362
+ ```bash
363
+ python -m turboquant_mlx.generate \
364
+ --model ./qwen3.5-122b-tq3 \
365
+ --prompt "Why is the sky blue? Explain in simple terms." \
366
+ --max-tokens 200
367
+ ```
368
+
369
+ **Expected:** ~26.5 tok/s generation, 55 GB peak memory on M4 Max 64GB
370
+
371
+ #### Benchmark
372
+
373
+ | Method | Bits | Size | Peak RAM | Gen Speed | Fits 64 GB? | Quality |
374
+ |--------|------|------|----------|-----------|-------------|---------|
375
+ | BF16 (original) | 16 | ~240 GB | — | — | **No** | — |
376
+ | **TurboQuant** | **3** | **~50 GB** | **54.9 GB** | **26.5 tok/s** | **Yes** | **Coherent reasoning with structured thinking** |
377
+
378
+ > Qwen3.5-122B-A10B is the largest and most complex model TurboQuant has been tested on: 122B parameters, 256 experts (8 active per token), hybrid GatedDeltaNet + softmax attention, and a shared expert per MoE layer. At 3-bit, the model produces structured reasoning with proper analysis steps — demonstrating that TurboQuant preserves thinking capability at extreme compression.
379
+
380
+ ---
381
+
382
+ ## How It Works
383
+
384
+ TurboQuant is a two-stage, **calibration-free** quantization pipeline:
385
+
386
+ 1. **Hadamard Rotation** — Multiply weights by a randomized Hadamard matrix, transforming any weight distribution into a near-Gaussian shape. This is data-oblivious (no calibration data needed).
387
+
388
+ 2. **Lloyd-Max Codebook** — Apply information-theoretically optimal quantization for Gaussian distributions. The codebook is a mathematical constant, precomputed once.
389
+
390
+ The result: near-zero quality loss at 3-bit, and usable 2-bit quantization where standard affine completely breaks down.
391
+
392
+ For MoE models, all experts within a layer share the same rotation signs and codebook, keeping storage efficient.
393
+
394
+ ## CLI Options
395
+
396
+ ```
397
+ python -m turboquant_mlx.convert --help
398
+
399
+ Options:
400
+ --hf-path TEXT HuggingFace model path or local path (required)
401
+ --mlx-path TEXT Output directory (default: mlx_model)
402
+ --bits {2,3,4} Quantization bit-width (default: 3)
403
+ --group-size {32,64,128} Elements per quantization group (default: 64)
404
+ --rotation TEXT Rotation method: hadamard, blockwise_hadamard, none
405
+ --use-qjl Enable 1-bit QJL residual correction (+1 bit overhead)
406
+ --dtype TEXT Model dtype before quantization: float16, bfloat16
407
+ ```
408
+
409
+ ## Supported Architectures
410
+
411
+ | Architecture | Model Type | MoE | Status |
412
+ |-------------|-----------|-----|--------|
413
+ | LLaMA / Llama 3 | `llama` | No | Tested |
414
+ | Qwen2 / Qwen2.5 | `qwen2` | No | Tested |
415
+ | Qwen3.5 | `qwen3_5` | No | Tested |
416
+ | Mistral | `mistral` | No | Tested |
417
+ | Qwen1.5-MoE | `qwen2_moe` | Yes | Tested |
418
+ | GPT-OSS | `gpt_oss` | Yes | Tested |
419
+ | Qwen3.5-MoE | `qwen3_5_moe` | Yes (256 experts) | Tested (122B) |
420
+
421
+ ## Project Structure
422
+
423
+ ```
424
+ turboquant_mlx/
425
+ config.py # TurboQuantConfig
426
+ convert.py # CLI: HF model -> TurboQuant MLX
427
+ generate.py # Text generation with TurboQuant models
428
+ evaluate.py # Perplexity evaluation
429
+ quantize_model.py # Model traversal & layer replacement
430
+ demo_kv.py # Streaming generation demo with KV cache compression
431
+ test_kv_cache.py # KV cache roundtrip + integration tests
432
+ core/
433
+ codebook.py # Lloyd-Max codebooks for Gaussian
434
+ rotation.py # Randomized Hadamard rotation
435
+ polar_quantize.py # Rotate + codebook quantize
436
+ packing.py # Bit-packing into uint32
437
+ qjl.py # QJL residual correction
438
+ layers/
439
+ polar_linear.py # PolarQuantizedLinear (dense)
440
+ polar_switch_linear.py # PolarQuantizedSwitchLinear (MoE)
441
+ polar_kv_cache.py # TurboQuantKVCache (runtime KV compression)
442
+ kernels/
443
+ polar_qmv.py # Fused Metal kernel (dense decode)
444
+ polar_gather_qmv.py # Fused Metal kernel (MoE shared input)
445
+ polar_multi_gather_qmv.py # Fused Metal kernel (MoE per-expert input)
446
+ csrc/
447
+ polar_kernels.metal # Native Metal shaders (SIMD group reduction)
448
+ polar_ops.h/cpp # C++ MLX Primitive classes
449
+ bindings.cpp # nanobind Python bindings
450
+ CMakeLists.txt # Build system
451
+ integration/
452
+ rotation_configs.py # Per-architecture rotation configs
453
+ ```
454
+
455
+ ## Citation
456
+
457
+ ```bibtex
458
+ @misc{turboquant_mlx,
459
+ title={TurboQuant-MLX: Extreme Weight and KV Cache Compression for Apple Silicon},
460
+ year={2025},
461
+ note={MLX implementation of TurboQuant (Zandieh et al., 2025) for both weight quantization and runtime KV cache compression}
462
+ }
463
+ ```
464
+
465
+ ## License
466
+
467
+ MIT
468
+
469
+ ## Acknowledgments
470
+
471
+ - [TurboQuant](https://arxiv.org/abs/2504.19874) — Zandieh, Han, Daliri, Karbasi (2025)
472
+ - [MLX](https://github.com/ml-explore/mlx) — Apple Machine Learning Research
473
+ - [mlx-lm](https://github.com/ml-explore/mlx-lm) — MLX language model utilities