mlx-flash 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx_flash-0.2.0/PKG-INFO +423 -0
- mlx_flash-0.2.0/README.md +384 -0
- mlx_flash-0.2.0/mlx_flash.egg-info/PKG-INFO +423 -0
- mlx_flash-0.2.0/mlx_flash.egg-info/SOURCES.txt +59 -0
- mlx_flash-0.2.0/mlx_flash.egg-info/dependency_links.txt +1 -0
- mlx_flash-0.2.0/mlx_flash.egg-info/entry_points.txt +5 -0
- mlx_flash-0.2.0/mlx_flash.egg-info/requires.txt +18 -0
- mlx_flash-0.2.0/mlx_flash.egg-info/top_level.txt +1 -0
- mlx_flash-0.2.0/mlx_flash_compress/__init__.py +6 -0
- mlx_flash-0.2.0/mlx_flash_compress/__main__.py +3 -0
- mlx_flash-0.2.0/mlx_flash_compress/advanced_prefetch.py +342 -0
- mlx_flash-0.2.0/mlx_flash_compress/bench.py +819 -0
- mlx_flash-0.2.0/mlx_flash_compress/bench_e2e.py +397 -0
- mlx_flash-0.2.0/mlx_flash_compress/bench_final.py +263 -0
- mlx_flash-0.2.0/mlx_flash_compress/bench_memory_pressure.py +571 -0
- mlx_flash-0.2.0/mlx_flash_compress/bench_real.py +526 -0
- mlx_flash-0.2.0/mlx_flash_compress/cache.py +478 -0
- mlx_flash-0.2.0/mlx_flash_compress/cached_inference.py +573 -0
- mlx_flash-0.2.0/mlx_flash_compress/chat.py +176 -0
- mlx_flash-0.2.0/mlx_flash_compress/compression.py +108 -0
- mlx_flash-0.2.0/mlx_flash_compress/compression_native.py +225 -0
- mlx_flash-0.2.0/mlx_flash_compress/config.py +248 -0
- mlx_flash-0.2.0/mlx_flash_compress/demo_warmup.py +366 -0
- mlx_flash-0.2.0/mlx_flash_compress/engine.py +440 -0
- mlx_flash-0.2.0/mlx_flash_compress/entropy_coding.py +254 -0
- mlx_flash-0.2.0/mlx_flash_compress/expert_merging.py +168 -0
- mlx_flash-0.2.0/mlx_flash_compress/expert_streaming.py +600 -0
- mlx_flash-0.2.0/mlx_flash_compress/fast_cache_bindings.py +205 -0
- mlx_flash-0.2.0/mlx_flash_compress/hardware.py +319 -0
- mlx_flash-0.2.0/mlx_flash_compress/lcp_cache.py +357 -0
- mlx_flash-0.2.0/mlx_flash_compress/memory_manager.py +415 -0
- mlx_flash-0.2.0/mlx_flash_compress/mixed_precision.py +238 -0
- mlx_flash-0.2.0/mlx_flash_compress/model_browser.py +169 -0
- mlx_flash-0.2.0/mlx_flash_compress/router_hook.py +305 -0
- mlx_flash-0.2.0/mlx_flash_compress/run.py +367 -0
- mlx_flash-0.2.0/mlx_flash_compress/rust_bridge.py +68 -0
- mlx_flash-0.2.0/mlx_flash_compress/serve.py +450 -0
- mlx_flash-0.2.0/mlx_flash_compress/smart_eviction.py +277 -0
- mlx_flash-0.2.0/mlx_flash_compress/speculative_experts.py +313 -0
- mlx_flash-0.2.0/mlx_flash_compress/ssd_protection.py +196 -0
- mlx_flash-0.2.0/mlx_flash_compress/task_profiler.py +391 -0
- mlx_flash-0.2.0/mlx_flash_compress/tier_optimizer.py +267 -0
- mlx_flash-0.2.0/mlx_flash_compress/vertical_split.py +173 -0
- mlx_flash-0.2.0/pyproject.toml +69 -0
- mlx_flash-0.2.0/setup.cfg +4 -0
- mlx_flash-0.2.0/tests/test_advanced_prefetch.py +157 -0
- mlx_flash-0.2.0/tests/test_cache.py +156 -0
- mlx_flash-0.2.0/tests/test_cached_inference.py +84 -0
- mlx_flash-0.2.0/tests/test_compression.py +97 -0
- mlx_flash-0.2.0/tests/test_config_hw_ssd.py +154 -0
- mlx_flash-0.2.0/tests/test_demo_warmup.py +62 -0
- mlx_flash-0.2.0/tests/test_entropy_coding.py +167 -0
- mlx_flash-0.2.0/tests/test_expert_merging.py +113 -0
- mlx_flash-0.2.0/tests/test_expert_streaming.py +182 -0
- mlx_flash-0.2.0/tests/test_lcp_cache.py +186 -0
- mlx_flash-0.2.0/tests/test_memory_manager_hints.py +56 -0
- mlx_flash-0.2.0/tests/test_profiler_memory.py +173 -0
- mlx_flash-0.2.0/tests/test_rust_bridge.py +33 -0
- mlx_flash-0.2.0/tests/test_serve.py +124 -0
- mlx_flash-0.2.0/tests/test_speculative_experts.py +146 -0
- mlx_flash-0.2.0/tests/test_vertical_split.py +124 -0
mlx_flash-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mlx-flash
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Run AI models too large for your Mac's memory — expert caching, speculative execution, and 15+ research techniques for MoE inference on Apple Silicon
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/szibis/MLX-Flash
|
|
7
|
+
Project-URL: Repository, https://github.com/szibis/MLX-Flash
|
|
8
|
+
Project-URL: Issues, https://github.com/szibis/MLX-Flash/issues
|
|
9
|
+
Project-URL: Documentation, https://github.com/szibis/MLX-Flash/tree/main/docs
|
|
10
|
+
Keywords: mlx,apple-silicon,moe,mixture-of-experts,inference,expert-caching,llm,speculative-execution,quantization,metal-gpu,macos,ssd-streaming
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: MacOS
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
Requires-Dist: mlx>=0.24.0
|
|
25
|
+
Requires-Dist: mlx-lm>=0.22.0
|
|
26
|
+
Requires-Dist: numpy>=1.24.0
|
|
27
|
+
Requires-Dist: huggingface-hub>=0.20.0
|
|
28
|
+
Requires-Dist: safetensors>=0.4.0
|
|
29
|
+
Requires-Dist: psutil>=5.9.0
|
|
30
|
+
Requires-Dist: tabulate>=0.9.0
|
|
31
|
+
Provides-Extra: compression
|
|
32
|
+
Requires-Dist: lz4>=4.3.0; extra == "compression"
|
|
33
|
+
Requires-Dist: zstandard>=0.23.0; extra == "compression"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest-benchmark>=4.0; extra == "dev"
|
|
37
|
+
Provides-Extra: all
|
|
38
|
+
Requires-Dist: mlx-flash[compression,dev]; extra == "all"
|
|
39
|
+
|
|
40
|
+
<p align="center">
|
|
41
|
+
<img src="assets/logo.svg" width="200" alt="MLX-Flash Logo" />
|
|
42
|
+
</p>
|
|
43
|
+
|
|
44
|
+
<h1 align="center">MLX-Flash</h1>
|
|
45
|
+
|
|
46
|
+
<p align="center"><strong>Run AI models too large for your Mac's memory — at near-full speed.</strong></p>
|
|
47
|
+
|
|
48
|
+
Your MacBook has 32-48GB of RAM, but the best AI models need 100-200GB+. MLX-Flash makes them run anyway by intelligently caching the most-needed parts in RAM and streaming the rest from your SSD — so you don't have to choose between quality and what fits in memory.
|
|
49
|
+
|
|
50
|
+
## How It Works (Simple Version)
|
|
51
|
+
|
|
52
|
+
Think of it like Netflix streaming: instead of downloading the entire movie before watching, you buffer what you need and stream the rest. MLX-Flash does this for AI model weights:
|
|
53
|
+
|
|
54
|
+
```mermaid
|
|
55
|
+
flowchart TB
|
|
56
|
+
subgraph RAM["Your Mac's RAM (fast)"]
|
|
57
|
+
HC[Hot Cache — 85%+ of active experts]
|
|
58
|
+
MP[Mixed Precision — hot 4-bit, cold 2-bit]
|
|
59
|
+
KV[KV Cache — optional 8-bit quantization]
|
|
60
|
+
end
|
|
61
|
+
subgraph CACHE["Smart Cache Layer"]
|
|
62
|
+
LCP[LCP Eviction — layer-depth biased]
|
|
63
|
+
PF[Speculative Prefetch — 97% accuracy]
|
|
64
|
+
MM[Memory Monitor — never harms your apps]
|
|
65
|
+
SPEC[Speculative Execution — predict → execute → verify]
|
|
66
|
+
end
|
|
67
|
+
subgraph SSD["Your Mac's SSD (big)"]
|
|
68
|
+
FULL[Full model weights — even 200GB+]
|
|
69
|
+
ENT[Entropy-coded storage — 65% smaller]
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
SSD -->|stream on demand| CACHE
|
|
73
|
+
CACHE -->|cache hit: 0.08ms| RAM
|
|
74
|
+
CACHE -->|cache miss: 0.6ms| SSD
|
|
75
|
+
RAM -->|feed to GPU| GPU[MLX GPU Inference]
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
**Result:** A 200GB AI model runs on your 48GB Mac at **2-3x faster** than naive SSD streaming.
|
|
79
|
+
|
|
80
|
+
## Quick Start
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# Install from PyPI
|
|
84
|
+
pip install mlx-flash
|
|
85
|
+
|
|
86
|
+
# Or Homebrew (includes Rust sidecar)
|
|
87
|
+
brew tap szibis/mlx-flash && brew install mlx-flash
|
|
88
|
+
|
|
89
|
+
# Or from source
|
|
90
|
+
git clone https://github.com/szibis/MLX-Flash.git
|
|
91
|
+
cd MLX-Flash && pip install -e ".[all]"
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# Interactive chat (simplest way to use it)
|
|
96
|
+
mlx-flash-chat
|
|
97
|
+
|
|
98
|
+
# Start the API server (works with LM Studio, Cursor, Claude Code, Codex, OpenAI SDK)
|
|
99
|
+
mlx-flash --port 8080
|
|
100
|
+
|
|
101
|
+
# With KV cache quantization (45% less KV memory)
|
|
102
|
+
mlx-flash --port 8080 --kv-bits 8
|
|
103
|
+
|
|
104
|
+
# See what models fit your hardware
|
|
105
|
+
mlx-flash-browse
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Performance
|
|
109
|
+
|
|
110
|
+
### Measured Results
|
|
111
|
+
|
|
112
|
+
| Technique | Speedup | How It Works |
|
|
113
|
+
|-----------|---------|-------------|
|
|
114
|
+
| **LCP Smart Cache** | **2.80x** | Keeps frequently-used model parts in RAM, predicts what's needed next |
|
|
115
|
+
| **+ Async Prefetch** | **2.93x** | Loads next part from SSD while GPU computes current part |
|
|
116
|
+
| **Mixed Precision** | **1.80x size reduction** | Rarely-used parts stored at lower quality (saves space, barely affects output) |
|
|
117
|
+
| **Skip Fallback** | **2.67x** | When something isn't cached, gracefully skip it instead of waiting |
|
|
118
|
+
| **Speculative Execution** | **14-42% TPOT** | Execute predicted experts before router confirms, verify after |
|
|
119
|
+
| **Adaptive Top-K** | **10-30% compute** | Skip low-confidence secondary experts automatically |
|
|
120
|
+
|
|
121
|
+
### Real Hardware Numbers (Measured on M3 Max 36GB)
|
|
122
|
+
|
|
123
|
+
**Memory pressure recovery** (the key result):
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
Model at 0.9x RAM (barely fits):
|
|
127
|
+
Without optimization: 43.5 tok/s ########
|
|
128
|
+
With mixed precision: 104.5 tok/s #################### 2.4x faster
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
The memory pressure cliff is razor-sharp: 10% over the limit causes 59% slowdown. Our 20% footprint reduction shifts the model back to full speed.
|
|
132
|
+
|
|
133
|
+
**Cache warm-up** (ISP-like progressive acceleration):
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
Token 0: 83.3ms (cold start, loading experts from SSD)
|
|
137
|
+
Token 8: 5.7ms (warming up, 62% cache hit)
|
|
138
|
+
Token 24: 0.5ms (full speed, 85%+ cache hit)
|
|
139
|
+
-> 41x speedup from warm-up
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**Topic switching:**
|
|
143
|
+
```
|
|
144
|
+
coding -> writing: 62ms first token (re-warming) -> 8 tokens to recover
|
|
145
|
+
writing -> coding: 0.6ms first token (still cached!) -> instant fast
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Expert Streaming Performance
|
|
149
|
+
|
|
150
|
+
Expert streaming replaces MLX's `QuantizedSwitchLinear` with a GPU lookup table + pre-stacked tensors. The `capacity_per_layer` parameter controls how many experts stay in GPU memory:
|
|
151
|
+
|
|
152
|
+
| Model | Total Experts | Capacity | Coverage | Throughput | Notes |
|
|
153
|
+
|-------|--------------|----------|----------|------------|-------|
|
|
154
|
+
| Qwen3-30B-A3B | 128 per layer | 128 (100%) | 100% | ~35 tok/s | Full speed, no streaming needed |
|
|
155
|
+
| Qwen3-30B-A3B | 128 per layer | 64 (50%) | 85%+ hit rate | ~15 tok/s | After warm-up with LCP |
|
|
156
|
+
| Mixtral-8x7B | 8 per layer | 8 (100%) | 100% | ~20 tok/s | All experts fit |
|
|
157
|
+
| Mixtral-8x7B | 8 per layer | 4 (50%) | ~95% hit rate | ~12 tok/s | Most active cached |
|
|
158
|
+
|
|
159
|
+
**Tuning tips:**
|
|
160
|
+
- Start with `capacity_per_layer = total_experts` if RAM allows (no streaming overhead)
|
|
161
|
+
- Use `--task coding` warmup profile for programming tasks (pre-loads code-relevant experts)
|
|
162
|
+
- Enable skip-fallback with adaptive threshold to skip low-confidence secondary experts
|
|
163
|
+
- After ~25 tokens, LCP learns your workload and hit rate climbs to 85-95%
|
|
164
|
+
- Run `optimize_wired_memory_limit()` before loading to prevent Metal pressure cliff
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
from mlx_flash_compress.expert_streaming import (
|
|
168
|
+
enable_expert_streaming, enable_skip_fallback, get_warmup_experts
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
# Load model, enable streaming with 50% capacity
|
|
172
|
+
streaming = enable_expert_streaming(model, capacity_per_layer=64)
|
|
173
|
+
enable_skip_fallback(model, streaming.caches, adaptive_skip_threshold=3.0)
|
|
174
|
+
streaming.warmup()
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Find Your Optimal Configuration
|
|
178
|
+
|
|
179
|
+
The Tier Optimizer tells you exactly how to allocate your Mac's memory:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
# For a 200GB model on a 48GB Mac
|
|
183
|
+
python -m mlx_flash_compress.tier_optimizer --total-ram 48 --model-gb 209
|
|
184
|
+
|
|
185
|
+
# Output: "Best: 41.5GB RAM cache, 82% of requests served from RAM → 6.4 tok/s"
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
It shows you the sweet spot — even dedicating just 10GB to caching gives you 54% of requests served instantly from RAM.
|
|
189
|
+
|
|
190
|
+
## What's Inside
|
|
191
|
+
|
|
192
|
+
### Architecture
|
|
193
|
+
|
|
194
|
+
```mermaid
|
|
195
|
+
flowchart TB
|
|
196
|
+
subgraph Prediction["Expert Prediction (97%+ accuracy)"]
|
|
197
|
+
RP[Residual-Stream Predictor<br/>Linear projection of hidden state]
|
|
198
|
+
SM[Shadow MLP Predictor<br/>Online-trained routing MLP]
|
|
199
|
+
CL[Cross-Layer Prefetch<br/>3-hop transitive co-occurrence]
|
|
200
|
+
end
|
|
201
|
+
subgraph CacheLayer["Smart Cache Layer"]
|
|
202
|
+
LCP[LCP Eviction<br/>Layer-depth biased]
|
|
203
|
+
FLE[Forward-Looking Eviction<br/>Belady-optimal approximation]
|
|
204
|
+
VS[Vertical Split<br/>2x coverage in same RAM]
|
|
205
|
+
EM[Expert Merging<br/>Cosine similarity clustering]
|
|
206
|
+
end
|
|
207
|
+
subgraph Execution["Inference Engine"]
|
|
208
|
+
ES[Expert Streaming<br/>GPU lookup + pre-stacked tensors]
|
|
209
|
+
SE[Speculative Execution<br/>Predict → Execute → Verify]
|
|
210
|
+
SF[Skip Fallback<br/>Adaptive top-k]
|
|
211
|
+
MP[Mixed Precision<br/>Hot 4-bit / Cold 2-bit]
|
|
212
|
+
end
|
|
213
|
+
subgraph Storage["Compressed Storage"]
|
|
214
|
+
EC[Entropy Coding<br/>Huffman for uint4]
|
|
215
|
+
ST[Safetensors mmap<br/>Zero-copy SSD reads]
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
Prediction --> CacheLayer
|
|
219
|
+
CacheLayer --> Execution
|
|
220
|
+
Storage --> CacheLayer
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### Core Modules (35 Python files)
|
|
224
|
+
|
|
225
|
+
| Module | What It Does |
|
|
226
|
+
|--------|-------------|
|
|
227
|
+
| **Expert Streaming** | |
|
|
228
|
+
| `expert_streaming.py` | GPU lookup table + pre-stacked weights, skip-fallback, adaptive top-k, Mixtral/Qwen support |
|
|
229
|
+
| `speculative_experts.py` | Residual-stream predictor (97%+), Belady-optimal eviction, speculative execution |
|
|
230
|
+
| `advanced_prefetch.py` | Cross-layer N-hop predictor + shadow MLP for >90% prefetch accuracy |
|
|
231
|
+
| **Cache Management** | |
|
|
232
|
+
| `lcp_cache.py` | Smart cache with layer-depth biased LCP eviction + `mx.clear_cache()` |
|
|
233
|
+
| `smart_eviction.py` | SpecMD-inspired least-stale eviction + routing predictor |
|
|
234
|
+
| `vertical_split.py` | Cache partial expert rows for 2x coverage in same RAM (MoEpic) |
|
|
235
|
+
| `expert_merging.py` | Offline expert clustering — merge similar experts for 15-30% fewer params |
|
|
236
|
+
| **Compression** | |
|
|
237
|
+
| `entropy_coding.py` | Huffman coding for uint4 weights — 65% smaller at near-zero quality loss |
|
|
238
|
+
| `mixed_precision.py` | Hot experts at 4-bit, cold at 2-bit — 1.8x smaller, barely noticeable |
|
|
239
|
+
| `compression.py` | LZ4/ZSTD compression + Apple's native LZFSE |
|
|
240
|
+
| **Memory & Hardware** | |
|
|
241
|
+
| `memory_manager.py` | Real-time pressure monitoring, wired memory limit, auto-release |
|
|
242
|
+
| `hardware.py` | Apple Silicon detection (M1-M5), RAM, GPU cores |
|
|
243
|
+
| `tier_optimizer.py` | Finds the perfect RAM/SSD balance for your Mac + model combo |
|
|
244
|
+
| `ssd_protection.py` | Thermal cutoff, sequential hints, zero writes |
|
|
245
|
+
| **Inference & Serving** | |
|
|
246
|
+
| `serve.py` | OpenAI-compatible server with KV cache quantization, memory-aware hints |
|
|
247
|
+
| `chat.py` | Interactive CLI with memory status bar |
|
|
248
|
+
| `task_profiler.py` | Per-task expert profiles (coding/writing/math/chat) for fast warmup |
|
|
249
|
+
| `cached_inference.py` | Expert routing capture + cache simulation |
|
|
250
|
+
| `rust_bridge.py` | Python ↔ Rust Unix socket bridge |
|
|
251
|
+
| **Rust Sidecar** | |
|
|
252
|
+
| `mlx-flash-server/` | axum HTTP/SSE proxy, mach2 memory (0.1ms), DashMap LCP, Unix socket |
|
|
253
|
+
|
|
254
|
+
### Client Integration
|
|
255
|
+
|
|
256
|
+
```mermaid
|
|
257
|
+
graph LR
|
|
258
|
+
subgraph Clients
|
|
259
|
+
LS[LM Studio]
|
|
260
|
+
CU[Cursor]
|
|
261
|
+
CC[Claude Code]
|
|
262
|
+
SDK[OpenAI SDK]
|
|
263
|
+
CD[continue.dev]
|
|
264
|
+
OW[Open WebUI]
|
|
265
|
+
end
|
|
266
|
+
subgraph Rust["Rust Sidecar :8080"]
|
|
267
|
+
AX[axum HTTP/SSE]
|
|
268
|
+
MEM[Memory Monitor<br/>mach2 0.1ms]
|
|
269
|
+
LCPC[LCP Cache<br/>DashMap lock-free]
|
|
270
|
+
end
|
|
271
|
+
subgraph Python["Python Worker :8081"]
|
|
272
|
+
MLX[MLX Inference<br/>95% of work]
|
|
273
|
+
GEN["generate()"]
|
|
274
|
+
end
|
|
275
|
+
|
|
276
|
+
Clients -->|OpenAI API| Rust
|
|
277
|
+
Rust -->|proxy| Python
|
|
278
|
+
Rust -.->|Unix socket| LCPC
|
|
279
|
+
LCPC -.->|expert weights| Python
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### Using It
|
|
283
|
+
|
|
284
|
+
| How | Command | Best For |
|
|
285
|
+
|-----|---------|----------|
|
|
286
|
+
| **Interactive chat** | `python -m mlx_flash_compress.chat` | Quick testing, shows memory status |
|
|
287
|
+
| **API server** | `python -m mlx_flash_compress.serve --port 8080` | LM Studio, continue.dev, OpenAI SDK |
|
|
288
|
+
| **API + KV quant** | `python -m mlx_flash_compress.serve --kv-bits 8` | 45% less KV memory |
|
|
289
|
+
| **Model browser** | `python -m mlx_flash_compress.model_browser` | See what fits your hardware |
|
|
290
|
+
| **Warm-up demo** | `python -m mlx_flash_compress.demo_warmup` | Watch cache fill in real-time |
|
|
291
|
+
| **Pressure test** | `python -m mlx_flash_compress.bench_memory_pressure` | Measure memory impact |
|
|
292
|
+
|
|
293
|
+
### Integration
|
|
294
|
+
|
|
295
|
+
**LM Studio**: Start our server, then set custom endpoint to `http://localhost:8080/v1`
|
|
296
|
+
|
|
297
|
+
**Ollama**: Run our server alongside Ollama — use ours for MoE models that benefit from expert caching.
|
|
298
|
+
|
|
299
|
+
**continue.dev / Cursor / Claude Code / any OpenAI SDK**: Point `api_base` to `http://localhost:8080/v1`
|
|
300
|
+
|
|
301
|
+
See `docs/integrations.md` for detailed setup for 18+ tools and `docs/getting-started.md` for quick start.
|
|
302
|
+
|
|
303
|
+
### Benchmark Suite
|
|
304
|
+
|
|
305
|
+
```bash
|
|
306
|
+
python -m mlx_flash_compress.bench_memory_pressure # Memory pressure analysis (key demo)
|
|
307
|
+
python -m mlx_flash_compress.demo_warmup # ISP-like warm-up visualization
|
|
308
|
+
python -m mlx_flash_compress.cached_inference --multi-topic # Real routing capture
|
|
309
|
+
python -m mlx_flash_compress.bench --synthetic # Quick test (no model needed)
|
|
310
|
+
python -m mlx_flash_compress.bench_real # Real Qwen MoE model test
|
|
311
|
+
python -m mlx_flash_compress.bench_final # Final comprehensive benchmark
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
## Key Discoveries
|
|
315
|
+
|
|
316
|
+
### 1. Standard Compression Doesn't Work on AI Weights
|
|
317
|
+
|
|
318
|
+
We tested 6 different compression strategies on real AI model weights. Result: **1.0x compression** (zero savings). The data is already maximally dense at 4-bit quantization. Instead, we use entropy coding (Huffman) which exploits the non-uniform distribution of quantized values for 65% savings.
|
|
319
|
+
|
|
320
|
+
### 2. Smart Caching Is the #1 Win
|
|
321
|
+
|
|
322
|
+
Instead of trying to compress, we **predict what's needed and pre-load it**. Our prediction stack achieves 97%+ accuracy:
|
|
323
|
+
- Residual-stream predictor (linear projection of hidden states)
|
|
324
|
+
- Cross-layer 3-hop lookahead (transitive co-occurrence)
|
|
325
|
+
- Forward-looking Belady-optimal eviction (never evict what you'll need)
|
|
326
|
+
- Layer-depth bias (early layers are more valuable to cache)
|
|
327
|
+
|
|
328
|
+
### 3. The Brain Already Solved This Problem
|
|
329
|
+
|
|
330
|
+
MoE models work like the brain — only 0.78% of "neurons" (experts) activate per input. The brain handles this with predictive coding (pre-activating expected pathways). We implement the same principle: predict which experts are needed, speculatively execute them, and verify after the router confirms.
|
|
331
|
+
|
|
332
|
+
### 4. Speculate, Don't Wait
|
|
333
|
+
|
|
334
|
+
Speculative expert execution (from MoE-SpAc paper) runs predicted experts *before* the router confirms them. With 97% prediction accuracy, this means 97% of expert computations start immediately with zero load latency. The 3% misses are discarded and recomputed — on unified memory, this costs only ~0.1ms per wasted computation.
|
|
335
|
+
|
|
336
|
+
## Requirements
|
|
337
|
+
|
|
338
|
+
- **macOS** with Apple Silicon (M1/M2/M3/M4/M5)
|
|
339
|
+
- **Python 3.10+**
|
|
340
|
+
- 16GB+ RAM (more = better caching = faster)
|
|
341
|
+
- For real model tests: `mlx` and `mlx-lm` packages
|
|
342
|
+
|
|
343
|
+
## Project Stats
|
|
344
|
+
|
|
345
|
+
- **15,000+ lines of code** (Python + Rust)
|
|
346
|
+
- **224 tests** (192 Python + 32 Rust)
|
|
347
|
+
- **8 benchmark suites** + interactive demos
|
|
348
|
+
- **10 research documents** (15+ papers implemented, 60+ surveyed)
|
|
349
|
+
- **35 Python modules** covering prediction, caching, compression, serving
|
|
350
|
+
- **OpenAI-compatible API server** with KV cache quantization
|
|
351
|
+
- **Memory-aware** inference with wired memory optimization
|
|
352
|
+
- **Rust sidecar** with 0.1ms memory checks (210x faster than Python)
|
|
353
|
+
- **Lock-free LCP expert cache** (DashMap) with layer-depth bias
|
|
354
|
+
- **Unix socket bridge** for Python ↔ Rust expert weight streaming
|
|
355
|
+
- **15+ research techniques** implemented from papers 2024-2026
|
|
356
|
+
|
|
357
|
+
## Research & Techniques Implemented
|
|
358
|
+
|
|
359
|
+
```mermaid
|
|
360
|
+
graph TB
|
|
361
|
+
subgraph DONE["Implemented (15+ techniques)"]
|
|
362
|
+
ES[Expert Streaming<br/>GPU lookup tables]
|
|
363
|
+
LCP[Layer-biased LCP<br/>FATE paper]
|
|
364
|
+
RP[Residual Predictor<br/>97%+ accuracy]
|
|
365
|
+
SE[Speculative Execution<br/>MoE-SpAc]
|
|
366
|
+
FE[Forward Eviction<br/>MoE-SpeQ Belady]
|
|
367
|
+
CL[Cross-Layer Prefetch<br/>3-hop lookahead]
|
|
368
|
+
SP[Shadow MLP Predictor<br/>mlx-od-moe]
|
|
369
|
+
VS[Vertical Splitting<br/>MoEpic 2x coverage]
|
|
370
|
+
EM[Expert Merging<br/>DEK/EEP]
|
|
371
|
+
EC[Entropy Coding<br/>EntroLLM Huffman]
|
|
372
|
+
AT[Adaptive Top-K<br/>LExI paper]
|
|
373
|
+
MP[Mixed Precision<br/>HOBBIT]
|
|
374
|
+
KV[KV Cache 8-bit<br/>mlx-moe]
|
|
375
|
+
WM[Wired Memory Limit<br/>macOS sysctl]
|
|
376
|
+
MC[mx.clear_cache<br/>MLX v0.31]
|
|
377
|
+
end
|
|
378
|
+
subgraph BLOCKED["Blocked"]
|
|
379
|
+
AMX[AMX Pipeline<br/>undocumented HW]
|
|
380
|
+
MLXrs[mlx-rs<br/>macOS 26 Metal]
|
|
381
|
+
end
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
| Technique | Paper | Status |
|
|
385
|
+
|-----------|-------|--------|
|
|
386
|
+
| Expert streaming (GPU lookup) | HOBBIT arXiv:2411.01433 | **Implemented** |
|
|
387
|
+
| Residual-stream predictor | Speculating Experts arXiv:2603.19289 | **Implemented** |
|
|
388
|
+
| Speculative expert execution | MoE-SpAc arXiv:2603.09983 | **Implemented** |
|
|
389
|
+
| Forward-looking Belady eviction | MoE-SpeQ arXiv:2511.14102 | **Implemented** |
|
|
390
|
+
| Cross-layer 3-hop prefetch | FATE arXiv:2502.12224 / tinyserve | **Implemented** |
|
|
391
|
+
| Layer-depth cache bias | FATE arXiv:2502.12224 | **Implemented** |
|
|
392
|
+
| Shadow model predictor | mlx-od-moe | **Implemented** |
|
|
393
|
+
| Vertical expert splitting | MoEpic paper | **Implemented** |
|
|
394
|
+
| Expert merging (offline) | DEK/EEP arXiv:2509.19781 | **Implemented** |
|
|
395
|
+
| Entropy coding (Huffman uint4) | EntroLLM arXiv:2505.02380 | **Implemented** |
|
|
396
|
+
| Adaptive top-k skipping | LExI arXiv:2509.02753 | **Implemented** |
|
|
397
|
+
| Mixed precision per-expert | HOBBIT arXiv:2411.01433 | **Implemented** |
|
|
398
|
+
| KV cache 8-bit quantization | mlx-moe / mlx-lm v0.31 | **Implemented** |
|
|
399
|
+
| Wired memory optimization | macOS sysctl / mlx-moe | **Implemented** |
|
|
400
|
+
| `mx.clear_cache()` integration | MLX v0.31.0 | **Implemented** |
|
|
401
|
+
| AMX dequant pipeline | amx-rs Rust crate | Blocked (undocumented HW) |
|
|
402
|
+
| mlx-rs native inference | mlx-rs v0.25.3 | Blocked (macOS 26 Metal) |
|
|
403
|
+
|
|
404
|
+
### Competition
|
|
405
|
+
|
|
406
|
+
10+ OSS projects and 15+ papers attack the same problem. Our unique differentiators:
|
|
407
|
+
1. **Only** project with Rust sidecar + Mach syscall memory monitoring
|
|
408
|
+
2. **Only** Apple Silicon project with mixed precision per-expert (hot 4-bit / cold 2-bit)
|
|
409
|
+
3. **Most techniques implemented**: 15+ from research frontier, more than any competitor
|
|
410
|
+
4. **Only** project combining speculative execution + Belady eviction + residual predictor + expert merging
|
|
411
|
+
|
|
412
|
+
| Competitor | Key Feature | Our Advantage |
|
|
413
|
+
|-----------|------------|---------------|
|
|
414
|
+
| mu-hashmi/mlx-moe | Expert profiles, 10+ model families | Speculative execution, residual predictor, Rust sidecar |
|
|
415
|
+
| kqb/mlx-od-moe | Shadow model, memory-mapped experts | Cross-layer prefetch, entropy coding, expert merging |
|
|
416
|
+
| jundot/omlx | Hybrid mxfp4/mxfp8 quantization | Belady eviction, adaptive top-k, vertical splitting |
|
|
417
|
+
| HOBBIT (paper) | Nearly identical architecture | Apple Silicon native, open source |
|
|
418
|
+
|
|
419
|
+
See `docs/competitive-analysis.md` for the full landscape.
|
|
420
|
+
|
|
421
|
+
## License
|
|
422
|
+
|
|
423
|
+
MIT
|