thaw-vllm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thaw_vllm-0.1.0/PKG-INFO +242 -0
- thaw_vllm-0.1.0/README.md +214 -0
- thaw_vllm-0.1.0/pyproject.toml +43 -0
- thaw_vllm-0.1.0/python/thaw_vllm/__init__.py +87 -0
- thaw_vllm-0.1.0/python/thaw_vllm/cli.py +253 -0
- thaw_vllm-0.1.0/python/thaw_vllm/kv_snapshot.py +540 -0
- thaw_vllm-0.1.0/python/thaw_vllm/loader.py +106 -0
- thaw_vllm-0.1.0/python/thaw_vllm/server.py +253 -0
- thaw_vllm-0.1.0/python/thaw_vllm/snapshot.py +489 -0
- thaw_vllm-0.1.0/python/thaw_vllm.egg-info/PKG-INFO +242 -0
- thaw_vllm-0.1.0/python/thaw_vllm.egg-info/SOURCES.txt +14 -0
- thaw_vllm-0.1.0/python/thaw_vllm.egg-info/dependency_links.txt +1 -0
- thaw_vllm-0.1.0/python/thaw_vllm.egg-info/entry_points.txt +2 -0
- thaw_vllm-0.1.0/python/thaw_vllm.egg-info/requires.txt +14 -0
- thaw_vllm-0.1.0/python/thaw_vllm.egg-info/top_level.txt +1 -0
- thaw_vllm-0.1.0/setup.cfg +4 -0
thaw_vllm-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: thaw-vllm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fast snapshot/restore for LLM inference. 17x faster cold starts, multi-GPU tensor parallel, KV cache snapshots.
|
|
5
|
+
Author: Nils Matteson
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/matteso1/thaw
|
|
8
|
+
Project-URL: Repository, https://github.com/matteso1/thaw
|
|
9
|
+
Keywords: llm,inference,vllm,gpu,cold-start,kv-cache
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: torch>=2.0
|
|
18
|
+
Provides-Extra: vllm
|
|
19
|
+
Requires-Dist: vllm>=0.18; extra == "vllm"
|
|
20
|
+
Provides-Extra: serve
|
|
21
|
+
Requires-Dist: vllm>=0.18; extra == "serve"
|
|
22
|
+
Requires-Dist: fastapi; extra == "serve"
|
|
23
|
+
Requires-Dist: uvicorn; extra == "serve"
|
|
24
|
+
Provides-Extra: all
|
|
25
|
+
Requires-Dist: vllm>=0.18; extra == "all"
|
|
26
|
+
Requires-Dist: fastapi; extra == "all"
|
|
27
|
+
Requires-Dist: uvicorn; extra == "all"
|
|
28
|
+
|
|
29
|
+
# thaw
|
|
30
|
+
|
|
31
|
+
**Fast snapshot/restore for LLM inference. 17x faster cold starts on 70B, multi-GPU tensor parallel, KV cache preservation.**
|
|
32
|
+
|
|
33
|
+
vLLM cold-starts Llama-3-70B on 2x A100 in 546 seconds. thaw restores it in **31.8 seconds** — a **17.2x speedup**. Bit-identical outputs, verified by greedy decoding. Multi-GPU tensor parallel, Rust+CUDA pipelined DMA, and KV cache snapshots that no other tool offers.
|
|
34
|
+
|
|
35
|
+
## Benchmarks
|
|
36
|
+
|
|
37
|
+
**Llama-3-70B-Instruct (141 GB fp16) on 2x A100 SXM 80GB — tensor parallel:**
|
|
38
|
+
|
|
39
|
+
| Method | Time | Speedup |
|
|
40
|
+
|--------|------|---------|
|
|
41
|
+
| Normal vLLM cold start | 546.5s | 1x |
|
|
42
|
+
| **thaw restore (TP=2)** | **31.8s** | **17.2x** |
|
|
43
|
+
| Weight restore only | 10.5s | 6.74 GB/s per rank |
|
|
44
|
+
|
|
45
|
+
**Llama-3-8B-Instruct (16 GB fp16) — single GPU, H100 SXM:**
|
|
46
|
+
|
|
47
|
+
| Method | Time | Throughput | Speedup |
|
|
48
|
+
|--------|------|-----------|---------|
|
|
49
|
+
| Normal vLLM cold start | 20.7s | — | 1x |
|
|
50
|
+
| **thaw (NVMe)** | **3.7s** | 8.26 GB/s | **5.6x** |
|
|
51
|
+
| **thaw (RAM hot path)** | **3.5s** | 10.69 GB/s | **5.9x** |
|
|
52
|
+
|
|
53
|
+
**Agent fork — clone a running AI session (Llama-3-8B-Instruct, H100 SXM):**
|
|
54
|
+
|
|
55
|
+
| Operation | Time | Notes |
|
|
56
|
+
|-----------|------|-------|
|
|
57
|
+
| Weight restore (Rust pipelined) | **1.1s** | **14.79 GB/s** — PCIe Gen5-saturating |
|
|
58
|
+
| KV cache restore | **0.135s** | 65 blocks, 136 MB |
|
|
59
|
+
| Total restore (incl. vLLM init) | **7.3s** | vs 16s normal cold start |
|
|
60
|
+
| Fork 3 parallel completions | **1.6s avg** | All share 872-token cached prefix |
|
|
61
|
+
|
|
62
|
+
All paths produce **bit-identical** inference output. KV cache restore preserves prefix cache across cold starts — new requests skip prefill entirely.
|
|
63
|
+
|
|
64
|
+
<details>
|
|
65
|
+
<summary>More GPUs and models</summary>
|
|
66
|
+
|
|
67
|
+
| GPU | Model | Normal | thaw | Speedup |
|
|
68
|
+
|-----|-------|--------|------|---------|
|
|
69
|
+
| 2x A100 SXM 80GB | Llama-3-70B (TP=2) | 546.5s | 31.8s | **17.2x** |
|
|
70
|
+
| H100 SXM 80GB | Llama-3-8B | 20.7s | 3.5s | **5.9x** |
|
|
71
|
+
| RTX PRO 6000 (Blackwell) | Llama-3-8B | 28.6s | 3.2s | **8.9x** |
|
|
72
|
+
| RTX A6000 | Llama-3-8B | 73.2s | 5.8s | **12.6x** |
|
|
73
|
+
|
|
74
|
+
Larger models show bigger speedups because weight loading dominates more of the total cold start time.
|
|
75
|
+
|
|
76
|
+
</details>
|
|
77
|
+
|
|
78
|
+
## How it works
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
Normal vLLM cold start:
|
|
82
|
+
Download weights → deserialize safetensors → copy to GPU → init KV cache → ready
|
|
83
|
+
[==================================] 20.7s
|
|
84
|
+
|
|
85
|
+
thaw restore:
|
|
86
|
+
Dummy init → DMA snapshot to GPU (pipelined, pinned memory, O_DIRECT)
|
|
87
|
+
[=====] 3.5s
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Freeze** captures all GPU state into binary snapshots — model weights (`.thaw`) and KV cache blocks (`.thawkv`).
|
|
91
|
+
|
|
92
|
+
**Restore** initializes vLLM with dummy weights (fast — no disk I/O), then overwrites them from the snapshot using double-buffered pipelined DMA through pinned host memory. Two CUDA streams overlap PCIe transfers with disk reads. KV cache blocks are restored separately with their prefix cache hash mappings, so new requests immediately get cache hits.
|
|
93
|
+
|
|
94
|
+
Two restore modes:
|
|
95
|
+
- **Disk**: reads snapshot from NVMe with O_DIRECT, bypassing the kernel page cache. Throughput limited by NVMe bandwidth.
|
|
96
|
+
- **RAM hot path**: snapshot pre-loaded in memory (tmpfs, shared memory, mmap). Pure PCIe DMA — 10.69 GB/s on H100. For production use where snapshots are pre-staged.
|
|
97
|
+
|
|
98
|
+
**KV cache snapshots** capture the prefix-cached blocks that vLLM retains after generation. On restore, block data is DMA'd back to GPU and the prefix cache hash table is reconstructed. Requests with matching prefixes skip prefill — the most expensive part of inference.
|
|
99
|
+
|
|
100
|
+
## Architecture
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
thaw/
|
|
104
|
+
crates/
|
|
105
|
+
thaw-core/ Rust. File format, region tables, I/O. No CUDA dep.
|
|
106
|
+
thaw-cuda-sys/ Rust. FFI bindings to CUDA runtime (cudaMallocHost,
|
|
107
|
+
cudaMemcpyAsync, streams). Built via build.rs.
|
|
108
|
+
thaw-runtime/ Rust. Orchestration: freeze/restore pipelines, double-
|
|
109
|
+
buffered DMA, O_DIRECT, MockCuda for Mac testing.
|
|
110
|
+
thaw-py/ Rust. PyO3 bindings exposing pipelined freeze/restore
|
|
111
|
+
to Python. Builds a native .so via maturin.
|
|
112
|
+
thaw-cli/ Rust. GPU benchmark binary.
|
|
113
|
+
python/
|
|
114
|
+
thaw_vllm/ Python package (pip install thaw-vllm).
|
|
115
|
+
snapshot.py Freeze/restore weights, Rust backend fallback.
|
|
116
|
+
kv_snapshot.py KV cache freeze/restore.
|
|
117
|
+
loader.py vLLM ModelLoader: load_format="thaw".
|
|
118
|
+
server.py OpenAI-compatible API server.
|
|
119
|
+
cli.py CLI: thaw freeze, thaw serve, thaw info.
|
|
120
|
+
vllm_demo.py End-to-end benchmark: normal vs thaw cold start.
|
|
121
|
+
kv_cache_demo.py KV cache snapshot/restore demo with correctness test.
|
|
122
|
+
demos/
|
|
123
|
+
agent_fork.py Agent fork demo: clone session, fork parallel completions.
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
**Testing on Mac, shipping on GPU.** The `CudaBackend` trait abstracts all GPU operations. `MockCuda` (a HashMap-backed fake) lets 48 runtime tests run on any machine. The `cuda` feature flag activates real GPU paths only when needed.
|
|
127
|
+
|
|
128
|
+
## Quick start
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
pip install thaw-vllm[serve]
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
import thaw_vllm
|
|
136
|
+
from vllm import LLM, SamplingParams
|
|
137
|
+
|
|
138
|
+
# Freeze: save model weights to a snapshot
|
|
139
|
+
llm = LLM(model="meta-llama/Meta-Llama-3-8B", dtype="float16", enforce_eager=True)
|
|
140
|
+
thaw_vllm.freeze_model_pipelined(model, "/path/to/weights.thaw")
|
|
141
|
+
|
|
142
|
+
# Restore: two lines, 5.9x faster cold start
|
|
143
|
+
llm = thaw_vllm.load("meta-llama/Meta-Llama-3-8B", "/path/to/weights.thaw")
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Or use `load_format="thaw"` directly with vLLM:
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
import thaw_vllm # registers the loader
|
|
150
|
+
llm = LLM(model="meta-llama/Meta-Llama-3-8B",
|
|
151
|
+
load_format="thaw",
|
|
152
|
+
model_loader_extra_config={"snapshot": "/path/to/weights.thaw"})
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**Multi-GPU** — tensor parallel with per-rank snapshots:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
# Freeze: each GPU saves its shard
|
|
159
|
+
llm = LLM(model="meta-llama/Meta-Llama-3-70B-Instruct", tensor_parallel_size=2, ...)
|
|
160
|
+
thaw_vllm.freeze_model_tp(llm, "/path/to/weights.thaw")
|
|
161
|
+
# Creates: weights.thaw (rank 0), weights.rank1.thaw (rank 1)
|
|
162
|
+
|
|
163
|
+
# Restore: 17.2x faster than normal cold start
|
|
164
|
+
llm = thaw_vllm.load("meta-llama/Meta-Llama-3-70B-Instruct", "/path/to/weights.thaw",
|
|
165
|
+
tensor_parallel_size=2)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
**Agent fork demo** — clone a running AI session, fork parallel completions:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
python demos/agent_fork.py --snapshot weights.thaw
|
|
172
|
+
python demos/agent_fork.py --snapshot weights.thaw --full-cycle # destroy + restore
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**CLI:**
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
thaw freeze --model meta-llama/Meta-Llama-3-8B --output weights.thaw
|
|
179
|
+
thaw serve --model meta-llama/Meta-Llama-3-8B --snapshot weights.thaw
|
|
180
|
+
thaw info weights.thaw
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
<details>
|
|
184
|
+
<summary>Building with Rust+CUDA backend (optional, higher throughput)</summary>
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
git clone https://github.com/matteso1/thaw.git && cd thaw
|
|
188
|
+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
|
189
|
+
source "$HOME/.cargo/env"
|
|
190
|
+
pip install "maturin[patchelf]" vllm
|
|
191
|
+
cd crates/thaw-py && maturin develop --release --features cuda && cd ../..
|
|
192
|
+
pip install -e ".[serve]"
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
</details>
|
|
196
|
+
|
|
197
|
+
## Competitive landscape
|
|
198
|
+
|
|
199
|
+
The model loading space is active. Here's how thaw compares:
|
|
200
|
+
|
|
201
|
+
| Project | Approach | Throughput | Limitations |
|
|
202
|
+
|---------|----------|-----------|-------------|
|
|
203
|
+
| **thaw** | Pipelined DMA, pinned memory, O_DIRECT + KV cache snapshot | 6.7-14.8 GB/s per GPU | — |
|
|
204
|
+
| fastsafetensors (IBM) | GDS + 4x NVMe RAID0 | 26.4 GB/s | Requires GDS setup + RAID hardware |
|
|
205
|
+
| NVIDIA Model Streamer | Multi-threaded concurrent streaming | ~2 GB/s (single SSD) | NVIDIA-maintained, less flexible |
|
|
206
|
+
| CoreWeave Tensorizer | HTTP/S3 streaming + deserialization | ~4.6 GB/s local | Tied to CoreWeave ecosystem |
|
|
207
|
+
| vLLM Sleep Mode | Offload to CPU RAM, reload | 0.26-3s | Not a cold start — requires prior warm load |
|
|
208
|
+
| Modal GPU Snapshots | CUDA checkpoint/restore API | ~10x reduction | Alpha. Doesn't help with large model weight loading |
|
|
209
|
+
| InferX | GPU runtime snapshotting | Claims 2s for 70B | No public code or benchmarks |
|
|
210
|
+
|
|
211
|
+
**thaw's differentiation:**
|
|
212
|
+
1. **KV cache snapshot/restore** — nobody else does this. Preserves prefix cache across cold starts, eliminates prefill. Enables agent forking, session migration, warm handoff.
|
|
213
|
+
2. **Single NVMe performance** — most deployments don't have RAID0. thaw already matches or beats multi-threaded alternatives on one drive.
|
|
214
|
+
3. **No special hardware** — no GDS, no RAID, no driver patches. Works on any CUDA 12+ GPU.
|
|
215
|
+
|
|
216
|
+
See [docs/LANDSCAPE.md](./docs/LANDSCAPE.md) for detailed analysis.
|
|
217
|
+
|
|
218
|
+
## Roadmap
|
|
219
|
+
|
|
220
|
+
- [x] Weight snapshot/restore (pure Python path)
|
|
221
|
+
- [x] Rust+CUDA pipelined freeze/restore (double-buffered DMA, O_DIRECT)
|
|
222
|
+
- [x] RAM-backed restore path (PCIe-saturating, 10.69 GB/s)
|
|
223
|
+
- [x] PyO3 bindings + vLLM integration shim
|
|
224
|
+
- [x] H100 / A6000 / Blackwell benchmarks
|
|
225
|
+
- [x] **KV cache snapshot/restore** — the moat (freeze/restore prefix-cached blocks, verified on Llama-3-8B)
|
|
226
|
+
- [x] `pip install thaw-vllm` + CLI (`thaw freeze`, `thaw serve`, `thaw info`)
|
|
227
|
+
- [x] `load_format="thaw"` — native vLLM ModelLoader integration
|
|
228
|
+
- [x] OpenAI-compatible API server (`thaw serve`)
|
|
229
|
+
- [x] Streaming support in API server (SSE, OpenAI-compatible)
|
|
230
|
+
- [x] **Agent fork demo** — clone a running AI session, fork parallel completions from shared KV cache (full-cycle: 14.79 GB/s restore, 0.135s KV restore on H100 SXM)
|
|
231
|
+
- [x] **Multi-GPU / tensor parallel** — 17.2x speedup on Llama-3-70B with 2x A100 (TP=2), bit-exact correctness verified
|
|
232
|
+
- [ ] SGLang integration
|
|
233
|
+
- [ ] Cloud snapshot storage (S3/GCS)
|
|
234
|
+
- [ ] GPUDirect Storage support
|
|
235
|
+
|
|
236
|
+
## Design
|
|
237
|
+
|
|
238
|
+
Full technical architecture, file format spec, and rationale: [DESIGN.md](./DESIGN.md)
|
|
239
|
+
|
|
240
|
+
## License
|
|
241
|
+
|
|
242
|
+
MIT
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# thaw
|
|
2
|
+
|
|
3
|
+
**Fast snapshot/restore for LLM inference. 17x faster cold starts on 70B, multi-GPU tensor parallel, KV cache preservation.**
|
|
4
|
+
|
|
5
|
+
vLLM cold-starts Llama-3-70B on 2x A100 in 546 seconds. thaw restores it in **31.8 seconds** — a **17.2x speedup**. Bit-identical outputs, verified by greedy decoding. Multi-GPU tensor parallel, Rust+CUDA pipelined DMA, and KV cache snapshots that no other tool offers.
|
|
6
|
+
|
|
7
|
+
## Benchmarks
|
|
8
|
+
|
|
9
|
+
**Llama-3-70B-Instruct (141 GB fp16) on 2x A100 SXM 80GB — tensor parallel:**
|
|
10
|
+
|
|
11
|
+
| Method | Time | Speedup |
|
|
12
|
+
|--------|------|---------|
|
|
13
|
+
| Normal vLLM cold start | 546.5s | 1x |
|
|
14
|
+
| **thaw restore (TP=2)** | **31.8s** | **17.2x** |
|
|
15
|
+
| Weight restore only | 10.5s | 6.74 GB/s per rank |
|
|
16
|
+
|
|
17
|
+
**Llama-3-8B-Instruct (16 GB fp16) — single GPU, H100 SXM:**
|
|
18
|
+
|
|
19
|
+
| Method | Time | Throughput | Speedup |
|
|
20
|
+
|--------|------|-----------|---------|
|
|
21
|
+
| Normal vLLM cold start | 20.7s | — | 1x |
|
|
22
|
+
| **thaw (NVMe)** | **3.7s** | 8.26 GB/s | **5.6x** |
|
|
23
|
+
| **thaw (RAM hot path)** | **3.5s** | 10.69 GB/s | **5.9x** |
|
|
24
|
+
|
|
25
|
+
**Agent fork — clone a running AI session (Llama-3-8B-Instruct, H100 SXM):**
|
|
26
|
+
|
|
27
|
+
| Operation | Time | Notes |
|
|
28
|
+
|-----------|------|-------|
|
|
29
|
+
| Weight restore (Rust pipelined) | **1.1s** | **14.79 GB/s** — PCIe Gen5-saturating |
|
|
30
|
+
| KV cache restore | **0.135s** | 65 blocks, 136 MB |
|
|
31
|
+
| Total restore (incl. vLLM init) | **7.3s** | vs 16s normal cold start |
|
|
32
|
+
| Fork 3 parallel completions | **1.6s avg** | All share 872-token cached prefix |
|
|
33
|
+
|
|
34
|
+
All paths produce **bit-identical** inference output. KV cache restore preserves prefix cache across cold starts — new requests skip prefill entirely.
|
|
35
|
+
|
|
36
|
+
<details>
|
|
37
|
+
<summary>More GPUs and models</summary>
|
|
38
|
+
|
|
39
|
+
| GPU | Model | Normal | thaw | Speedup |
|
|
40
|
+
|-----|-------|--------|------|---------|
|
|
41
|
+
| 2x A100 SXM 80GB | Llama-3-70B (TP=2) | 546.5s | 31.8s | **17.2x** |
|
|
42
|
+
| H100 SXM 80GB | Llama-3-8B | 20.7s | 3.5s | **5.9x** |
|
|
43
|
+
| RTX PRO 6000 (Blackwell) | Llama-3-8B | 28.6s | 3.2s | **8.9x** |
|
|
44
|
+
| RTX A6000 | Llama-3-8B | 73.2s | 5.8s | **12.6x** |
|
|
45
|
+
|
|
46
|
+
Larger models show bigger speedups because weight loading dominates more of the total cold start time.
|
|
47
|
+
|
|
48
|
+
</details>
|
|
49
|
+
|
|
50
|
+
## How it works
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
Normal vLLM cold start:
|
|
54
|
+
Download weights → deserialize safetensors → copy to GPU → init KV cache → ready
|
|
55
|
+
[==================================] 20.7s
|
|
56
|
+
|
|
57
|
+
thaw restore:
|
|
58
|
+
Dummy init → DMA snapshot to GPU (pipelined, pinned memory, O_DIRECT)
|
|
59
|
+
[=====] 3.5s
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**Freeze** captures all GPU state into binary snapshots — model weights (`.thaw`) and KV cache blocks (`.thawkv`).
|
|
63
|
+
|
|
64
|
+
**Restore** initializes vLLM with dummy weights (fast — no disk I/O), then overwrites them from the snapshot using double-buffered pipelined DMA through pinned host memory. Two CUDA streams overlap PCIe transfers with disk reads. KV cache blocks are restored separately with their prefix cache hash mappings, so new requests immediately get cache hits.
|
|
65
|
+
|
|
66
|
+
Two restore modes:
|
|
67
|
+
- **Disk**: reads snapshot from NVMe with O_DIRECT, bypassing the kernel page cache. Throughput limited by NVMe bandwidth.
|
|
68
|
+
- **RAM hot path**: snapshot pre-loaded in memory (tmpfs, shared memory, mmap). Pure PCIe DMA — 10.69 GB/s on H100. For production use where snapshots are pre-staged.
|
|
69
|
+
|
|
70
|
+
**KV cache snapshots** capture the prefix-cached blocks that vLLM retains after generation. On restore, block data is DMA'd back to GPU and the prefix cache hash table is reconstructed. Requests with matching prefixes skip prefill — the most expensive part of inference.
|
|
71
|
+
|
|
72
|
+
## Architecture
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
thaw/
|
|
76
|
+
crates/
|
|
77
|
+
thaw-core/ Rust. File format, region tables, I/O. No CUDA dep.
|
|
78
|
+
thaw-cuda-sys/ Rust. FFI bindings to CUDA runtime (cudaMallocHost,
|
|
79
|
+
cudaMemcpyAsync, streams). Built via build.rs.
|
|
80
|
+
thaw-runtime/ Rust. Orchestration: freeze/restore pipelines, double-
|
|
81
|
+
buffered DMA, O_DIRECT, MockCuda for Mac testing.
|
|
82
|
+
thaw-py/ Rust. PyO3 bindings exposing pipelined freeze/restore
|
|
83
|
+
to Python. Builds a native .so via maturin.
|
|
84
|
+
thaw-cli/ Rust. GPU benchmark binary.
|
|
85
|
+
python/
|
|
86
|
+
thaw_vllm/ Python package (pip install thaw-vllm).
|
|
87
|
+
snapshot.py Freeze/restore weights, Rust backend fallback.
|
|
88
|
+
kv_snapshot.py KV cache freeze/restore.
|
|
89
|
+
loader.py vLLM ModelLoader: load_format="thaw".
|
|
90
|
+
server.py OpenAI-compatible API server.
|
|
91
|
+
cli.py CLI: thaw freeze, thaw serve, thaw info.
|
|
92
|
+
vllm_demo.py End-to-end benchmark: normal vs thaw cold start.
|
|
93
|
+
kv_cache_demo.py KV cache snapshot/restore demo with correctness test.
|
|
94
|
+
demos/
|
|
95
|
+
agent_fork.py Agent fork demo: clone session, fork parallel completions.
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**Testing on Mac, shipping on GPU.** The `CudaBackend` trait abstracts all GPU operations. `MockCuda` (a HashMap-backed fake) lets 48 runtime tests run on any machine. The `cuda` feature flag activates real GPU paths only when needed.
|
|
99
|
+
|
|
100
|
+
## Quick start
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
pip install thaw-vllm[serve]
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
import thaw_vllm
|
|
108
|
+
from vllm import LLM, SamplingParams
|
|
109
|
+
|
|
110
|
+
# Freeze: save model weights to a snapshot
|
|
111
|
+
llm = LLM(model="meta-llama/Meta-Llama-3-8B", dtype="float16", enforce_eager=True)
|
|
112
|
+
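thaw_vllm.freeze_model_pipelined(model, "/path/to/weights.thaw")  # NOTE: `model` is not defined above — presumably the torch module held by `llm`, not the `LLM` wrapper; confirm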
|
|
113
|
+
|
|
114
|
+
# Restore: two lines, 5.9x faster cold start
|
|
115
|
+
llm = thaw_vllm.load("meta-llama/Meta-Llama-3-8B", "/path/to/weights.thaw")
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Or use `load_format="thaw"` directly with vLLM:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
import thaw_vllm # registers the loader
|
|
122
|
+
llm = LLM(model="meta-llama/Meta-Llama-3-8B",
|
|
123
|
+
load_format="thaw",
|
|
124
|
+
model_loader_extra_config={"snapshot": "/path/to/weights.thaw"})
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Multi-GPU** — tensor parallel with per-rank snapshots:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
# Freeze: each GPU saves its shard
|
|
131
|
+
llm = LLM(model="meta-llama/Meta-Llama-3-70B-Instruct", tensor_parallel_size=2, ...)
|
|
132
|
+
thaw_vllm.freeze_model_tp(llm, "/path/to/weights.thaw")
|
|
133
|
+
# Creates: weights.thaw (rank 0), weights.rank1.thaw (rank 1)
|
|
134
|
+
|
|
135
|
+
# Restore: 17.2x faster than normal cold start
|
|
136
|
+
llm = thaw_vllm.load("meta-llama/Meta-Llama-3-70B-Instruct", "/path/to/weights.thaw",
|
|
137
|
+
tensor_parallel_size=2)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
**Agent fork demo** — clone a running AI session, fork parallel completions:
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
python demos/agent_fork.py --snapshot weights.thaw
|
|
144
|
+
python demos/agent_fork.py --snapshot weights.thaw --full-cycle # destroy + restore
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
**CLI:**
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
thaw freeze --model meta-llama/Meta-Llama-3-8B --output weights.thaw
|
|
151
|
+
thaw serve --model meta-llama/Meta-Llama-3-8B --snapshot weights.thaw
|
|
152
|
+
thaw info weights.thaw
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
<details>
|
|
156
|
+
<summary>Building with Rust+CUDA backend (optional, higher throughput)</summary>
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
git clone https://github.com/matteso1/thaw.git && cd thaw
|
|
160
|
+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
|
161
|
+
source "$HOME/.cargo/env"
|
|
162
|
+
pip install "maturin[patchelf]" vllm
|
|
163
|
+
cd crates/thaw-py && maturin develop --release --features cuda && cd ../..
|
|
164
|
+
pip install -e ".[serve]"
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
</details>
|
|
168
|
+
|
|
169
|
+
## Competitive landscape
|
|
170
|
+
|
|
171
|
+
The model loading space is active. Here's how thaw compares:
|
|
172
|
+
|
|
173
|
+
| Project | Approach | Throughput | Limitations |
|
|
174
|
+
|---------|----------|-----------|-------------|
|
|
175
|
+
| **thaw** | Pipelined DMA, pinned memory, O_DIRECT + KV cache snapshot | 6.7-14.8 GB/s per GPU | — |
|
|
176
|
+
| fastsafetensors (IBM) | GDS + 4x NVMe RAID0 | 26.4 GB/s | Requires GDS setup + RAID hardware |
|
|
177
|
+
| NVIDIA Model Streamer | Multi-threaded concurrent streaming | ~2 GB/s (single SSD) | NVIDIA-maintained, less flexible |
|
|
178
|
+
| CoreWeave Tensorizer | HTTP/S3 streaming + deserialization | ~4.6 GB/s local | Tied to CoreWeave ecosystem |
|
|
179
|
+
| vLLM Sleep Mode | Offload to CPU RAM, reload | 0.26-3s | Not a cold start — requires prior warm load |
|
|
180
|
+
| Modal GPU Snapshots | CUDA checkpoint/restore API | ~10x reduction | Alpha. Doesn't help with large model weight loading |
|
|
181
|
+
| InferX | GPU runtime snapshotting | Claims 2s for 70B | No public code or benchmarks |
|
|
182
|
+
|
|
183
|
+
**thaw's differentiation:**
|
|
184
|
+
1. **KV cache snapshot/restore** — nobody else does this. Preserves prefix cache across cold starts, eliminates prefill. Enables agent forking, session migration, warm handoff.
|
|
185
|
+
2. **Single NVMe performance** — most deployments don't have RAID0. thaw already matches or beats multi-threaded alternatives on one drive.
|
|
186
|
+
3. **No special hardware** — no GDS, no RAID, no driver patches. Works on any CUDA 12+ GPU.
|
|
187
|
+
|
|
188
|
+
See [docs/LANDSCAPE.md](./docs/LANDSCAPE.md) for detailed analysis.
|
|
189
|
+
|
|
190
|
+
## Roadmap
|
|
191
|
+
|
|
192
|
+
- [x] Weight snapshot/restore (pure Python path)
|
|
193
|
+
- [x] Rust+CUDA pipelined freeze/restore (double-buffered DMA, O_DIRECT)
|
|
194
|
+
- [x] RAM-backed restore path (PCIe-saturating, 10.69 GB/s)
|
|
195
|
+
- [x] PyO3 bindings + vLLM integration shim
|
|
196
|
+
- [x] H100 / A6000 / Blackwell benchmarks
|
|
197
|
+
- [x] **KV cache snapshot/restore** — the moat (freeze/restore prefix-cached blocks, verified on Llama-3-8B)
|
|
198
|
+
- [x] `pip install thaw-vllm` + CLI (`thaw freeze`, `thaw serve`, `thaw info`)
|
|
199
|
+
- [x] `load_format="thaw"` — native vLLM ModelLoader integration
|
|
200
|
+
- [x] OpenAI-compatible API server (`thaw serve`)
|
|
201
|
+
- [x] Streaming support in API server (SSE, OpenAI-compatible)
|
|
202
|
+
- [x] **Agent fork demo** — clone a running AI session, fork parallel completions from shared KV cache (full-cycle: 14.79 GB/s restore, 0.135s KV restore on H100 SXM)
|
|
203
|
+
- [x] **Multi-GPU / tensor parallel** — 17.2x speedup on Llama-3-70B with 2x A100 (TP=2), bit-exact correctness verified
|
|
204
|
+
- [ ] SGLang integration
|
|
205
|
+
- [ ] Cloud snapshot storage (S3/GCS)
|
|
206
|
+
- [ ] GPUDirect Storage support
|
|
207
|
+
|
|
208
|
+
## Design
|
|
209
|
+
|
|
210
|
+
Full technical architecture, file format spec, and rationale: [DESIGN.md](./DESIGN.md)
|
|
211
|
+
|
|
212
|
+
## License
|
|
213
|
+
|
|
214
|
+
MIT
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "thaw-vllm"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Fast snapshot/restore for LLM inference. 17x faster cold starts, multi-GPU tensor parallel, KV cache snapshots."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Nils Matteson"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["llm", "inference", "vllm", "gpu", "cold-start", "kv-cache"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"torch>=2.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
vllm = ["vllm>=0.18"]
|
|
29
|
+
serve = ["vllm>=0.18", "fastapi", "uvicorn"]
|
|
30
|
+
all = ["vllm>=0.18", "fastapi", "uvicorn"]
|
|
31
|
+
|
|
32
|
+
[project.scripts]
|
|
33
|
+
thaw = "thaw_vllm.cli:main"
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/matteso1/thaw"
|
|
37
|
+
Repository = "https://github.com/matteso1/thaw"
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["python"]
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.package-dir]
|
|
43
|
+
"" = "python"
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# thaw_vllm — GPU snapshot/restore for vLLM model weights and KV cache.
|
|
2
|
+
#
|
|
3
|
+
# This module provides two core operations (pipelined, tensor-parallel and KV-cache variants are exported as well):
|
|
4
|
+
#
|
|
5
|
+
# freeze_model(model, path) — snapshot all model weights to a .thaw file
|
|
6
|
+
# restore_model(model, path) — load weights from a .thaw file back onto GPU
|
|
7
|
+
#
|
|
8
|
+
# The file format is thaw's binary format (see crates/thaw-core). This
|
|
9
|
+
# Python implementation reads/writes the same byte layout as the Rust
|
|
10
|
+
# side, so files are interchangeable. The Python path uses PyTorch's
|
|
11
|
+
# own CUDA memory operations under the hood — no custom FFI required.
|
|
12
|
+
#
|
|
13
|
+
# This is the MVP integration layer. In production, the Rust
|
|
14
|
+
# implementation (via thaw-py/PyO3) replaces this for higher
|
|
15
|
+
# throughput. The file format is identical either way.
|
|
16
|
+
|
|
17
|
+
from thaw_vllm.snapshot import (
|
|
18
|
+
freeze_model,
|
|
19
|
+
freeze_model_pipelined,
|
|
20
|
+
freeze_model_tp,
|
|
21
|
+
restore_model,
|
|
22
|
+
restore_model_from_ram,
|
|
23
|
+
restore_model_pipelined,
|
|
24
|
+
)
|
|
25
|
+
from thaw_vllm.kv_snapshot import (
|
|
26
|
+
freeze_kv_cache,
|
|
27
|
+
freeze_kv_cache_tp,
|
|
28
|
+
restore_kv_cache,
|
|
29
|
+
restore_kv_cache_tp,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Register load_format="thaw" with vLLM when available.
|
|
33
|
+
try:
|
|
34
|
+
from thaw_vllm.loader import ThawModelLoader # noqa: F401
|
|
35
|
+
except ImportError:
|
|
36
|
+
# vLLM not installed — loader registration is optional.
|
|
37
|
+
pass
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def load(model: str, snapshot: str, kv_snapshot: str = None, **kwargs):
    """One-line thaw-powered model loading.

    Builds a vLLM ``LLM`` with ``load_format="thaw"`` so that weights are
    taken from ``snapshot``, then optionally restores a KV-cache snapshot.

    Usage:
        import thaw_vllm
        llm = thaw_vllm.load("meta-llama/Meta-Llama-3-8B", "/path/to/weights.thaw")

    Multi-GPU:
        llm = thaw_vllm.load("meta-llama/Meta-Llama-3-70B", "/path/to/weights.thaw",
                             tensor_parallel_size=4)

    Args:
        model: Model identifier forwarded to ``LLM(model=...)``.
        snapshot: Path of the weight snapshot; handed to the loader as
            ``model_loader_extra_config["snapshot"]``.
        kv_snapshot: Optional path of a KV-cache snapshot. When truthy, the
            cache is restored after the engine has been constructed.
        **kwargs: Additional keyword arguments forwarded to ``LLM``.
            ``enforce_eager`` defaults to ``True`` and ``dtype`` to
            ``"float16"`` unless the caller sets them. A caller-supplied
            ``model_loader_extra_config`` mapping is merged with the snapshot
            path; the ``snapshot`` argument takes precedence over a
            ``"snapshot"`` key inside that mapping.

    Returns:
        The constructed ``LLM`` instance.
    """
    # Imported lazily: the package itself treats vLLM as optional.
    from vllm import LLM

    kwargs.setdefault("enforce_eager", True)
    kwargs.setdefault("dtype", "float16")

    # Read (not pop) so the value is still forwarded to LLM below.
    tp_size = kwargs.get("tensor_parallel_size", 1)

    # Pop the caller's extra config so it is not passed twice to LLM(), which
    # would raise "got multiple values for keyword argument". Copy it so the
    # caller's mapping is never mutated.
    extra_config = dict(kwargs.pop("model_loader_extra_config", None) or {})
    extra_config["snapshot"] = snapshot

    llm = LLM(
        model=model,
        load_format="thaw",
        model_loader_extra_config=extra_config,
        **kwargs,
    )

    if kv_snapshot:
        # Tensor-parallel engines use the TP-aware restore entry point.
        if tp_size > 1:
            restore_kv_cache_tp(llm, kv_snapshot)
        else:
            restore_kv_cache(llm, kv_snapshot)

    return llm
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# Public API of the package. Every name listed here must be bound at import
# time, otherwise `from thaw_vllm import *` raises AttributeError.
__all__ = [
    "freeze_model",
    "freeze_model_pipelined",
    "freeze_model_tp",
    "restore_model",
    "restore_model_from_ram",
    "restore_model_pipelined",
    "freeze_kv_cache",
    "freeze_kv_cache_tp",
    "restore_kv_cache",
    "restore_kv_cache_tp",
    "load",
]

# ThawModelLoader comes from a guarded import and is left unbound when that
# import fails (vLLM not installed), so only advertise it when it exists.
if "ThawModelLoader" in globals():
    __all__.append("ThawModelLoader")
|