turboadam 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- turboadam-0.1.0/LICENSE +21 -0
- turboadam-0.1.0/PKG-INFO +272 -0
- turboadam-0.1.0/README.md +241 -0
- turboadam-0.1.0/pyproject.toml +46 -0
- turboadam-0.1.0/setup.cfg +4 -0
- turboadam-0.1.0/src/turboadam/__init__.py +10 -0
- turboadam-0.1.0/src/turboadam/costate.py +464 -0
- turboadam-0.1.0/src/turboadam/oneq.py +77 -0
- turboadam-0.1.0/src/turboadam/optimizer.py +292 -0
- turboadam-0.1.0/src/turboadam/quantize.py +299 -0
- turboadam-0.1.0/src/turboadam/triton_kernels.py +516 -0
- turboadam-0.1.0/src/turboadam/utils.py +66 -0
- turboadam-0.1.0/src/turboadam.egg-info/PKG-INFO +272 -0
- turboadam-0.1.0/src/turboadam.egg-info/SOURCES.txt +24 -0
- turboadam-0.1.0/src/turboadam.egg-info/dependency_links.txt +1 -0
- turboadam-0.1.0/src/turboadam.egg-info/requires.txt +11 -0
- turboadam-0.1.0/src/turboadam.egg-info/top_level.txt +1 -0
- turboadam-0.1.0/tests/test_costate.py +678 -0
- turboadam-0.1.0/tests/test_oneq.py +70 -0
- turboadam-0.1.0/tests/test_optimizer.py +742 -0
- turboadam-0.1.0/tests/test_quantize.py +63 -0
- turboadam-0.1.0/tests/test_roundtrip.py +457 -0
- turboadam-0.1.0/tests/test_triton_costate.py +154 -0
- turboadam-0.1.0/tests/test_triton_smoke.py +33 -0
- turboadam-0.1.0/tests/test_triton_v.py +198 -0
- turboadam-0.1.0/tests/test_utils.py +157 -0
turboadam-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 David Kogan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
turboadam-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: turboadam
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Drop-in Adam/AdamW replacement with 6.5× optimizer-state memory reduction. One line change, no model modifications. Compresses both moments in-place during training with bounded per-element error guarantees.
|
|
5
|
+
Author-email: David Kogan <davidkny22@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/davidkny22/turboadam
|
|
8
|
+
Project-URL: Repository, https://github.com/davidkny22/turboadam
|
|
9
|
+
Project-URL: Issues, https://github.com/davidkny22/turboadam/issues
|
|
10
|
+
Keywords: optimizer,adam,adamw,pytorch,memory-efficient,quantization,deep-learning,training
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: torch>=2.2.0
|
|
22
|
+
Requires-Dist: numpy>=1.26.0
|
|
23
|
+
Provides-Extra: triton
|
|
24
|
+
Requires-Dist: triton>=2.2.0; extra == "triton"
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
27
|
+
Requires-Dist: matplotlib>=3.8.0; extra == "dev"
|
|
28
|
+
Requires-Dist: datasets>=2.18.0; extra == "dev"
|
|
29
|
+
Requires-Dist: transformers>=4.40.0; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# TurboAdam
|
|
33
|
+
|
|
34
|
+
[]() []() []() []()
|
|
35
|
+
|
|
36
|
+
**Drop-in Adam/AdamW replacement with 6.5× optimizer-state memory reduction.**
|
|
37
|
+
|
|
38
|
+
One line change. No model modifications. No training-loop changes.
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from turboadam import TurboAdam
|
|
42
|
+
|
|
43
|
+
optimizer = TurboAdam(model.parameters(), lr=1e-3)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Why TurboAdam?
|
|
49
|
+
|
|
50
|
+
Adam stores two full-precision copies of every parameter (first and second moments). For a 7B model that is **28 GB** of optimizer state alone — often the memory bottleneck that forces smaller batch sizes or shorter context lengths.
|
|
51
|
+
|
|
52
|
+
TurboAdam compresses both moments in-place during training, cutting optimizer-state memory from **64 bits/param → 9.9 bits/param** (6.5× reduction). On GPT-2 124M it converges within **+0.25 loss points** of full-precision AdamW (**1.2% relative** — within run-to-run noise).
|
|
53
|
+
|
|
54
|
+
| Model size | AdamW optimizer state | TurboAdam | Savings |
|
|
55
|
+
|-----------|----------------------|-----------|---------|
|
|
56
|
+
| 125M (GPT-2) | 0.50 GB | 0.08 GB | **0.42 GB** |
|
|
57
|
+
| 7B | 28.0 GB | **4.3 GB** | **23.7 GB** |
|
|
58
|
+
| 70B | 280.0 GB | **43.0 GB** | **237.0 GB** |
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Quick start
|
|
63
|
+
|
|
64
|
+
### Install
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install turboadam
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
For the latest source version:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install git+https://github.com/davidkny22/turboadam.git
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Requirements: Python >=3.10, PyTorch >=2.2, NumPy >=1.26. Triton >=2.2 is optional (CUDA speed-ups); install it with `pip install "turboadam[triton]"`.
|
|
77
|
+
|
|
78
|
+
### Use
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from turboadam import TurboAdam
|
|
82
|
+
|
|
83
|
+
# Drop-in replacement for torch.optim.AdamW
|
|
84
|
+
optimizer = TurboAdam(
|
|
85
|
+
model.parameters(),
|
|
86
|
+
lr=6e-4,
|
|
87
|
+
betas=(0.9, 0.999),
|
|
88
|
+
weight_decay=0.01,
|
|
89
|
+
v_bits=4, # 2, 3, 4, 6, or 8
|
|
90
|
+
compress_m=True, # CoState first-moment compression
|
|
91
|
+
compress_v=True, # Log-scale second-moment compression
|
|
92
|
+
)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## How it works
|
|
98
|
+
|
|
99
|
+
TurboAdam combines two independent, separable compression techniques. You can enable either or both.
|
|
100
|
+
|
|
101
|
+
### 1Q — Second-moment (v) compression
|
|
102
|
+
|
|
103
|
+
v is stored as n-bit **log-scale quantized** values per 128-element block:
|
|
104
|
+
|
|
105
|
+
1. **Decompress** block min/max → reconstruct v via exp interpolation
|
|
106
|
+
2. **EMA update**: `v_new = β₂·v_old + (1-β₂)·g²`
|
|
107
|
+
3. **Bias-correct** denominator: `denom = √(v / (1-β₂ᵗ)) + ε`
|
|
108
|
+
4. **Re-compress** with **stochastic rounding** (unbiased — prevents systematic EMA drift)
|
|
109
|
+
|
|
110
|
+
Storage per block: one `n_bits`-wide index per element (held in uint8 storage) + 2× fp16 scales (32 bits per 128-element block ≈ 0.25 bits/param of overhead).
|
|
111
|
+
Default **4-bit** = **4.25 bits/param**.
|
|
112
|
+
|
|
113
|
+
**Key insight:** Theoretical analysis predicted 4-bit would fail due to accumulated quantization noise (22× amplification from β₂=0.999 EMA). In practice it works because quantization errors are correlated — same elements map to the same buckets step-to-step.
|
|
114
|
+
|
|
115
|
+
### CoState — First-moment (m) compression
|
|
116
|
+
|
|
117
|
+
Gradient-residual decomposition: `m = α·g + δ`
|
|
118
|
+
|
|
119
|
+
- `α = (m·g) / (g·g)` — scalar projection onto current gradient
|
|
120
|
+
- `δ = m - α·g` — residual orthogonal to gradient
|
|
121
|
+
|
|
122
|
+
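δ is partitioned into 128-element blocks and classified into three **costates** by comparing a block statistic `r` against its 10th and 90th percentiles (`P₁₀` and `P₉₀`, set by `null_pct` and `amp_pct`):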
|
|
123
|
+
|
|
124
|
+
| Costate | Condition | Storage | Typical share |
|
|
125
|
+
|---------|-----------|---------|---------------|
|
|
126
|
+
| **Null** | `r < P₁₀` | 1-bit flag | ~10% |
|
|
127
|
+
| **Phase** | `P₁₀ ≤ r < P₉₀` | 1-bit sign per element | ~80% |
|
|
128
|
+
| **Amplitude** | `r ≥ P₉₀` | 1-bit sign + fp16 block scale | ~10% |
|
|
129
|
+
|
|
130
|
+
**Key insight:** For Adam, direction matters more than magnitude because `m/√v` normalizes per-element. Sign-only encoding preserves direction for 80% of components. This is why CoState works at ~2 bits/param while low-rank approaches fail — they preserve magnitude for few directions but lose direction for many.
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Results
|
|
135
|
+
|
|
136
|
+
### Memory
|
|
137
|
+
|
|
138
|
+
Measured on one GPT-2 layer (9 parameter tensors, CUDA).
|
|
139
|
+
|
|
140
|
+
| Configuration | Persistent optimizer memory | vs AdamW |
|
|
141
|
+
|--------------|----------------------------|----------|
|
|
142
|
+
| AdamW (baseline) | 56.6 MB | 1.00× |
|
|
143
|
+
| TurboAdam (v only, 4-bit) | 35.6 MB | **0.63×** |
|
|
144
|
+
| TurboAdam (m only, CoState) | 29.6 MB | **0.52×** |
|
|
145
|
+
| **TurboAdam (m + v, default)** | **8.6 MB** | **0.15×** |
|
|
146
|
+
|
|
147
|
+
### Speed
|
|
148
|
+
|
|
149
|
+
Measured on one GPT-2 layer, RTX 4070, 200-step average.
|
|
150
|
+
|
|
151
|
+
| Configuration | Time/step | vs AdamW |
|
|
152
|
+
|--------------|-----------|----------|
|
|
153
|
+
| AdamW (baseline) | 12.0 ms | 1.00× |
|
|
154
|
+
| TurboAdam (v only) | 8.4 ms | **0.70×** |
|
|
155
|
+
| TurboAdam (m + v, default) | 17.0 ms | **1.41×** |
|
|
156
|
+
|
|
157
|
+
The v-only path is actually **faster** than AdamW because 4-bit log-scale decompression is cheaper than full fp32 EMA updates on small tensors. The m+v path adds ~40% overhead from CoState encode/decode.
|
|
158
|
+
|
|
159
|
+
### Convergence — GPT-2 124M on WikiText-103
|
|
160
|
+
|
|
161
|
+
| Configuration | Loss @ step 500 | Gap vs AdamW |
|
|
162
|
+
|--------------|-----------------|--------------|
|
|
163
|
+
| AdamW (full fp32) | 19.28 | — |
|
|
164
|
+
| TurboAdam (8-bit v + CoState) | 19.79 | +0.51 |
|
|
165
|
+
| **TurboAdam (4-bit v + CoState, default)** | **19.58** | **+0.25** |
|
|
166
|
+
| TurboAdam (CoState only, fp32 v) | 19.80 | +0.52 |
|
|
167
|
+
| TurboAdam (v only, fp32 m) | 19.28 | ~0.00 |
|
|
168
|
+
|
|
169
|
+
The +0.25 gap is structural to CoState's sign-only encoding and shrinks as training progresses (+2.94 at step 50, +0.25 at step 500). Threshold tuning and error feedback do not reduce it. For workloads where every tenth of a point matters, run with `compress_m=False` for v-only compression at zero convergence cost.
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## API
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
TurboAdam(
|
|
177
|
+
params, # iterable of parameters or param groups
|
|
178
|
+
lr=1e-3, # learning rate
|
|
179
|
+
betas=(0.9, 0.999), # (β₁, β₂) EMA decay coefficients
|
|
180
|
+
eps=1e-8, # numerical stability
|
|
181
|
+
weight_decay=0.0, # AdamW-style decoupled weight decay
|
|
182
|
+
block_size=128, # quantization block size (elements)
|
|
183
|
+
v_bits=4, # bits per element for v: 2, 3, 4, 6, or 8
|
|
184
|
+
compress_m=True, # enable CoState m compression
|
|
185
|
+
compress_v=True, # enable v compression
|
|
186
|
+
null_pct=0.10, # CoState null threshold percentile
|
|
187
|
+
amp_pct=0.90, # CoState amplitude threshold percentile
|
|
188
|
+
error_feedback=False, # CoState error feedback (tested, no improvement)
|
|
189
|
+
capturable=False, # CUDA graph capture (not yet supported)
|
|
190
|
+
min_m_compress_elements=4096, # minimum param size for CoState m compression
|
|
191
|
+
)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
All arguments are standard PyTorch Optimizer kwargs plus TurboAdam-specific compression controls. State dicts are fully compatible with `torch.save` / `torch.load`.
|
|
195
|
+
|
|
196
|
+
**Notes:**
|
|
197
|
+
- `torch.compile` will graph-break at `opt.step()` (expected for Python-loop optimizers; does not affect correctness).
|
|
198
|
+
- FSDP / DeepSpeed ZeRO compatibility is on the [roadmap](ROADMAP.md) for v0.2.0.
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## Validation
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
# Full test suite (151 tests)
|
|
206
|
+
python -m pytest tests/ -q
|
|
207
|
+
|
|
208
|
+
# Quick convergence smoke test
|
|
209
|
+
python -c "
|
|
210
|
+
import torch, torch.nn as nn
|
|
211
|
+
from turboadam import TurboAdam
|
|
212
|
+
|
|
213
|
+
torch.manual_seed(0)
|
|
214
|
+
x = nn.Parameter(torch.randn(50, device='cuda'))
|
|
215
|
+
opt = TurboAdam([x], lr=1e-2)
|
|
216
|
+
for _ in range(200):
|
|
217
|
+
opt.zero_grad()
|
|
218
|
+
loss = (x**2).sum()
|
|
219
|
+
loss.backward()
|
|
220
|
+
opt.step()
|
|
221
|
+
print(f'Final loss: {loss.item():.6f}') # < 5% of initial
|
|
222
|
+
"
|
|
223
|
+
|
|
224
|
+
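# GPT-2 124M training run (~36 min on RTX 4070). This and the commands below need a repository checkout: experiments/ and scripts/ are not shipped in the sdist.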
|
|
225
|
+
python experiments/train_turboadam.py --steps 500 --log_every 50
|
|
226
|
+
|
|
227
|
+
# Speed benchmark
|
|
228
|
+
python scripts/benchmark_speed.py
|
|
229
|
+
|
|
230
|
+
# Memory profiler
|
|
231
|
+
python scripts/profile_memory.py
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## Design decisions
|
|
237
|
+
|
|
238
|
+
1. **Compress-every-step (not freeze-refresh).** The original design froze v for 1000 steps and refreshed periodically. This caused a +3.75 loss gap from v staleness. Compress-every-step with stochastic rounding eliminates staleness — the EMA runs continuously on the compressed state.
|
|
239
|
+
|
|
240
|
+
2. **4-bit default.** 4-bit gives 6.5× compression with +0.25 gap. 8-bit gives 4.1× with +0.51. The sweet spot is 4-bit — going higher barely improves precision, going lower risks noise accumulation.
|
|
241
|
+
|
|
242
|
+
3. **Stochastic rounding.** Unbiased rounding prevents systematic drift in the EMA. Without it, deterministic rounding accumulates a bias of ~1000× the per-step error (for β₂=0.999).
|
|
243
|
+
|
|
244
|
+
4. **Sign-only for CoState (not low-rank).** We tested LoRA-Pre style low-rank projection (rank 8–512). It fails for Adam because momentum is NOT low-rank — rank-8 captures only 4% of energy. Sign-only encoding captures direction for ALL elements, which is what Adam's per-coordinate denominator normalization needs.
|
|
245
|
+
|
|
246
|
+
5. **P10/P90 thresholds.** Extensive testing showed threshold changes (P5/P85, P5/P80, P10/P95, etc.) produce identical convergence. The gap is structural to sign encoding, not the null/phase/amplitude split.
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## Project status
|
|
251
|
+
|
|
252
|
+
- **Phase 1** (current): RTX 4070 8GB, models ≤ 125M — **complete**. Correctness validated, speed optimized, Triton kernels production-ready.
|
|
253
|
+
- **Phase 2** (next): DGX Spark 128GB, models up to 7B — pending hardware.
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## Citation
|
|
258
|
+
|
|
259
|
+
```bibtex
|
|
260
|
+
@misc{kogan2026turboadam,
|
|
261
|
+
title={TurboAdam: Memory-Efficient Adam via In-Place Optimizer State Compression},
|
|
262
|
+
author={Kogan, David},
|
|
263
|
+
year={2026},
|
|
264
|
+
howpublished={\url{https://github.com/davidkny22/turboadam}}
|
|
265
|
+
}
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
---
|
|
269
|
+
|
|
270
|
+
## License
|
|
271
|
+
|
|
272
|
+
MIT
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
# TurboAdam
|
|
2
|
+
|
|
3
|
+
[]() []() []() []()
|
|
4
|
+
|
|
5
|
+
**Drop-in Adam/AdamW replacement with 6.5× optimizer-state memory reduction.**
|
|
6
|
+
|
|
7
|
+
One line change. No model modifications. No training-loop changes.
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from turboadam import TurboAdam
|
|
11
|
+
|
|
12
|
+
optimizer = TurboAdam(model.parameters(), lr=1e-3)
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Why TurboAdam?
|
|
18
|
+
|
|
19
|
+
Adam stores two full-precision copies of every parameter (first and second moments). For a 7B model that is **28 GB** of optimizer state alone — often the memory bottleneck that forces smaller batch sizes or shorter context lengths.
|
|
20
|
+
|
|
21
|
+
TurboAdam compresses both moments in-place during training, cutting optimizer-state memory from **64 bits/param → 9.9 bits/param** (6.5× reduction). On GPT-2 124M it converges within **+0.25 loss points** of full-precision AdamW (**1.2% relative** — within run-to-run noise).
|
|
22
|
+
|
|
23
|
+
| Model size | AdamW optimizer state | TurboAdam | Savings |
|
|
24
|
+
|-----------|----------------------|-----------|---------|
|
|
25
|
+
| 125M (GPT-2) | 0.50 GB | 0.08 GB | **0.42 GB** |
|
|
26
|
+
| 7B | 28.0 GB | **4.3 GB** | **23.7 GB** |
|
|
27
|
+
| 70B | 280.0 GB | **43.0 GB** | **237.0 GB** |
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Quick start
|
|
32
|
+
|
|
33
|
+
### Install
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install turboadam
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
For the latest source version:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install git+https://github.com/davidkny22/turboadam.git
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Requirements: Python >=3.10, PyTorch >=2.2, NumPy >=1.26. Triton >=2.2 is optional (CUDA speed-ups); install it with `pip install "turboadam[triton]"`.
|
|
46
|
+
|
|
47
|
+
### Use
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from turboadam import TurboAdam
|
|
51
|
+
|
|
52
|
+
# Drop-in replacement for torch.optim.AdamW
|
|
53
|
+
optimizer = TurboAdam(
|
|
54
|
+
model.parameters(),
|
|
55
|
+
lr=6e-4,
|
|
56
|
+
betas=(0.9, 0.999),
|
|
57
|
+
weight_decay=0.01,
|
|
58
|
+
v_bits=4, # 2, 3, 4, 6, or 8
|
|
59
|
+
compress_m=True, # CoState first-moment compression
|
|
60
|
+
compress_v=True, # Log-scale second-moment compression
|
|
61
|
+
)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## How it works
|
|
67
|
+
|
|
68
|
+
TurboAdam combines two independent, separable compression techniques. You can enable either or both.
|
|
69
|
+
|
|
70
|
+
### 1Q — Second-moment (v) compression
|
|
71
|
+
|
|
72
|
+
v is stored as n-bit **log-scale quantized** values per 128-element block:
|
|
73
|
+
|
|
74
|
+
1. **Decompress** block min/max → reconstruct v via exp interpolation
|
|
75
|
+
2. **EMA update**: `v_new = β₂·v_old + (1-β₂)·g²`
|
|
76
|
+
3. **Bias-correct** denominator: `denom = √(v / (1-β₂ᵗ)) + ε`
|
|
77
|
+
4. **Re-compress** with **stochastic rounding** (unbiased — prevents systematic EMA drift)
|
|
78
|
+
|
|
79
|
+
Storage per block: one `n_bits`-wide index per element (held in uint8 storage) + 2× fp16 scales (32 bits per 128-element block ≈ 0.25 bits/param of overhead).
|
|
80
|
+
Default **4-bit** = **4.25 bits/param**.
|
|
81
|
+
|
|
82
|
+
**Key insight:** Theoretical analysis predicted 4-bit would fail due to accumulated quantization noise (22× amplification from β₂=0.999 EMA). In practice it works because quantization errors are correlated — same elements map to the same buckets step-to-step.
|
|
83
|
+
|
|
84
|
+
### CoState — First-moment (m) compression
|
|
85
|
+
|
|
86
|
+
Gradient-residual decomposition: `m = α·g + δ`
|
|
87
|
+
|
|
88
|
+
- `α = (m·g) / (g·g)` — scalar projection onto current gradient
|
|
89
|
+
- `δ = m - α·g` — residual orthogonal to gradient
|
|
90
|
+
|
|
91
|
+
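δ is partitioned into 128-element blocks and classified into three **costates** by comparing a block statistic `r` against its 10th and 90th percentiles (`P₁₀` and `P₉₀`, set by `null_pct` and `amp_pct`):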
|
|
92
|
+
|
|
93
|
+
| Costate | Condition | Storage | Typical share |
|
|
94
|
+
|---------|-----------|---------|---------------|
|
|
95
|
+
| **Null** | `r < P₁₀` | 1-bit flag | ~10% |
|
|
96
|
+
| **Phase** | `P₁₀ ≤ r < P₉₀` | 1-bit sign per element | ~80% |
|
|
97
|
+
| **Amplitude** | `r ≥ P₉₀` | 1-bit sign + fp16 block scale | ~10% |
|
|
98
|
+
|
|
99
|
+
**Key insight:** For Adam, direction matters more than magnitude because `m/√v` normalizes per-element. Sign-only encoding preserves direction for 80% of components. This is why CoState works at ~2 bits/param while low-rank approaches fail — they preserve magnitude for few directions but lose direction for many.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Results
|
|
104
|
+
|
|
105
|
+
### Memory
|
|
106
|
+
|
|
107
|
+
Measured on one GPT-2 layer (9 parameter tensors, CUDA).
|
|
108
|
+
|
|
109
|
+
| Configuration | Persistent optimizer memory | vs AdamW |
|
|
110
|
+
|--------------|----------------------------|----------|
|
|
111
|
+
| AdamW (baseline) | 56.6 MB | 1.00× |
|
|
112
|
+
| TurboAdam (v only, 4-bit) | 35.6 MB | **0.63×** |
|
|
113
|
+
| TurboAdam (m only, CoState) | 29.6 MB | **0.52×** |
|
|
114
|
+
| **TurboAdam (m + v, default)** | **8.6 MB** | **0.15×** |
|
|
115
|
+
|
|
116
|
+
### Speed
|
|
117
|
+
|
|
118
|
+
Measured on one GPT-2 layer, RTX 4070, 200-step average.
|
|
119
|
+
|
|
120
|
+
| Configuration | Time/step | vs AdamW |
|
|
121
|
+
|--------------|-----------|----------|
|
|
122
|
+
| AdamW (baseline) | 12.0 ms | 1.00× |
|
|
123
|
+
| TurboAdam (v only) | 8.4 ms | **0.70×** |
|
|
124
|
+
| TurboAdam (m + v, default) | 17.0 ms | **1.41×** |
|
|
125
|
+
|
|
126
|
+
The v-only path is actually **faster** than AdamW because 4-bit log-scale decompression is cheaper than full fp32 EMA updates on small tensors. The m+v path adds ~40% overhead from CoState encode/decode.
|
|
127
|
+
|
|
128
|
+
### Convergence — GPT-2 124M on WikiText-103
|
|
129
|
+
|
|
130
|
+
| Configuration | Loss @ step 500 | Gap vs AdamW |
|
|
131
|
+
|--------------|-----------------|--------------|
|
|
132
|
+
| AdamW (full fp32) | 19.28 | — |
|
|
133
|
+
| TurboAdam (8-bit v + CoState) | 19.79 | +0.51 |
|
|
134
|
+
| **TurboAdam (4-bit v + CoState, default)** | **19.58** | **+0.25** |
|
|
135
|
+
| TurboAdam (CoState only, fp32 v) | 19.80 | +0.52 |
|
|
136
|
+
| TurboAdam (v only, fp32 m) | 19.28 | ~0.00 |
|
|
137
|
+
|
|
138
|
+
The +0.25 gap is structural to CoState's sign-only encoding and shrinks as training progresses (+2.94 at step 50, +0.25 at step 500). Threshold tuning and error feedback do not reduce it. For workloads where every tenth of a point matters, run with `compress_m=False` for v-only compression at zero convergence cost.
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## API
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
TurboAdam(
|
|
146
|
+
params, # iterable of parameters or param groups
|
|
147
|
+
lr=1e-3, # learning rate
|
|
148
|
+
betas=(0.9, 0.999), # (β₁, β₂) EMA decay coefficients
|
|
149
|
+
eps=1e-8, # numerical stability
|
|
150
|
+
weight_decay=0.0, # AdamW-style decoupled weight decay
|
|
151
|
+
block_size=128, # quantization block size (elements)
|
|
152
|
+
v_bits=4, # bits per element for v: 2, 3, 4, 6, or 8
|
|
153
|
+
compress_m=True, # enable CoState m compression
|
|
154
|
+
compress_v=True, # enable v compression
|
|
155
|
+
null_pct=0.10, # CoState null threshold percentile
|
|
156
|
+
amp_pct=0.90, # CoState amplitude threshold percentile
|
|
157
|
+
error_feedback=False, # CoState error feedback (tested, no improvement)
|
|
158
|
+
capturable=False, # CUDA graph capture (not yet supported)
|
|
159
|
+
min_m_compress_elements=4096, # minimum param size for CoState m compression
|
|
160
|
+
)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
All arguments are standard PyTorch Optimizer kwargs plus TurboAdam-specific compression controls. State dicts are fully compatible with `torch.save` / `torch.load`.
|
|
164
|
+
|
|
165
|
+
**Notes:**
|
|
166
|
+
- `torch.compile` will graph-break at `opt.step()` (expected for Python-loop optimizers; does not affect correctness).
|
|
167
|
+
- FSDP / DeepSpeed ZeRO compatibility is on the [roadmap](ROADMAP.md) for v0.2.0.
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
## Validation
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
# Full test suite (151 tests)
|
|
175
|
+
python -m pytest tests/ -q
|
|
176
|
+
|
|
177
|
+
# Quick convergence smoke test
|
|
178
|
+
python -c "
|
|
179
|
+
import torch, torch.nn as nn
|
|
180
|
+
from turboadam import TurboAdam
|
|
181
|
+
|
|
182
|
+
torch.manual_seed(0)
|
|
183
|
+
x = nn.Parameter(torch.randn(50, device='cuda'))
|
|
184
|
+
opt = TurboAdam([x], lr=1e-2)
|
|
185
|
+
for _ in range(200):
|
|
186
|
+
opt.zero_grad()
|
|
187
|
+
loss = (x**2).sum()
|
|
188
|
+
loss.backward()
|
|
189
|
+
opt.step()
|
|
190
|
+
print(f'Final loss: {loss.item():.6f}') # < 5% of initial
|
|
191
|
+
"
|
|
192
|
+
|
|
193
|
+
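# GPT-2 124M training run (~36 min on RTX 4070). This and the commands below need a repository checkout: experiments/ and scripts/ are not shipped in the sdist.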
|
|
194
|
+
python experiments/train_turboadam.py --steps 500 --log_every 50
|
|
195
|
+
|
|
196
|
+
# Speed benchmark
|
|
197
|
+
python scripts/benchmark_speed.py
|
|
198
|
+
|
|
199
|
+
# Memory profiler
|
|
200
|
+
python scripts/profile_memory.py
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Design decisions
|
|
206
|
+
|
|
207
|
+
1. **Compress-every-step (not freeze-refresh).** The original design froze v for 1000 steps and refreshed periodically. This caused a +3.75 loss gap from v staleness. Compress-every-step with stochastic rounding eliminates staleness — the EMA runs continuously on the compressed state.
|
|
208
|
+
|
|
209
|
+
2. **4-bit default.** 4-bit gives 6.5× compression with +0.25 gap. 8-bit gives 4.1× with +0.51. The sweet spot is 4-bit — going higher barely improves precision, going lower risks noise accumulation.
|
|
210
|
+
|
|
211
|
+
3. **Stochastic rounding.** Unbiased rounding prevents systematic drift in the EMA. Without it, deterministic rounding accumulates a bias of ~1000× the per-step error (for β₂=0.999).
|
|
212
|
+
|
|
213
|
+
4. **Sign-only for CoState (not low-rank).** We tested LoRA-Pre style low-rank projection (rank 8–512). It fails for Adam because momentum is NOT low-rank — rank-8 captures only 4% of energy. Sign-only encoding captures direction for ALL elements, which is what Adam's per-coordinate denominator normalization needs.
|
|
214
|
+
|
|
215
|
+
5. **P10/P90 thresholds.** Extensive testing showed threshold changes (P5/P85, P5/P80, P10/P95, etc.) produce identical convergence. The gap is structural to sign encoding, not the null/phase/amplitude split.
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Project status
|
|
220
|
+
|
|
221
|
+
- **Phase 1** (current): RTX 4070 8GB, models ≤ 125M — **complete**. Correctness validated, speed optimized, Triton kernels production-ready.
|
|
222
|
+
- **Phase 2** (next): DGX Spark 128GB, models up to 7B — pending hardware.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Citation
|
|
227
|
+
|
|
228
|
+
```bibtex
|
|
229
|
+
@misc{kogan2026turboadam,
|
|
230
|
+
title={TurboAdam: Memory-Efficient Adam via In-Place Optimizer State Compression},
|
|
231
|
+
author={Kogan, David},
|
|
232
|
+
year={2026},
|
|
233
|
+
howpublished={\url{https://github.com/davidkny22/turboadam}}
|
|
234
|
+
}
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## License
|
|
240
|
+
|
|
241
|
+
MIT
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "turboadam"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Drop-in Adam/AdamW replacement with 6.5× optimizer-state memory reduction. One line change, no model modifications. Compresses both moments in-place during training with bounded per-element error guarantees."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
keywords = ["optimizer", "adam", "adamw", "pytorch", "memory-efficient", "quantization", "deep-learning", "training"]
|
|
11
|
+
authors = [{ name = "David Kogan", email = "davidkny22@gmail.com" }]
|
|
12
|
+
license = { text = "MIT" }
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.10",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Intended Audience :: Science/Research",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"torch>=2.2.0",
|
|
25
|
+
"numpy>=1.26.0",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
triton = ["triton>=2.2.0"]
|
|
30
|
+
dev = [
|
|
31
|
+
"pytest>=8.0.0",
|
|
32
|
+
"matplotlib>=3.8.0",
|
|
33
|
+
"datasets>=2.18.0",
|
|
34
|
+
"transformers>=4.40.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/davidkny22/turboadam"
|
|
39
|
+
Repository = "https://github.com/davidkny22/turboadam"
|
|
40
|
+
Issues = "https://github.com/davidkny22/turboadam/issues"
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.packages.find]
|
|
43
|
+
where = ["src"]
|
|
44
|
+
|
|
45
|
+
[tool.pytest.ini_options]
|
|
46
|
+
testpaths = ["tests"]
|