turboadam 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 David Kogan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,272 @@
1
+ Metadata-Version: 2.4
2
+ Name: turboadam
3
+ Version: 0.1.0
4
+ Summary: Drop-in Adam/AdamW replacement with 6.5× optimizer-state memory reduction. One line change, no model modifications. Compresses both moments in-place during training with bounded per-element error guarantees.
5
+ Author-email: David Kogan <davidkny22@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/davidkny22/turboadam
8
+ Project-URL: Repository, https://github.com/davidkny22/turboadam
9
+ Project-URL: Issues, https://github.com/davidkny22/turboadam/issues
10
+ Keywords: optimizer,adam,adamw,pytorch,memory-efficient,quantization,deep-learning,training
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: torch>=2.2.0
22
+ Requires-Dist: numpy>=1.26.0
23
+ Provides-Extra: triton
24
+ Requires-Dist: triton>=2.2.0; extra == "triton"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
27
+ Requires-Dist: matplotlib>=3.8.0; extra == "dev"
28
+ Requires-Dist: datasets>=2.18.0; extra == "dev"
29
+ Requires-Dist: transformers>=4.40.0; extra == "dev"
30
+ Dynamic: license-file
31
+
32
+ # TurboAdam
33
+
34
+ [![Tests](https://img.shields.io/badge/tests-151%2F151-brightgreen)]() [![Python](https://img.shields.io/badge/python-3.10%2B-blue)]() [![PyTorch](https://img.shields.io/badge/PyTorch-2.2%2B-orange)]() [![License](https://img.shields.io/badge/license-MIT-green)]()
35
+
36
+ **Drop-in Adam/AdamW replacement with 6.5× optimizer-state memory reduction.**
37
+
38
+ One line change. No model modifications. No training-loop changes.
39
+
40
+ ```python
41
+ from turboadam import TurboAdam
42
+
43
+ optimizer = TurboAdam(model.parameters(), lr=1e-3)
44
+ ```
45
+
46
+ ---
47
+
48
+ ## Why TurboAdam?
49
+
50
+ Adam stores two full-precision (fp32) values for every parameter (first and second moments) — 64 bits/param. For a 7B model that is **56 GB** of optimizer state alone — often the memory bottleneck that forces smaller batch sizes or shorter context lengths.
51
+
52
+ TurboAdam compresses both moments in-place during training, cutting optimizer-state memory from **64 bits/param → 9.9 bits/param** (6.5× reduction). On GPT-2 124M it converges within **+0.25 loss points** of full-precision AdamW (**1.2% relative** — within run-to-run noise).
53
+
54
+ | Model size | AdamW optimizer state | TurboAdam | Savings |
55
+ |-----------|----------------------|-----------|---------|
56
+ | 125M (GPT-2) | 1.00 GB | 0.15 GB | **0.85 GB** |
57
+ | 7B | 56.0 GB | **8.7 GB** | **47.3 GB** |
58
+ | 70B | 560.0 GB | **86.6 GB** | **473.4 GB** |
59
+
60
+ ---
61
+
62
+ ## Quick start
63
+
64
+ ### Install
65
+
66
+ ```bash
67
+ pip install turboadam
68
+ ```
69
+
70
+ For the latest source version:
71
+
72
+ ```bash
73
+ pip install git+https://github.com/davidkny22/turboadam.git
74
+ ```
75
+
76
+ Requirements: Python >=3.10, PyTorch >=2.2, NumPy >=1.26. Triton >=2.2 is optional (for CUDA speed-ups); install it with `pip install "turboadam[triton]"`.
77
+
78
+ ### Use
79
+
80
+ ```python
81
+ from turboadam import TurboAdam
82
+
83
+ # Drop-in replacement for torch.optim.AdamW
84
+ optimizer = TurboAdam(
85
+ model.parameters(),
86
+ lr=6e-4,
87
+ betas=(0.9, 0.999),
88
+ weight_decay=0.01,
89
+ v_bits=4, # 2, 3, 4, 6, or 8
90
+ compress_m=True, # CoState first-moment compression
91
+ compress_v=True, # Log-scale second-moment compression
92
+ )
93
+ ```
94
+
95
+ ---
96
+
97
+ ## How it works
98
+
99
+ TurboAdam combines two independent, separable compression techniques. You can enable either or both.
100
+
101
+ ### 1Q — Second-moment (v) compression
102
+
103
+ v is stored as n-bit **log-scale quantized** values per 128-element block:
104
+
105
+ 1. **Decompress** block min/max → reconstruct v via exp interpolation
106
+ 2. **EMA update**: `v_new = β₂·v_old + (1-β₂)·g²`
107
+ 3. **Bias-correct** denominator: `denom = √(v / (1-β₂ᵗ)) + ε`
108
+ 4. **Re-compress** with **stochastic rounding** (unbiased — prevents systematic EMA drift)
109
+
110
+ Storage: `n_bits` bits per element (indices held in uint8 storage) + 2× fp16 scales (block min/max) per 128-element block.
111
+ Default **4-bit** = **4.25 bits/param**.
112
+
113
+ **Key insight:** Theoretical analysis predicted 4-bit would fail due to accumulated quantization noise (22× amplification from β₂=0.999 EMA). In practice it works because quantization errors are correlated — same elements map to the same buckets step-to-step.
114
+
115
+ ### CoState — First-moment (m) compression
116
+
117
+ Gradient-residual decomposition: `m = α·g + δ`
118
+
119
+ - `α = (m·g) / (g·g)` — scalar projection onto current gradient
120
+ - `δ = m - α·g` — residual orthogonal to gradient
121
+
122
+ δ is partitioned into 128-element blocks and classified into three **costates**:
123
+
124
+ | Costate | Condition | Storage | Typical share |
125
+ |---------|-----------|---------|---------------|
126
+ | **Null** | `r < P₁₀` | 1-bit flag | ~10% |
127
+ | **Phase** | `P₁₀ ≤ r < P₉₀` | 1-bit sign per element | ~80% |
128
+ | **Amplitude** | `r ≥ P₉₀` | 1-bit sign + fp16 block scale | ~10% |
129
+
130
+ **Key insight:** For Adam, direction matters more than magnitude because `m/√v` normalizes per-element. Sign-only encoding preserves direction for 80% of components. This is why CoState works at ~2 bits/param while low-rank approaches fail — they preserve magnitude for few directions but lose direction for many.
131
+
132
+ ---
133
+
134
+ ## Results
135
+
136
+ ### Memory
137
+
138
+ Measured on one GPT-2 layer (9 parameter tensors, CUDA).
139
+
140
+ | Configuration | Persistent optimizer memory | vs AdamW |
141
+ |--------------|----------------------------|----------|
142
+ | AdamW (baseline) | 56.6 MB | 1.00× |
143
+ | TurboAdam (v only, 4-bit) | 35.6 MB | **0.63×** |
144
+ | TurboAdam (m only, CoState) | 29.6 MB | **0.52×** |
145
+ | **TurboAdam (m + v, default)** | **8.6 MB** | **0.15×** |
146
+
147
+ ### Speed
148
+
149
+ Measured on one GPT-2 layer, RTX 4070, 200-step average.
150
+
151
+ | Configuration | Time/step | vs AdamW |
152
+ |--------------|-----------|----------|
153
+ | AdamW (baseline) | 12.0 ms | 1.00× |
154
+ | TurboAdam (v only) | 8.4 ms | **0.70×** |
155
+ | TurboAdam (m + v, default) | 17.0 ms | **1.41×** |
156
+
157
+ The v-only path is actually **faster** than AdamW because 4-bit log-scale decompression is cheaper than full fp32 EMA updates on small tensors. The m+v path adds ~40% overhead from CoState encode/decode.
158
+
159
+ ### Convergence — GPT-2 124M on WikiText-103
160
+
161
+ | Configuration | Loss @ step 500 | Gap vs AdamW |
162
+ |--------------|-----------------|--------------|
163
+ | AdamW (full fp32) | 19.28 | — |
164
+ | TurboAdam (8-bit v + CoState) | 19.79 | +0.51 |
165
+ | **TurboAdam (4-bit v + CoState, default)** | **19.58** | **+0.25** |
166
+ | TurboAdam (CoState only, fp32 v) | 19.80 | +0.52 |
167
+ | TurboAdam (v only, fp32 m) | 19.28 | ~0.00 |
168
+
169
+ The +0.25 gap is structural to CoState's sign-only encoding and shrinks as training progresses (+2.94 at step 50, +0.25 at step 500). Threshold tuning and error feedback do not reduce it. For workloads where every tenth of a point matters, run with `compress_m=False` for v-only compression at zero convergence cost.
170
+
171
+ ---
172
+
173
+ ## API
174
+
175
+ ```python
176
+ TurboAdam(
177
+ params, # iterable of parameters or param groups
178
+ lr=1e-3, # learning rate
179
+ betas=(0.9, 0.999), # (β₁, β₂) EMA decay coefficients
180
+ eps=1e-8, # numerical stability
181
+ weight_decay=0.0, # AdamW-style decoupled weight decay
182
+ block_size=128, # quantization block size (elements)
183
+ v_bits=4, # bits per element for v: 2, 3, 4, 6, or 8
184
+ compress_m=True, # enable CoState m compression
185
+ compress_v=True, # enable v compression
186
+ null_pct=0.10, # CoState null threshold percentile
187
+ amp_pct=0.90, # CoState amplitude threshold percentile
188
+ error_feedback=False, # CoState error feedback (tested, no improvement)
189
+ capturable=False, # CUDA graph capture (not yet supported)
190
+ min_m_compress_elements=4096, # minimum param size for CoState m compression
191
+ )
192
+ ```
193
+
194
+ All arguments are standard PyTorch Optimizer kwargs plus TurboAdam-specific compression controls. State dicts are fully compatible with `torch.save` / `torch.load`.
195
+
196
+ **Notes:**
197
+ - `torch.compile` will graph-break at `opt.step()` (expected for Python-loop optimizers; does not affect correctness).
198
+ - FSDP / DeepSpeed ZeRO compatibility is on the [roadmap](ROADMAP.md) for v0.2.0.
199
+
200
+ ---
201
+
202
+ ## Validation
203
+
204
+ ```bash
205
+ # Full test suite (151 tests)
206
+ python -m pytest tests/ -q
207
+
208
+ # Quick convergence smoke test
209
+ python -c "
210
+ import torch, torch.nn as nn
211
+ from turboadam import TurboAdam
212
+
213
+ torch.manual_seed(0)
214
+ x = nn.Parameter(torch.randn(50, device='cuda'))
215
+ opt = TurboAdam([x], lr=1e-2)
216
+ for _ in range(200):
217
+ opt.zero_grad()
218
+ loss = (x**2).sum()
219
+ loss.backward()
220
+ opt.step()
221
+ print(f'Final loss: {loss.item():.6f}') # < 5% of initial
222
+ "
223
+
224
+ # GPT-2 124M training run (~36 min on RTX 4070)
225
+ python experiments/train_turboadam.py --steps 500 --log_every 50
226
+
227
+ # Speed benchmark
228
+ python scripts/benchmark_speed.py
229
+
230
+ # Memory profiler
231
+ python scripts/profile_memory.py
232
+ ```
233
+
234
+ ---
235
+
236
+ ## Design decisions
237
+
238
+ 1. **Compress-every-step (not freeze-refresh).** The original design froze v for 1000 steps and refreshed periodically. This caused a +3.75 loss gap from v staleness. Compress-every-step with stochastic rounding eliminates staleness — the EMA runs continuously on the compressed state.
239
+
240
+ 2. **4-bit default.** 4-bit gives 6.5× compression with +0.25 gap. 8-bit gives 4.1× with +0.51. The sweet spot is 4-bit — going higher barely improves precision, going lower risks noise accumulation.
241
+
242
+ 3. **Stochastic rounding.** Unbiased rounding prevents systematic drift in the EMA. Without it, deterministic rounding accumulates a bias of ~1000× the per-step error (for β₂=0.999).
243
+
244
+ 4. **Sign-only for CoState (not low-rank).** We tested LoRA-Pre style low-rank projection (rank 8–512). It fails for Adam because momentum is NOT low-rank — rank-8 captures only 4% of energy. Sign-only encoding captures direction for ALL elements, which is what Adam's per-coordinate denominator normalization needs.
245
+
246
+ 5. **P10/P90 thresholds.** Extensive testing showed threshold changes (P5/P85, P5/P80, P10/P95, etc.) produce identical convergence. The gap is structural to sign encoding, not the null/phase/amplitude split.
247
+
248
+ ---
249
+
250
+ ## Project status
251
+
252
+ - **Phase 1** (current): RTX 4070 8GB, models ≤ 125M — **complete**. Correctness validated, speed optimized, Triton kernels production-ready.
253
+ - **Phase 2** (next): DGX Spark 128GB, models up to 7B — pending hardware.
254
+
255
+ ---
256
+
257
+ ## Citation
258
+
259
+ ```bibtex
260
+ @misc{kogan2026turboadam,
261
+ title={TurboAdam: Memory-Efficient Adam via In-Place Optimizer State Compression},
262
+ author={Kogan, David},
263
+ year={2026},
264
+ howpublished={\url{https://github.com/davidkny22/turboadam}}
265
+ }
266
+ ```
267
+
268
+ ---
269
+
270
+ ## License
271
+
272
+ MIT
@@ -0,0 +1,241 @@
1
+ # TurboAdam
2
+
3
+ [![Tests](https://img.shields.io/badge/tests-151%2F151-brightgreen)]() [![Python](https://img.shields.io/badge/python-3.10%2B-blue)]() [![PyTorch](https://img.shields.io/badge/PyTorch-2.2%2B-orange)]() [![License](https://img.shields.io/badge/license-MIT-green)]()
4
+
5
+ **Drop-in Adam/AdamW replacement with 6.5× optimizer-state memory reduction.**
6
+
7
+ One line change. No model modifications. No training-loop changes.
8
+
9
+ ```python
10
+ from turboadam import TurboAdam
11
+
12
+ optimizer = TurboAdam(model.parameters(), lr=1e-3)
13
+ ```
14
+
15
+ ---
16
+
17
+ ## Why TurboAdam?
18
+
19
+ Adam stores two full-precision (fp32) values for every parameter (first and second moments) — 64 bits/param. For a 7B model that is **56 GB** of optimizer state alone — often the memory bottleneck that forces smaller batch sizes or shorter context lengths.
20
+
21
+ TurboAdam compresses both moments in-place during training, cutting optimizer-state memory from **64 bits/param → 9.9 bits/param** (6.5× reduction). On GPT-2 124M it converges within **+0.25 loss points** of full-precision AdamW (**1.2% relative** — within run-to-run noise).
22
+
23
+ | Model size | AdamW optimizer state | TurboAdam | Savings |
24
+ |-----------|----------------------|-----------|---------|
25
+ | 125M (GPT-2) | 1.00 GB | 0.15 GB | **0.85 GB** |
26
+ | 7B | 56.0 GB | **8.7 GB** | **47.3 GB** |
27
+ | 70B | 560.0 GB | **86.6 GB** | **473.4 GB** |
28
+
29
+ ---
30
+
31
+ ## Quick start
32
+
33
+ ### Install
34
+
35
+ ```bash
36
+ pip install turboadam
37
+ ```
38
+
39
+ For the latest source version:
40
+
41
+ ```bash
42
+ pip install git+https://github.com/davidkny22/turboadam.git
43
+ ```
44
+
45
+ Requirements: Python >=3.10, PyTorch >=2.2, NumPy >=1.26. Triton >=2.2 is optional (for CUDA speed-ups); install it with `pip install "turboadam[triton]"`.
46
+
47
+ ### Use
48
+
49
+ ```python
50
+ from turboadam import TurboAdam
51
+
52
+ # Drop-in replacement for torch.optim.AdamW
53
+ optimizer = TurboAdam(
54
+ model.parameters(),
55
+ lr=6e-4,
56
+ betas=(0.9, 0.999),
57
+ weight_decay=0.01,
58
+ v_bits=4, # 2, 3, 4, 6, or 8
59
+ compress_m=True, # CoState first-moment compression
60
+ compress_v=True, # Log-scale second-moment compression
61
+ )
62
+ ```
63
+
64
+ ---
65
+
66
+ ## How it works
67
+
68
+ TurboAdam combines two independent, separable compression techniques. You can enable either or both.
69
+
70
+ ### 1Q — Second-moment (v) compression
71
+
72
+ v is stored as n-bit **log-scale quantized** values per 128-element block:
73
+
74
+ 1. **Decompress** block min/max → reconstruct v via exp interpolation
75
+ 2. **EMA update**: `v_new = β₂·v_old + (1-β₂)·g²`
76
+ 3. **Bias-correct** denominator: `denom = √(v / (1-β₂ᵗ)) + ε`
77
+ 4. **Re-compress** with **stochastic rounding** (unbiased — prevents systematic EMA drift)
78
+
79
+ Storage: `n_bits` bits per element (indices held in uint8 storage) + 2× fp16 scales (block min/max) per 128-element block.
80
+ Default **4-bit** = **4.25 bits/param**.
81
+
82
+ **Key insight:** Theoretical analysis predicted 4-bit would fail due to accumulated quantization noise (22× amplification from β₂=0.999 EMA). In practice it works because quantization errors are correlated — same elements map to the same buckets step-to-step.
83
+
84
+ ### CoState — First-moment (m) compression
85
+
86
+ Gradient-residual decomposition: `m = α·g + δ`
87
+
88
+ - `α = (m·g) / (g·g)` — scalar projection onto current gradient
89
+ - `δ = m - α·g` — residual orthogonal to gradient
90
+
91
+ δ is partitioned into 128-element blocks and classified into three **costates**:
92
+
93
+ | Costate | Condition | Storage | Typical share |
94
+ |---------|-----------|---------|---------------|
95
+ | **Null** | `r < P₁₀` | 1-bit flag | ~10% |
96
+ | **Phase** | `P₁₀ ≤ r < P₉₀` | 1-bit sign per element | ~80% |
97
+ | **Amplitude** | `r ≥ P₉₀` | 1-bit sign + fp16 block scale | ~10% |
98
+
99
+ **Key insight:** For Adam, direction matters more than magnitude because `m/√v` normalizes per-element. Sign-only encoding preserves direction for 80% of components. This is why CoState works at ~2 bits/param while low-rank approaches fail — they preserve magnitude for few directions but lose direction for many.
100
+
101
+ ---
102
+
103
+ ## Results
104
+
105
+ ### Memory
106
+
107
+ Measured on one GPT-2 layer (9 parameter tensors, CUDA).
108
+
109
+ | Configuration | Persistent optimizer memory | vs AdamW |
110
+ |--------------|----------------------------|----------|
111
+ | AdamW (baseline) | 56.6 MB | 1.00× |
112
+ | TurboAdam (v only, 4-bit) | 35.6 MB | **0.63×** |
113
+ | TurboAdam (m only, CoState) | 29.6 MB | **0.52×** |
114
+ | **TurboAdam (m + v, default)** | **8.6 MB** | **0.15×** |
115
+
116
+ ### Speed
117
+
118
+ Measured on one GPT-2 layer, RTX 4070, 200-step average.
119
+
120
+ | Configuration | Time/step | vs AdamW |
121
+ |--------------|-----------|----------|
122
+ | AdamW (baseline) | 12.0 ms | 1.00× |
123
+ | TurboAdam (v only) | 8.4 ms | **0.70×** |
124
+ | TurboAdam (m + v, default) | 17.0 ms | **1.41×** |
125
+
126
+ The v-only path is actually **faster** than AdamW because 4-bit log-scale decompression is cheaper than full fp32 EMA updates on small tensors. The m+v path adds ~40% overhead from CoState encode/decode.
127
+
128
+ ### Convergence — GPT-2 124M on WikiText-103
129
+
130
+ | Configuration | Loss @ step 500 | Gap vs AdamW |
131
+ |--------------|-----------------|--------------|
132
+ | AdamW (full fp32) | 19.28 | — |
133
+ | TurboAdam (8-bit v + CoState) | 19.79 | +0.51 |
134
+ | **TurboAdam (4-bit v + CoState, default)** | **19.58** | **+0.25** |
135
+ | TurboAdam (CoState only, fp32 v) | 19.80 | +0.52 |
136
+ | TurboAdam (v only, fp32 m) | 19.28 | ~0.00 |
137
+
138
+ The +0.25 gap is structural to CoState's sign-only encoding and shrinks as training progresses (+2.94 at step 50, +0.25 at step 500). Threshold tuning and error feedback do not reduce it. For workloads where every tenth of a point matters, run with `compress_m=False` for v-only compression at zero convergence cost.
139
+
140
+ ---
141
+
142
+ ## API
143
+
144
+ ```python
145
+ TurboAdam(
146
+ params, # iterable of parameters or param groups
147
+ lr=1e-3, # learning rate
148
+ betas=(0.9, 0.999), # (β₁, β₂) EMA decay coefficients
149
+ eps=1e-8, # numerical stability
150
+ weight_decay=0.0, # AdamW-style decoupled weight decay
151
+ block_size=128, # quantization block size (elements)
152
+ v_bits=4, # bits per element for v: 2, 3, 4, 6, or 8
153
+ compress_m=True, # enable CoState m compression
154
+ compress_v=True, # enable v compression
155
+ null_pct=0.10, # CoState null threshold percentile
156
+ amp_pct=0.90, # CoState amplitude threshold percentile
157
+ error_feedback=False, # CoState error feedback (tested, no improvement)
158
+ capturable=False, # CUDA graph capture (not yet supported)
159
+ min_m_compress_elements=4096, # minimum param size for CoState m compression
160
+ )
161
+ ```
162
+
163
+ All arguments are standard PyTorch Optimizer kwargs plus TurboAdam-specific compression controls. State dicts are fully compatible with `torch.save` / `torch.load`.
164
+
165
+ **Notes:**
166
+ - `torch.compile` will graph-break at `opt.step()` (expected for Python-loop optimizers; does not affect correctness).
167
+ - FSDP / DeepSpeed ZeRO compatibility is on the [roadmap](ROADMAP.md) for v0.2.0.
168
+
169
+ ---
170
+
171
+ ## Validation
172
+
173
+ ```bash
174
+ # Full test suite (151 tests)
175
+ python -m pytest tests/ -q
176
+
177
+ # Quick convergence smoke test
178
+ python -c "
179
+ import torch, torch.nn as nn
180
+ from turboadam import TurboAdam
181
+
182
+ torch.manual_seed(0)
183
+ x = nn.Parameter(torch.randn(50, device='cuda'))
184
+ opt = TurboAdam([x], lr=1e-2)
185
+ for _ in range(200):
186
+ opt.zero_grad()
187
+ loss = (x**2).sum()
188
+ loss.backward()
189
+ opt.step()
190
+ print(f'Final loss: {loss.item():.6f}') # < 5% of initial
191
+ "
192
+
193
+ # GPT-2 124M training run (~36 min on RTX 4070)
194
+ python experiments/train_turboadam.py --steps 500 --log_every 50
195
+
196
+ # Speed benchmark
197
+ python scripts/benchmark_speed.py
198
+
199
+ # Memory profiler
200
+ python scripts/profile_memory.py
201
+ ```
202
+
203
+ ---
204
+
205
+ ## Design decisions
206
+
207
+ 1. **Compress-every-step (not freeze-refresh).** The original design froze v for 1000 steps and refreshed periodically. This caused a +3.75 loss gap from v staleness. Compress-every-step with stochastic rounding eliminates staleness — the EMA runs continuously on the compressed state.
208
+
209
+ 2. **4-bit default.** 4-bit gives 6.5× compression with +0.25 gap. 8-bit gives 4.1× with +0.51. The sweet spot is 4-bit — going higher barely improves precision, going lower risks noise accumulation.
210
+
211
+ 3. **Stochastic rounding.** Unbiased rounding prevents systematic drift in the EMA. Without it, deterministic rounding accumulates a bias of ~1000× the per-step error (for β₂=0.999).
212
+
213
+ 4. **Sign-only for CoState (not low-rank).** We tested LoRA-Pre style low-rank projection (rank 8–512). It fails for Adam because momentum is NOT low-rank — rank-8 captures only 4% of energy. Sign-only encoding captures direction for ALL elements, which is what Adam's per-coordinate denominator normalization needs.
214
+
215
+ 5. **P10/P90 thresholds.** Extensive testing showed threshold changes (P5/P85, P5/P80, P10/P95, etc.) produce identical convergence. The gap is structural to sign encoding, not the null/phase/amplitude split.
216
+
217
+ ---
218
+
219
+ ## Project status
220
+
221
+ - **Phase 1** (current): RTX 4070 8GB, models ≤ 125M — **complete**. Correctness validated, speed optimized, Triton kernels production-ready.
222
+ - **Phase 2** (next): DGX Spark 128GB, models up to 7B — pending hardware.
223
+
224
+ ---
225
+
226
+ ## Citation
227
+
228
+ ```bibtex
229
+ @misc{kogan2026turboadam,
230
+ title={TurboAdam: Memory-Efficient Adam via In-Place Optimizer State Compression},
231
+ author={Kogan, David},
232
+ year={2026},
233
+ howpublished={\url{https://github.com/davidkny22/turboadam}}
234
+ }
235
+ ```
236
+
237
+ ---
238
+
239
+ ## License
240
+
241
+ MIT
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "turboadam"
7
+ version = "0.1.0"
8
+ description = "Drop-in Adam/AdamW replacement with 6.5× optimizer-state memory reduction. One line change, no model modifications. Compresses both moments in-place during training with bounded per-element error guarantees."
9
+ readme = "README.md"
10
+ keywords = ["optimizer", "adam", "adamw", "pytorch", "memory-efficient", "quantization", "deep-learning", "training"]
11
+ authors = [{ name = "David Kogan", email = "davidkny22@gmail.com" }]
12
+ license = { text = "MIT" }
13
+ requires-python = ">=3.10"
14
+ classifiers = [
15
+ "License :: OSI Approved :: MIT License",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Intended Audience :: Science/Research",
21
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
+ ]
23
+ dependencies = [
24
+ "torch>=2.2.0",
25
+ "numpy>=1.26.0",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ triton = ["triton>=2.2.0"]
30
+ dev = [
31
+ "pytest>=8.0.0",
32
+ "matplotlib>=3.8.0",
33
+ "datasets>=2.18.0",
34
+ "transformers>=4.40.0",
35
+ ]
36
+
37
+ [project.urls]
38
+ Homepage = "https://github.com/davidkny22/turboadam"
39
+ Repository = "https://github.com/davidkny22/turboadam"
40
+ Issues = "https://github.com/davidkny22/turboadam/issues"
41
+
42
+ [tool.setuptools.packages.find]
43
+ where = ["src"]
44
+
45
+ [tool.pytest.ini_options]
46
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,10 @@
1
+ from importlib.metadata import version, PackageNotFoundError
2
+
3
+ try:
4
+ __version__ = version("turboadam")
5
+ except PackageNotFoundError:
6
+ __version__ = "unknown"
7
+
8
+ from turboadam.optimizer import TurboAdam
9
+
10
+ __all__ = ["TurboAdam"]