overflowml 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- overflowml-0.1.0/.gitignore +6 -0
- overflowml-0.1.0/LICENSE +21 -0
- overflowml-0.1.0/PKG-INFO +224 -0
- overflowml-0.1.0/README.md +189 -0
- overflowml-0.1.0/examples/any_diffusers_pipeline.py +49 -0
- overflowml-0.1.0/examples/qwen_image_edit.py +59 -0
- overflowml-0.1.0/overflowml/__init__.py +16 -0
- overflowml-0.1.0/overflowml/cli.py +75 -0
- overflowml-0.1.0/overflowml/detect.py +191 -0
- overflowml-0.1.0/overflowml/optimize.py +264 -0
- overflowml-0.1.0/overflowml/strategy.py +205 -0
- overflowml-0.1.0/pyproject.toml +41 -0
- overflowml-0.1.0/tests/__init__.py +0 -0
- overflowml-0.1.0/tests/test_strategy.py +145 -0
overflowml-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 HybridLab AI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: overflowml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Run AI models larger than your GPU. Auto-detects hardware and applies optimal memory strategy.
|
|
5
|
+
Project-URL: Homepage, https://github.com/hybridlab-ai/overflowml
|
|
6
|
+
Author: HybridLab AI
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: gpu,inference,machine-learning,offload,optimization,vram
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Requires-Dist: psutil>=5.9
|
|
17
|
+
Requires-Dist: torch>=2.0
|
|
18
|
+
Provides-Extra: all
|
|
19
|
+
Requires-Dist: accelerate; extra == 'all'
|
|
20
|
+
Requires-Dist: diffusers>=0.30; extra == 'all'
|
|
21
|
+
Requires-Dist: torchao>=0.5; extra == 'all'
|
|
22
|
+
Requires-Dist: transformers; extra == 'all'
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest; extra == 'dev'
|
|
25
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
26
|
+
Provides-Extra: diffusers
|
|
27
|
+
Requires-Dist: accelerate; extra == 'diffusers'
|
|
28
|
+
Requires-Dist: diffusers>=0.30; extra == 'diffusers'
|
|
29
|
+
Requires-Dist: transformers; extra == 'diffusers'
|
|
30
|
+
Provides-Extra: mlx
|
|
31
|
+
Requires-Dist: mlx>=0.20; extra == 'mlx'
|
|
32
|
+
Provides-Extra: quantize
|
|
33
|
+
Requires-Dist: torchao>=0.5; extra == 'quantize'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# OverflowML
|
|
37
|
+
|
|
38
|
+
**Run AI models larger than your GPU.** One line of code.
|
|
39
|
+
|
|
40
|
+
OverflowML auto-detects your hardware (NVIDIA, Apple Silicon, AMD, CPU) and applies the optimal memory strategy to load and run models that don't fit in VRAM. No manual configuration needed.
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import overflowml
|
|
44
|
+
|
|
45
|
+
pipe = load_your_model() # 40GB model, 24GB GPU? No problem.
|
|
46
|
+
overflowml.optimize_pipeline(pipe, model_size_gb=40)
|
|
47
|
+
result = pipe(prompt) # Just works.
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## The Problem
|
|
51
|
+
|
|
52
|
+
AI models are getting bigger. A single image generation model can be 40GB+. LLMs regularly hit 70GB-400GB. But most GPUs have 8-24GB of VRAM.
|
|
53
|
+
|
|
54
|
+
The current solutions are painful:
|
|
55
|
+
- **Manual offloading** — you need to know which PyTorch function to call, which flags work together, and which combinations crash
|
|
56
|
+
- **Quantization footguns** — FP8 is incompatible with CPU offload on Windows. Attention slicing crashes with sequential offload. INT4 needs specific libraries.
|
|
57
|
+
- **Trial and error** — every hardware/model/framework combo has different gotchas
|
|
58
|
+
|
|
59
|
+
OverflowML handles all of this automatically.
|
|
60
|
+
|
|
61
|
+
## How It Works
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
Model: 40GB (BF16) Your GPU: 24GB VRAM
|
|
65
|
+
│ │
|
|
66
|
+
OverflowML detects mismatch │
|
|
67
|
+
│ │
|
|
68
|
+
┌────▼────────────────────────────▼────┐
|
|
69
|
+
│ Strategy: Sequential CPU Offload │
|
|
70
|
+
│ Move 1 layer (~1GB) to GPU at a │
|
|
71
|
+
│ time, compute, move back. │
|
|
72
|
+
│ Peak VRAM: ~3GB │
|
|
73
|
+
│ System RAM used: ~40GB │
|
|
74
|
+
│ Speed: 33s/image (RTX 5090) │
|
|
75
|
+
└──────────────────────────────────────┘
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Strategy Decision Tree
|
|
79
|
+
|
|
80
|
+
| Model vs VRAM | Strategy | Peak VRAM | Speed |
|
|
81
|
+
|---------------|----------|-----------|-------|
|
|
82
|
+
| Model fits with 15% headroom | Direct GPU load | Full | Fastest |
|
|
83
|
+
| FP8 model fits | FP8 quantization | ~55% of model | Fast |
|
|
84
|
+
| Components fit individually | Model CPU offload | ~70% of model | Medium |
|
|
85
|
+
| Nothing fits | Sequential CPU offload | ~3GB | Slower but works |
|
|
86
|
+
| Not enough RAM either | INT4 quantization + sequential | ~3GB | Slowest |
|
|
87
|
+
|
|
88
|
+
### Apple Silicon (Unified Memory)
|
|
89
|
+
|
|
90
|
+
On Macs, CPU and GPU share the same memory pool — there's nothing to "offload." OverflowML detects this and skips offloading entirely. If the model fits in ~75% of your RAM, it loads directly. If not, quantization is recommended.
|
|
91
|
+
|
|
92
|
+
| Mac | Unified Memory | Largest Model (4-bit) |
|
|
93
|
+
|-----|---------------|----------------------|
|
|
94
|
+
| M4 Max | 128GB | ~80B params |
|
|
95
|
+
| M3/M4 Ultra | 192GB | ~120B params |
|
|
96
|
+
| M3 Ultra | 512GB | 670B params |
|
|
97
|
+
|
|
98
|
+
## Installation
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install overflowml
|
|
102
|
+
|
|
103
|
+
# With diffusers support:
|
|
104
|
+
pip install overflowml[diffusers]
|
|
105
|
+
|
|
106
|
+
# With quantization:
|
|
107
|
+
pip install overflowml[all]
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Usage
|
|
111
|
+
|
|
112
|
+
### Diffusers Pipeline (Recommended)
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import torch
|
|
116
|
+
import overflowml
|
|
117
|
+
from diffusers import FluxPipeline
|
|
118
|
+
|
|
119
|
+
pipe = FluxPipeline.from_pretrained(
|
|
120
|
+
"black-forest-labs/FLUX.1-dev",
|
|
121
|
+
torch_dtype=torch.bfloat16,
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
# One line — auto-detects hardware, picks optimal strategy
|
|
125
|
+
strategy = overflowml.optimize_pipeline(pipe, model_size_gb=24)
|
|
126
|
+
print(strategy.summary())
|
|
127
|
+
|
|
128
|
+
result = pipe("a sunset over mountains", num_inference_steps=20)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Batch Generation with Memory Guard
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from overflowml import MemoryGuard
|
|
135
|
+
|
|
136
|
+
guard = MemoryGuard(threshold=0.7) # cleanup at 70% VRAM usage
|
|
137
|
+
|
|
138
|
+
for i, prompt in enumerate(prompts):
    with guard:  # auto-cleans VRAM between iterations
        result = pipe(prompt)
        result.images[0].save(f"output_{i}.png")
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### CLI — Hardware Detection
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
$ overflowml detect
|
|
148
|
+
|
|
149
|
+
=== OverflowML Hardware Detection ===
|
|
150
|
+
Accelerator: cuda
|
|
151
|
+
GPU: NVIDIA GeForce RTX 5090 (32GB VRAM)
|
|
152
|
+
System RAM: 194GB
|
|
153
|
+
Overflow capacity: 178GB (total effective: 210GB)
|
|
154
|
+
BF16: yes | FP8: yes
|
|
155
|
+
|
|
156
|
+
$ overflowml plan 40
|
|
157
|
+
|
|
158
|
+
=== Strategy for 40GB model ===
|
|
159
|
+
Offload: sequential_cpu
|
|
160
|
+
Dtype: bfloat16
|
|
161
|
+
GC cleanup: enabled (threshold 70%)
|
|
162
|
+
Estimated peak VRAM: 3.0GB
|
|
163
|
+
→ Sequential offload: 1 layer at a time (~3GB VRAM), model lives in 194GB RAM
|
|
164
|
+
WARNING: FP8 incompatible with CPU offload on Windows
|
|
165
|
+
WARNING: Do NOT enable attention_slicing with sequential offload
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Standalone Model
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
import overflowml
|
|
172
|
+
|
|
173
|
+
model = load_my_transformer()
|
|
174
|
+
strategy = overflowml.optimize_model(model, model_size_gb=14)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Proven Results
|
|
178
|
+
|
|
179
|
+
Built and battle-tested on a real production pipeline:
|
|
180
|
+
|
|
181
|
+
| Metric | Before OverflowML | After |
|
|
182
|
+
|--------|-------------------|-------|
|
|
183
|
+
| Time per step | 530s (VRAM thrashing) | 6.7s |
|
|
184
|
+
| Images generated | 0/30 (crashes) | 30/30 |
|
|
185
|
+
| Total time | Impossible | 16.4 minutes |
|
|
186
|
+
| Peak VRAM | 32GB (thrashing) | 3GB |
|
|
187
|
+
| Reliability | Crashes after 3 images | Zero failures |
|
|
188
|
+
|
|
189
|
+
*40GB model on RTX 5090 (32GB VRAM) + 194GB RAM, sequential offload, Lightning LoRA 4-step*
|
|
190
|
+
|
|
191
|
+
## Known Incompatibilities
|
|
192
|
+
|
|
193
|
+
These are automatically handled by OverflowML's strategy engine:
|
|
194
|
+
|
|
195
|
+
| Combination | Issue | OverflowML Fix |
|
|
196
|
+
|-------------|-------|----------------|
|
|
197
|
+
| FP8 + CPU offload (Windows) | `Float8Tensor` can't move between devices | Skips FP8, uses BF16 |
|
|
198
|
+
| `attention_slicing` + sequential offload | CUDA illegal memory access | Never enables both |
|
|
199
|
+
| `enable_model_cpu_offload` + 40GB transformer | Transformer exceeds VRAM | Uses sequential offload instead |
|
|
200
|
+
| `expandable_segments` on Windows WDDM | Not supported | Gracefully ignored |
|
|
201
|
+
|
|
202
|
+
## Architecture
|
|
203
|
+
|
|
204
|
+
```
|
|
205
|
+
overflowml/
|
|
206
|
+
├── detect.py — Hardware detection (CUDA, MPS, MLX, ROCm, CPU)
|
|
207
|
+
├── strategy.py — Strategy engine (picks optimal offload + quantization)
|
|
208
|
+
├── optimize.py — Applies strategy to pipelines and models
|
|
209
|
+
└── cli.py — Command-line interface
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## Cross-Platform Support
|
|
213
|
+
|
|
214
|
+
| Platform | Accelerator | Status |
|
|
215
|
+
|----------|-------------|--------|
|
|
216
|
+
| Windows + NVIDIA | CUDA | Production-ready |
|
|
217
|
+
| Linux + NVIDIA | CUDA | Production-ready |
|
|
218
|
+
| macOS + Apple Silicon | MPS / MLX | Detection ready, optimization in progress |
|
|
219
|
+
| Linux + AMD | ROCm | Planned |
|
|
220
|
+
| CPU-only | CPU | Fallback always works |
|
|
221
|
+
|
|
222
|
+
## License
|
|
223
|
+
|
|
224
|
+
MIT
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
# OverflowML
|
|
2
|
+
|
|
3
|
+
**Run AI models larger than your GPU.** One line of code.
|
|
4
|
+
|
|
5
|
+
OverflowML auto-detects your hardware (NVIDIA, Apple Silicon, AMD, CPU) and applies the optimal memory strategy to load and run models that don't fit in VRAM. No manual configuration needed.
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
import overflowml
|
|
9
|
+
|
|
10
|
+
pipe = load_your_model() # 40GB model, 24GB GPU? No problem.
|
|
11
|
+
overflowml.optimize_pipeline(pipe, model_size_gb=40)
|
|
12
|
+
result = pipe(prompt) # Just works.
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## The Problem
|
|
16
|
+
|
|
17
|
+
AI models are getting bigger. A single image generation model can be 40GB+. LLMs regularly hit 70GB-400GB. But most GPUs have 8-24GB of VRAM.
|
|
18
|
+
|
|
19
|
+
The current solutions are painful:
|
|
20
|
+
- **Manual offloading** — you need to know which PyTorch function to call, which flags work together, and which combinations crash
|
|
21
|
+
- **Quantization footguns** — FP8 is incompatible with CPU offload on Windows. Attention slicing crashes with sequential offload. INT4 needs specific libraries.
|
|
22
|
+
- **Trial and error** — every hardware/model/framework combo has different gotchas
|
|
23
|
+
|
|
24
|
+
OverflowML handles all of this automatically.
|
|
25
|
+
|
|
26
|
+
## How It Works
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
Model: 40GB (BF16) Your GPU: 24GB VRAM
|
|
30
|
+
│ │
|
|
31
|
+
OverflowML detects mismatch │
|
|
32
|
+
│ │
|
|
33
|
+
┌────▼────────────────────────────▼────┐
|
|
34
|
+
│ Strategy: Sequential CPU Offload │
|
|
35
|
+
│ Move 1 layer (~1GB) to GPU at a │
|
|
36
|
+
│ time, compute, move back. │
|
|
37
|
+
│ Peak VRAM: ~3GB │
|
|
38
|
+
│ System RAM used: ~40GB │
|
|
39
|
+
│ Speed: 33s/image (RTX 5090) │
|
|
40
|
+
└──────────────────────────────────────┘
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Strategy Decision Tree
|
|
44
|
+
|
|
45
|
+
| Model vs VRAM | Strategy | Peak VRAM | Speed |
|
|
46
|
+
|---------------|----------|-----------|-------|
|
|
47
|
+
| Model fits with 15% headroom | Direct GPU load | Full | Fastest |
|
|
48
|
+
| FP8 model fits | FP8 quantization | ~55% of model | Fast |
|
|
49
|
+
| Components fit individually | Model CPU offload | ~70% of model | Medium |
|
|
50
|
+
| Nothing fits | Sequential CPU offload | ~3GB | Slower but works |
|
|
51
|
+
| Not enough RAM either | INT4 quantization + sequential | ~3GB | Slowest |
|
|
52
|
+
|
|
53
|
+
### Apple Silicon (Unified Memory)
|
|
54
|
+
|
|
55
|
+
On Macs, CPU and GPU share the same memory pool — there's nothing to "offload." OverflowML detects this and skips offloading entirely. If the model fits in ~75% of your RAM, it loads directly. If not, quantization is recommended.
|
|
56
|
+
|
|
57
|
+
| Mac | Unified Memory | Largest Model (4-bit) |
|
|
58
|
+
|-----|---------------|----------------------|
|
|
59
|
+
| M4 Max | 128GB | ~80B params |
|
|
60
|
+
| M3/M4 Ultra | 192GB | ~120B params |
|
|
61
|
+
| M3 Ultra | 512GB | 670B params |
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install overflowml
|
|
67
|
+
|
|
68
|
+
# With diffusers support:
|
|
69
|
+
pip install overflowml[diffusers]
|
|
70
|
+
|
|
71
|
+
# With quantization:
|
|
72
|
+
pip install overflowml[all]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Usage
|
|
76
|
+
|
|
77
|
+
### Diffusers Pipeline (Recommended)
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
import torch
|
|
81
|
+
import overflowml
|
|
82
|
+
from diffusers import FluxPipeline
|
|
83
|
+
|
|
84
|
+
pipe = FluxPipeline.from_pretrained(
|
|
85
|
+
"black-forest-labs/FLUX.1-dev",
|
|
86
|
+
torch_dtype=torch.bfloat16,
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# One line — auto-detects hardware, picks optimal strategy
|
|
90
|
+
strategy = overflowml.optimize_pipeline(pipe, model_size_gb=24)
|
|
91
|
+
print(strategy.summary())
|
|
92
|
+
|
|
93
|
+
result = pipe("a sunset over mountains", num_inference_steps=20)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Batch Generation with Memory Guard
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from overflowml import MemoryGuard
|
|
100
|
+
|
|
101
|
+
guard = MemoryGuard(threshold=0.7) # cleanup at 70% VRAM usage
|
|
102
|
+
|
|
103
|
+
for i, prompt in enumerate(prompts):
    with guard:  # auto-cleans VRAM between iterations
        result = pipe(prompt)
        result.images[0].save(f"output_{i}.png")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### CLI — Hardware Detection
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
$ overflowml detect
|
|
113
|
+
|
|
114
|
+
=== OverflowML Hardware Detection ===
|
|
115
|
+
Accelerator: cuda
|
|
116
|
+
GPU: NVIDIA GeForce RTX 5090 (32GB VRAM)
|
|
117
|
+
System RAM: 194GB
|
|
118
|
+
Overflow capacity: 178GB (total effective: 210GB)
|
|
119
|
+
BF16: yes | FP8: yes
|
|
120
|
+
|
|
121
|
+
$ overflowml plan 40
|
|
122
|
+
|
|
123
|
+
=== Strategy for 40GB model ===
|
|
124
|
+
Offload: sequential_cpu
|
|
125
|
+
Dtype: bfloat16
|
|
126
|
+
GC cleanup: enabled (threshold 70%)
|
|
127
|
+
Estimated peak VRAM: 3.0GB
|
|
128
|
+
→ Sequential offload: 1 layer at a time (~3GB VRAM), model lives in 194GB RAM
|
|
129
|
+
WARNING: FP8 incompatible with CPU offload on Windows
|
|
130
|
+
WARNING: Do NOT enable attention_slicing with sequential offload
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Standalone Model
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
import overflowml
|
|
137
|
+
|
|
138
|
+
model = load_my_transformer()
|
|
139
|
+
strategy = overflowml.optimize_model(model, model_size_gb=14)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Proven Results
|
|
143
|
+
|
|
144
|
+
Built and battle-tested on a real production pipeline:
|
|
145
|
+
|
|
146
|
+
| Metric | Before OverflowML | After |
|
|
147
|
+
|--------|-------------------|-------|
|
|
148
|
+
| Time per step | 530s (VRAM thrashing) | 6.7s |
|
|
149
|
+
| Images generated | 0/30 (crashes) | 30/30 |
|
|
150
|
+
| Total time | Impossible | 16.4 minutes |
|
|
151
|
+
| Peak VRAM | 32GB (thrashing) | 3GB |
|
|
152
|
+
| Reliability | Crashes after 3 images | Zero failures |
|
|
153
|
+
|
|
154
|
+
*40GB model on RTX 5090 (32GB VRAM) + 194GB RAM, sequential offload, Lightning LoRA 4-step*
|
|
155
|
+
|
|
156
|
+
## Known Incompatibilities
|
|
157
|
+
|
|
158
|
+
These are automatically handled by OverflowML's strategy engine:
|
|
159
|
+
|
|
160
|
+
| Combination | Issue | OverflowML Fix |
|
|
161
|
+
|-------------|-------|----------------|
|
|
162
|
+
| FP8 + CPU offload (Windows) | `Float8Tensor` can't move between devices | Skips FP8, uses BF16 |
|
|
163
|
+
| `attention_slicing` + sequential offload | CUDA illegal memory access | Never enables both |
|
|
164
|
+
| `enable_model_cpu_offload` + 40GB transformer | Transformer exceeds VRAM | Uses sequential offload instead |
|
|
165
|
+
| `expandable_segments` on Windows WDDM | Not supported | Gracefully ignored |
|
|
166
|
+
|
|
167
|
+
## Architecture
|
|
168
|
+
|
|
169
|
+
```
|
|
170
|
+
overflowml/
|
|
171
|
+
├── detect.py — Hardware detection (CUDA, MPS, MLX, ROCm, CPU)
|
|
172
|
+
├── strategy.py — Strategy engine (picks optimal offload + quantization)
|
|
173
|
+
├── optimize.py — Applies strategy to pipelines and models
|
|
174
|
+
└── cli.py — Command-line interface
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Cross-Platform Support
|
|
178
|
+
|
|
179
|
+
| Platform | Accelerator | Status |
|
|
180
|
+
|----------|-------------|--------|
|
|
181
|
+
| Windows + NVIDIA | CUDA | Production-ready |
|
|
182
|
+
| Linux + NVIDIA | CUDA | Production-ready |
|
|
183
|
+
| macOS + Apple Silicon | MPS / MLX | Detection ready, optimization in progress |
|
|
184
|
+
| Linux + AMD | ROCm | Planned |
|
|
185
|
+
| CPU-only | CPU | Fallback always works |
|
|
186
|
+
|
|
187
|
+
## License
|
|
188
|
+
|
|
189
|
+
MIT
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Example: Use OverflowML with any diffusers pipeline.

Works with Stable Diffusion, SDXL, Flux, Qwen-Image-Edit, etc.
Just load your pipeline normally, then call optimize_pipeline().

Each example below follows the same three steps: load the pipeline,
call overflowml.optimize_pipeline() once, then run inference as usual.
"""

import torch
import overflowml
from overflowml import MemoryGuard

# --- Example 1: SDXL (7GB model, fits most GPUs) ---
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
)
# No model_size_gb here — presumably optimize_pipeline estimates the
# footprint from the pipeline itself (TODO confirm against optimize.py);
# Example 2 shows the explicit form.
strategy = overflowml.optimize_pipeline(pipe)
# On RTX 4090: no offload needed, torch.compile enabled
# On RTX 3060 (12GB): might use FP8 or model offload

result = pipe("a cat in space", num_inference_steps=20)
result.images[0].save("sdxl_output.png")


# --- Example 2: Flux (24GB model, needs offload on most GPUs) ---
from diffusers import FluxPipeline

# Note: `pipe` is rebound — the SDXL pipeline above is no longer referenced
# after this point, and Example 3 runs against this Flux pipeline.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
)
strategy = overflowml.optimize_pipeline(pipe, model_size_gb=24)
# On RTX 5090 (32GB): FP8 quantization, fits in VRAM
# On RTX 4090 (24GB): sequential offload, ~3GB VRAM
# On Mac M4 Max (128GB): direct load

result = pipe("a sunset over mountains", num_inference_steps=20)
result.images[0].save("flux_output.png")


# --- Example 3: Batch generation with MemoryGuard ---
guard = MemoryGuard(threshold=0.7)  # trigger cleanup at 70% VRAM usage

prompts = ["a cat", "a dog", "a bird", "a fish"]
for i, prompt in enumerate(prompts):
    with guard:  # auto-cleans VRAM between images
        result = pipe(prompt, num_inference_steps=20)
        result.images[0].save(f"batch_{i}.png")
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Example: Run Qwen-Image-Edit-2511 (40GB model) on any GPU.

This model is 40GB in BF16 — too large for most GPUs.
OverflowML auto-detects your hardware and picks the best strategy:
- RTX 5090 (32GB): sequential offload, ~3GB VRAM, 33s/image
- RTX 4090 (24GB): sequential offload, ~3GB VRAM, ~50s/image
- RTX 3090 (24GB): sequential offload, ~3GB VRAM, ~80s/image
- Mac M2 Ultra (192GB): direct load, full speed
- Mac M4 Max (128GB): direct load, full speed

Requires a local reference image named ``avatar.png`` in the working
directory; outputs are written to ``output_1.png`` .. ``output_3.png``.
"""

import torch
from PIL import Image
from diffusers import QwenImageEditPlusPipeline

import overflowml
from overflowml import MemoryGuard

# 1. Detect hardware (prints accelerator, VRAM, RAM, dtype support)
hw = overflowml.detect_hardware()
print(hw.summary())

# 2. Load model — low_cpu_mem_usage avoids a second full copy in RAM
#    while the checkpoint streams in.
pipe = QwenImageEditPlusPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit-2511",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# 3. Optimize — ONE LINE does everything
strategy = overflowml.optimize_pipeline(pipe, model_size_gb=40)
print(strategy.summary())

# 4. Generate with memory guard (prevents VRAM fragmentation)
face = Image.open("avatar.png").convert("RGB")
guard = MemoryGuard(threshold=0.7, verbose=True)

prompts = [
    "Photograph of this woman, clean white studio, confident gaze, editorial headshot.",
    "Photograph of this woman, outdoor golden hour, casual white linen shirt, natural smile.",
    "Photograph of this woman, cozy cafe, holding ceramic coffee cup, relaxed expression.",
]

for i, prompt in enumerate(prompts):
    with guard:
        result = pipe(
            image=[face],
            prompt=f"[image 1] is the reference face. {prompt}",
            negative_prompt="low quality, blurry, distorted",
            true_cfg_scale=4.0,
            guidance_scale=1.0,
            num_inference_steps=25,
            width=832,
            height=1040,
            # Fixed seed each iteration: only the prompt varies between runs,
            # so results are reproducible.
            generator=torch.manual_seed(42),
        )
        result.images[0].save(f"output_{i+1}.png")
        print(f"Image {i+1} saved")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""OverflowML — Run AI models larger than your GPU."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from .detect import detect_hardware, HardwareProfile
|
|
6
|
+
from .strategy import pick_strategy, Strategy
|
|
7
|
+
from .optimize import optimize_pipeline, optimize_model
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"detect_hardware",
|
|
11
|
+
"HardwareProfile",
|
|
12
|
+
"pick_strategy",
|
|
13
|
+
"Strategy",
|
|
14
|
+
"optimize_pipeline",
|
|
15
|
+
"optimize_model",
|
|
16
|
+
]
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""CLI tool for hardware detection and strategy recommendation."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
if sys.platform == "win32":
|
|
8
|
+
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
|
9
|
+
sys.stderr.reconfigure(encoding="utf-8", errors="replace")
|
|
10
|
+
|
|
11
|
+
from .detect import detect_hardware
|
|
12
|
+
from .strategy import pick_strategy
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main():
    """Entry point for the ``overflowml`` CLI.

    Subcommands:
        detect  Detect hardware and print its capabilities.
        plan    Recommend a loading strategy for a model of a given size (GB).

    With no subcommand, prints the usage help. Exits with argparse's
    standard code 2 on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        prog="overflowml",
        description="OverflowML — Run AI models larger than your GPU",
    )
    sub = parser.add_subparsers(dest="command")

    # --- detect
    sub.add_parser("detect", help="Detect hardware and show capabilities")

    # --- plan
    plan = sub.add_parser("plan", help="Plan optimal loading strategy for a model")
    plan.add_argument("model_size", type=float, help="Model size in GB (BF16 weights)")
    plan.add_argument("--fast", action="store_true", help="Prefer speed over VRAM savings")
    plan.add_argument("--no-quantize", action="store_true", help="Disable quantization")

    args = parser.parse_args()

    # Bare-message format: strategy/detect modules log human-readable lines.
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    if args.command == "detect":
        hw = detect_hardware()
        print("\n=== OverflowML Hardware Detection ===")
        print(hw.summary())
        # Plain strings — these lines interpolate nothing (were needless f-strings).
        print("\nFor a model that needs loading, run:")
        print("  overflowml plan <size_in_gb>")
        print()

    elif args.command == "plan":
        # Reject nonsensical sizes up front with a proper usage error
        # instead of handing them to the strategy engine.
        if args.model_size <= 0:
            parser.error("model_size must be a positive number of GB")

        hw = detect_hardware()
        print("\n=== Hardware ===")
        print(hw.summary())
        print()

        strategy = pick_strategy(
            hw, args.model_size,
            prefer_speed=args.fast,
            allow_quantization=not args.no_quantize,
        )
        print(f"=== Strategy for {args.model_size:.0f}GB model ===")
        print(strategy.summary())
        print()

        # Show a copy-pasteable code snippet for the planned size.
        print("=== Usage ===")
        print("```python")
        print("import overflowml")
        print("from diffusers import SomePipeline")
        print()
        print('pipe = SomePipeline.from_pretrained("model", torch_dtype=torch.bfloat16)')
        print(f"strategy = overflowml.optimize_pipeline(pipe, model_size_gb={args.model_size})")
        print("result = pipe(prompt)")
        print("```")
        print()

    else:
        parser.print_help()


if __name__ == "__main__":
    main()
|