flexium 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexium-0.1.0/LICENSE +21 -0
- flexium-0.1.0/PKG-INFO +328 -0
- flexium-0.1.0/README.md +279 -0
- flexium-0.1.0/flexium/__init__.py +39 -0
- flexium-0.1.0/flexium/auto.py +1418 -0
- flexium-0.1.0/flexium/cli/__init__.py +8 -0
- flexium-0.1.0/flexium/cli/flexium_ctl.py +431 -0
- flexium-0.1.0/flexium/config.py +200 -0
- flexium-0.1.0/flexium/dashboard/__init__.py +5 -0
- flexium-0.1.0/flexium/dashboard/app.py +94 -0
- flexium-0.1.0/flexium/dashboard/routes.py +565 -0
- flexium-0.1.0/flexium/dashboard/template.py +15 -0
- flexium-0.1.0/flexium/dashboard/templates/dashboard.html +1161 -0
- flexium-0.1.0/flexium/lightning/__init__.py +41 -0
- flexium-0.1.0/flexium/lightning/callback.py +223 -0
- flexium-0.1.0/flexium/lightning/utils.py +80 -0
- flexium-0.1.0/flexium/orchestrator/__init__.py +32 -0
- flexium-0.1.0/flexium/orchestrator/client.py +818 -0
- flexium-0.1.0/flexium/orchestrator/device_manager.py +173 -0
- flexium-0.1.0/flexium/orchestrator/device_registry.py +400 -0
- flexium-0.1.0/flexium/orchestrator/registry.py +967 -0
- flexium-0.1.0/flexium/orchestrator/server.py +834 -0
- flexium-0.1.0/flexium/proto/__init__.py +22 -0
- flexium-0.1.0/flexium/proto/orchestrator.proto +308 -0
- flexium-0.1.0/flexium/proto/orchestrator_pb2.py +108 -0
- flexium-0.1.0/flexium/proto/orchestrator_pb2_grpc.py +673 -0
- flexium-0.1.0/flexium/timing.py +145 -0
- flexium-0.1.0/flexium/utils/__init__.py +5 -0
- flexium-0.1.0/flexium/utils/gpu_errors.py +269 -0
- flexium-0.1.0/flexium/utils/gpu_info.py +908 -0
- flexium-0.1.0/flexium/utils/logging.py +97 -0
- flexium-0.1.0/flexium.egg-info/PKG-INFO +328 -0
- flexium-0.1.0/flexium.egg-info/SOURCES.txt +58 -0
- flexium-0.1.0/flexium.egg-info/dependency_links.txt +1 -0
- flexium-0.1.0/flexium.egg-info/entry_points.txt +2 -0
- flexium-0.1.0/flexium.egg-info/requires.txt +27 -0
- flexium-0.1.0/flexium.egg-info/top_level.txt +1 -0
- flexium-0.1.0/pyproject.toml +137 -0
- flexium-0.1.0/setup.cfg +4 -0
- flexium-0.1.0/tests/test_auto.py +1090 -0
- flexium-0.1.0/tests/test_cli.py +259 -0
- flexium-0.1.0/tests/test_config.py +246 -0
- flexium-0.1.0/tests/test_connection_manager.py +380 -0
- flexium-0.1.0/tests/test_dashboard.py +1263 -0
- flexium-0.1.0/tests/test_dataloader_after_fork.py +169 -0
- flexium-0.1.0/tests/test_device_manager.py +124 -0
- flexium-0.1.0/tests/test_device_registry.py +838 -0
- flexium-0.1.0/tests/test_device_reporting_integration.py +441 -0
- flexium-0.1.0/tests/test_error_recovery_integration.py +193 -0
- flexium-0.1.0/tests/test_gpu_errors.py +257 -0
- flexium-0.1.0/tests/test_gpu_info.py +395 -0
- flexium-0.1.0/tests/test_imports.py +43 -0
- flexium-0.1.0/tests/test_lightning.py +403 -0
- flexium-0.1.0/tests/test_logging.py +128 -0
- flexium-0.1.0/tests/test_orchestrator_client.py +592 -0
- flexium-0.1.0/tests/test_orchestrator_recovery.py +274 -0
- flexium-0.1.0/tests/test_pause_resume.py +660 -0
- flexium-0.1.0/tests/test_registry.py +1500 -0
- flexium-0.1.0/tests/test_resource_requirements.py +327 -0
- flexium-0.1.0/tests/test_timing.py +352 -0
flexium-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Flexium Authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
flexium-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: flexium
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Flexium.AI - Flexible Resource Allocation for GPU training with live migration
|
|
5
|
+
Author: Flexium.AI
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/flexiumai/flexium
|
|
8
|
+
Project-URL: Documentation, https://github.com/flexiumai/flexium#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/flexiumai/flexium
|
|
10
|
+
Keywords: gpu,pytorch,training,orchestration,migration,cuda,flexible-resource-allocation
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: grpcio>=1.50.0
|
|
28
|
+
Requires-Dist: protobuf>=4.0.0
|
|
29
|
+
Requires-Dist: pynvml>=11.0.0
|
|
30
|
+
Requires-Dist: flask>=2.0.0
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
36
|
+
Requires-Dist: grpcio-tools>=1.50.0; extra == "dev"
|
|
37
|
+
Provides-Extra: torch
|
|
38
|
+
Requires-Dist: torch>=1.9.0; extra == "torch"
|
|
39
|
+
Requires-Dist: torchvision>=0.10.0; extra == "torch"
|
|
40
|
+
Provides-Extra: torch-cuda
|
|
41
|
+
Requires-Dist: pytorch-lightning>=2.0.0; extra == "torch-cuda"
|
|
42
|
+
Provides-Extra: lightning
|
|
43
|
+
Requires-Dist: pytorch-lightning>=2.0.0; extra == "lightning"
|
|
44
|
+
Provides-Extra: debug
|
|
45
|
+
Requires-Dist: debugpy>=1.6.0; extra == "debug"
|
|
46
|
+
Provides-Extra: all
|
|
47
|
+
Requires-Dist: flexium[debug,dev,torch]; extra == "all"
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
|
|
50
|
+
<p align="center">
|
|
51
|
+
<img src="logo_with_text.png" alt="Flexium.AI Logo" width="400">
|
|
52
|
+
</p>
|
|
53
|
+
|
|
54
|
+
<h1 align="center">Flexium.AI</h1>
|
|
55
|
+
|
|
56
|
+
<p align="center">
|
|
57
|
+
<strong>Flexible Resource Allocation for GPU Training</strong><br>
|
|
58
|
+
Live GPU migration for PyTorch - zero code changes required.
|
|
59
|
+
</p>
|
|
60
|
+
|
|
61
|
+
<p align="center">
|
|
62
|
+
<a href="#installation">Installation</a> •
|
|
63
|
+
<a href="#quick-start">Quick Start</a> •
|
|
64
|
+
<a href="#features">Features</a> •
|
|
65
|
+
<a href="#documentation">Documentation</a>
|
|
66
|
+
</p>
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
Flexium.AI allows you to move running PyTorch training jobs between GPUs without stopping them. Free up GPUs for colleagues, recover from hardware errors, and manage shared GPU clusters - all while your training continues seamlessly.
|
|
71
|
+
|
|
72
|
+
## The Problem
|
|
73
|
+
|
|
74
|
+
When you need to free a GPU during training, your options are bad:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
# You're training on cuda:0, colleague needs it urgently
|
|
78
|
+
model = model.to("cuda:1") # Doesn't work - memory stays on cuda:0!
|
|
79
|
+
torch.cuda.empty_cache() # Still doesn't free it
|
|
80
|
+
# Your colleague still can't use the GPU
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
PyTorch's CUDA memory allocator holds onto GPU memory even after `model.to()`. The only way to truly free a GPU is to kill the process - losing your training progress.
|
|
84
|
+
|
|
85
|
+
## The Solution
|
|
86
|
+
|
|
87
|
+
Flexium guarantees complete GPU release through driver-level migration (requires NVIDIA driver 580+):
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
import flexium.auto # Add this import
|
|
91
|
+
|
|
92
|
+
with flexium.auto.run(): # Wrap your training
|
|
93
|
+
model = Net().cuda() # Standard PyTorch - no changes needed!
|
|
94
|
+
for epoch in range(100):
|
|
95
|
+
for batch in dataloader:
|
|
96
|
+
# ... your normal training code ...
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Now you can migrate anytime:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
flexium-ctl migrate <process-id> cuda:1 # Training continues on new GPU
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
The GPU is **100% freed** - your colleague can use it immediately.
|
|
106
|
+
|
|
107
|
+
## Features
|
|
108
|
+
|
|
109
|
+
- **Zero code changes** - Just wrap your training in `flexium.auto.run()`
|
|
110
|
+
- **True GPU freedom** - Guarantees zero memory residue on source GPU
|
|
111
|
+
- **Live migration** - Move training between GPUs without losing progress
|
|
112
|
+
- **Automatic state management** - Training state preserved transparently
|
|
113
|
+
- **Web dashboard** - Visual GPU management at `http://localhost:8080`
|
|
114
|
+
- **CLI control** - `flexium-ctl` for scripting and automation
|
|
115
|
+
- **Pause/Resume** - Free GPU completely, resume later on any available GPU
|
|
116
|
+
- **Error recovery** - Auto-migrate on OOM, ECC errors, or hardware failures
|
|
117
|
+
- **Graceful degradation** - Training continues even if orchestrator dies
|
|
118
|
+
|
|
119
|
+
## Installation
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
# From PyPI
|
|
123
|
+
pip install flexium
|
|
124
|
+
|
|
125
|
+
# From source
|
|
126
|
+
git clone https://github.com/flexiumai/flexium.git
|
|
127
|
+
cd flexium
|
|
128
|
+
pip install -e .
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Requirements
|
|
132
|
+
|
|
133
|
+
- Python 3.8+
|
|
134
|
+
- PyTorch 2.0+ with CUDA support ([install guide](https://pytorch.org/get-started/locally/))
|
|
135
|
+
- NVIDIA GPU with CUDA support
|
|
136
|
+
- **NVIDIA Driver 580+** (required for zero-residue migration)
|
|
137
|
+
- Linux x86_64 (Windows/macOS not yet supported)
|
|
138
|
+
|
|
139
|
+
## Quick Start
|
|
140
|
+
|
|
141
|
+
### 1. Start the Orchestrator
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
flexium-ctl server --dashboard
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
This starts:
|
|
148
|
+
- gRPC server on port 50051 (coordinates migrations)
|
|
149
|
+
- Web dashboard at http://localhost:8080
|
|
150
|
+
|
|
151
|
+
### 2. Add Flexium to Your Training
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
import flexium.auto
|
|
155
|
+
import torch
|
|
156
|
+
|
|
157
|
+
with flexium.auto.run():
|
|
158
|
+
model = MyModel().cuda()
|
|
159
|
+
optimizer = torch.optim.Adam(model.parameters())
|
|
160
|
+
|
|
161
|
+
for epoch in range(100):
|
|
162
|
+
for batch in dataloader:
|
|
163
|
+
loss = model(batch.cuda()).mean()
|
|
164
|
+
loss.backward()
|
|
165
|
+
optimizer.step()
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
That's it! Your training is now migratable.
|
|
169
|
+
|
|
170
|
+
### 3. Migrate When Needed
|
|
171
|
+
|
|
172
|
+
**Via Dashboard:**
|
|
173
|
+
Open http://localhost:8080, find your process, click "Migrate", select target GPU.
|
|
174
|
+
|
|
175
|
+
**Via CLI:**
|
|
176
|
+
```bash
|
|
177
|
+
# List running processes
|
|
178
|
+
flexium-ctl list
|
|
179
|
+
|
|
180
|
+
# Migrate to different GPU
|
|
181
|
+
flexium-ctl migrate <process-id> cuda:1
|
|
182
|
+
|
|
183
|
+
# Pause (free GPU completely)
|
|
184
|
+
flexium-ctl pause <process-id>
|
|
185
|
+
|
|
186
|
+
# Resume on any available GPU
|
|
187
|
+
flexium-ctl resume <process-id>
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Real-World Scenarios
|
|
191
|
+
|
|
192
|
+
### GPU Contention
|
|
193
|
+
```bash
|
|
194
|
+
# Alice is training on cuda:0, Bob needs it urgently
|
|
195
|
+
flexium-ctl migrate alice-abc123 cuda:2
|
|
196
|
+
# Alice's training continues on cuda:2, cuda:0 is free for Bob
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### OOM Recovery
|
|
200
|
+
```python
|
|
201
|
+
with flexium.auto.run():
|
|
202
|
+
# If OOM occurs, flexium auto-migrates to GPU with more VRAM
|
|
203
|
+
model = LargeModel().cuda()
|
|
204
|
+
train(model)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### Shared Cluster Management
|
|
208
|
+
```bash
|
|
209
|
+
# See all jobs across the cluster
|
|
210
|
+
flexium-ctl list
|
|
211
|
+
|
|
212
|
+
# Dashboard shows: who's using what, progress, VRAM usage
|
|
213
|
+
# One-click migration when someone needs a GPU
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
### Hardware Failure
|
|
217
|
+
```python
|
|
218
|
+
with flexium.auto.run():
|
|
219
|
+
# ECC error? Driver crash? Auto-migrate to healthy GPU
|
|
220
|
+
# Bad GPU marked unhealthy, won't be used again
|
|
221
|
+
train(model)
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## How It Works
|
|
225
|
+
|
|
226
|
+
Flexium uses **driver-level GPU migration** (requires NVIDIA driver 580+) that guarantees complete GPU memory release - something that's impossible with standard PyTorch memory management.
|
|
227
|
+
|
|
228
|
+
When you request a migration:
|
|
229
|
+
1. GPU state is checkpointed at the driver level
|
|
230
|
+
2. Source GPU memory is completely released (0 MB residue)
|
|
231
|
+
3. State is restored on the target GPU
|
|
232
|
+
4. Training continues seamlessly
|
|
233
|
+
|
|
234
|
+
The orchestrator coordinates all GPU resources across your cluster, tracking utilization, health status, and process state to enable intelligent scheduling and automatic recovery.
|
|
235
|
+
|
|
236
|
+
## Documentation
|
|
237
|
+
|
|
238
|
+
- [Getting Started Guide](docs/getting-started.md)
|
|
239
|
+
- [API Reference](docs/api.md)
|
|
240
|
+
- [Architecture Overview](docs/ARCHITECTURE.md)
|
|
241
|
+
- [Examples](docs/examples.md)
|
|
242
|
+
- [Troubleshooting](docs/troubleshooting.md)
|
|
243
|
+
|
|
244
|
+
## Benchmarks
|
|
245
|
+
|
|
246
|
+
Migration overhead is minimal:
|
|
247
|
+
|
|
248
|
+
| Model Size | Checkpoint Time | Total Migration |
|
|
249
|
+
|------------|-----------------|-----------------|
|
|
250
|
+
| 100 MB | ~200 ms | ~700 ms |
|
|
251
|
+
| 500 MB | ~800 ms | ~2 sec |
|
|
252
|
+
| 1 GB | ~1.5 sec | ~4 sec |
|
|
253
|
+
|
|
254
|
+
Runtime overhead during normal training: **< 2%**
|
|
255
|
+
|
|
256
|
+
## Configuration
|
|
257
|
+
|
|
258
|
+
### Environment Variables
|
|
259
|
+
```bash
|
|
260
|
+
export GPU_ORCHESTRATOR=localhost:50051
|
|
261
|
+
export GPU_DEVICE=cuda:0
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
### Config File (`~/.flexiumrc`)
|
|
265
|
+
```yaml
|
|
266
|
+
orchestrator: localhost:50051
|
|
267
|
+
device: cuda:0
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
### Inline Parameters
|
|
271
|
+
```python
|
|
272
|
+
with flexium.auto.run(orchestrator="server:50051", device="cuda:1"):
|
|
273
|
+
...
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
## CLI Reference
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
# Server management
|
|
280
|
+
flexium-ctl server [--port PORT] [--dashboard]
|
|
281
|
+
|
|
282
|
+
# Process management
|
|
283
|
+
flexium-ctl list # List all processes
|
|
284
|
+
flexium-ctl status <process-id> # Get process details
|
|
285
|
+
flexium-ctl migrate <process-id> <device> # Migrate to device
|
|
286
|
+
flexium-ctl pause <process-id> # Pause (free GPU)
|
|
287
|
+
flexium-ctl resume <process-id> [device] # Resume training
|
|
288
|
+
|
|
289
|
+
# GPU management
|
|
290
|
+
flexium-ctl devices # List all GPUs
|
|
291
|
+
flexium-ctl device <gpu-uuid> # Get GPU details
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
## Contributing
|
|
295
|
+
|
|
296
|
+
We welcome contributions! Please see our contributing guidelines (coming soon).
|
|
297
|
+
|
|
298
|
+
```bash
|
|
299
|
+
# Development setup
|
|
300
|
+
git clone https://github.com/flexiumai/flexium.git
|
|
301
|
+
cd flexium
|
|
302
|
+
pip install -e ".[dev]"
|
|
303
|
+
|
|
304
|
+
# Run tests
|
|
305
|
+
pytest tests/ -v
|
|
306
|
+
|
|
307
|
+
# Run linting
|
|
308
|
+
ruff check flexium/
|
|
309
|
+
mypy flexium/
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
## License
|
|
313
|
+
|
|
314
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
315
|
+
|
|
316
|
+
## Acknowledgments
|
|
317
|
+
|
|
318
|
+
Flexium.AI was inspired by the challenges of managing shared GPU clusters in research environments. Special thanks to everyone who provided feedback and tested early versions.
|
|
319
|
+
|
|
320
|
+
---
|
|
321
|
+
|
|
322
|
+
<p align="center">
|
|
323
|
+
<img src="logo_with_text.png" alt="Flexium.AI" width="150"><br>
|
|
324
|
+
<strong>Flexium.AI</strong> - Flexible Resource Allocation<br>
|
|
325
|
+
<em>Making GPU management effortless</em>
|
|
326
|
+
</p>
|
|
327
|
+
|
|
328
|
+
**Note:** Flexium.AI is currently in alpha. APIs may change. Please report issues on GitHub.
|
flexium-0.1.0/README.md
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="logo_with_text.png" alt="Flexium.AI Logo" width="400">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">Flexium.AI</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Flexible Resource Allocation for GPU Training</strong><br>
|
|
9
|
+
Live GPU migration for PyTorch - zero code changes required.
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
<p align="center">
|
|
13
|
+
<a href="#installation">Installation</a> •
|
|
14
|
+
<a href="#quick-start">Quick Start</a> •
|
|
15
|
+
<a href="#features">Features</a> •
|
|
16
|
+
<a href="#documentation">Documentation</a>
|
|
17
|
+
</p>
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
Flexium.AI allows you to move running PyTorch training jobs between GPUs without stopping them. Free up GPUs for colleagues, recover from hardware errors, and manage shared GPU clusters - all while your training continues seamlessly.
|
|
22
|
+
|
|
23
|
+
## The Problem
|
|
24
|
+
|
|
25
|
+
When you need to free a GPU during training, your options are bad:
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
# You're training on cuda:0, colleague needs it urgently
|
|
29
|
+
model = model.to("cuda:1") # Doesn't work - memory stays on cuda:0!
|
|
30
|
+
torch.cuda.empty_cache() # Still doesn't free it
|
|
31
|
+
# Your colleague still can't use the GPU
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
PyTorch's CUDA memory allocator holds onto GPU memory even after `model.to()`. The only way to truly free a GPU is to kill the process - losing your training progress.
|
|
35
|
+
|
|
36
|
+
## The Solution
|
|
37
|
+
|
|
38
|
+
Flexium guarantees complete GPU release through driver-level migration (requires NVIDIA driver 580+):
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import flexium.auto # Add this import
|
|
42
|
+
|
|
43
|
+
with flexium.auto.run(): # Wrap your training
|
|
44
|
+
model = Net().cuda() # Standard PyTorch - no changes needed!
|
|
45
|
+
for epoch in range(100):
|
|
46
|
+
for batch in dataloader:
|
|
47
|
+
# ... your normal training code ...
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Now you can migrate anytime:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
flexium-ctl migrate <process-id> cuda:1 # Training continues on new GPU
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
The GPU is **100% freed** - your colleague can use it immediately.
|
|
57
|
+
|
|
58
|
+
## Features
|
|
59
|
+
|
|
60
|
+
- **Zero code changes** - Just wrap your training in `flexium.auto.run()`
|
|
61
|
+
- **True GPU freedom** - Guarantees zero memory residue on source GPU
|
|
62
|
+
- **Live migration** - Move training between GPUs without losing progress
|
|
63
|
+
- **Automatic state management** - Training state preserved transparently
|
|
64
|
+
- **Web dashboard** - Visual GPU management at `http://localhost:8080`
|
|
65
|
+
- **CLI control** - `flexium-ctl` for scripting and automation
|
|
66
|
+
- **Pause/Resume** - Free GPU completely, resume later on any available GPU
|
|
67
|
+
- **Error recovery** - Auto-migrate on OOM, ECC errors, or hardware failures
|
|
68
|
+
- **Graceful degradation** - Training continues even if orchestrator dies
|
|
69
|
+
|
|
70
|
+
## Installation
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# From PyPI
|
|
74
|
+
pip install flexium
|
|
75
|
+
|
|
76
|
+
# From source
|
|
77
|
+
git clone https://github.com/flexiumai/flexium.git
|
|
78
|
+
cd flexium
|
|
79
|
+
pip install -e .
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Requirements
|
|
83
|
+
|
|
84
|
+
- Python 3.8+
|
|
85
|
+
- PyTorch 2.0+ with CUDA support ([install guide](https://pytorch.org/get-started/locally/))
|
|
86
|
+
- NVIDIA GPU with CUDA support
|
|
87
|
+
- **NVIDIA Driver 580+** (required for zero-residue migration)
|
|
88
|
+
- Linux x86_64 (Windows/macOS not yet supported)
|
|
89
|
+
|
|
90
|
+
## Quick Start
|
|
91
|
+
|
|
92
|
+
### 1. Start the Orchestrator
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
flexium-ctl server --dashboard
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
This starts:
|
|
99
|
+
- gRPC server on port 50051 (coordinates migrations)
|
|
100
|
+
- Web dashboard at http://localhost:8080
|
|
101
|
+
|
|
102
|
+
### 2. Add Flexium to Your Training
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
import flexium.auto
|
|
106
|
+
import torch
|
|
107
|
+
|
|
108
|
+
with flexium.auto.run():
|
|
109
|
+
model = MyModel().cuda()
|
|
110
|
+
optimizer = torch.optim.Adam(model.parameters())
|
|
111
|
+
|
|
112
|
+
for epoch in range(100):
|
|
113
|
+
for batch in dataloader:
|
|
114
|
+
loss = model(batch.cuda()).mean()
|
|
115
|
+
loss.backward()
|
|
116
|
+
optimizer.step()
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
That's it! Your training is now migratable.
|
|
120
|
+
|
|
121
|
+
### 3. Migrate When Needed
|
|
122
|
+
|
|
123
|
+
**Via Dashboard:**
|
|
124
|
+
Open http://localhost:8080, find your process, click "Migrate", select target GPU.
|
|
125
|
+
|
|
126
|
+
**Via CLI:**
|
|
127
|
+
```bash
|
|
128
|
+
# List running processes
|
|
129
|
+
flexium-ctl list
|
|
130
|
+
|
|
131
|
+
# Migrate to different GPU
|
|
132
|
+
flexium-ctl migrate <process-id> cuda:1
|
|
133
|
+
|
|
134
|
+
# Pause (free GPU completely)
|
|
135
|
+
flexium-ctl pause <process-id>
|
|
136
|
+
|
|
137
|
+
# Resume on any available GPU
|
|
138
|
+
flexium-ctl resume <process-id>
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Real-World Scenarios
|
|
142
|
+
|
|
143
|
+
### GPU Contention
|
|
144
|
+
```bash
|
|
145
|
+
# Alice is training on cuda:0, Bob needs it urgently
|
|
146
|
+
flexium-ctl migrate alice-abc123 cuda:2
|
|
147
|
+
# Alice's training continues on cuda:2, cuda:0 is free for Bob
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### OOM Recovery
|
|
151
|
+
```python
|
|
152
|
+
with flexium.auto.run():
|
|
153
|
+
# If OOM occurs, flexium auto-migrates to GPU with more VRAM
|
|
154
|
+
model = LargeModel().cuda()
|
|
155
|
+
train(model)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Shared Cluster Management
|
|
159
|
+
```bash
|
|
160
|
+
# See all jobs across the cluster
|
|
161
|
+
flexium-ctl list
|
|
162
|
+
|
|
163
|
+
# Dashboard shows: who's using what, progress, VRAM usage
|
|
164
|
+
# One-click migration when someone needs a GPU
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Hardware Failure
|
|
168
|
+
```python
|
|
169
|
+
with flexium.auto.run():
|
|
170
|
+
# ECC error? Driver crash? Auto-migrate to healthy GPU
|
|
171
|
+
# Bad GPU marked unhealthy, won't be used again
|
|
172
|
+
train(model)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## How It Works
|
|
176
|
+
|
|
177
|
+
Flexium uses **driver-level GPU migration** (requires NVIDIA driver 580+) that guarantees complete GPU memory release - something that's impossible with standard PyTorch memory management.
|
|
178
|
+
|
|
179
|
+
When you request a migration:
|
|
180
|
+
1. GPU state is checkpointed at the driver level
|
|
181
|
+
2. Source GPU memory is completely released (0 MB residue)
|
|
182
|
+
3. State is restored on the target GPU
|
|
183
|
+
4. Training continues seamlessly
|
|
184
|
+
|
|
185
|
+
The orchestrator coordinates all GPU resources across your cluster, tracking utilization, health status, and process state to enable intelligent scheduling and automatic recovery.
|
|
186
|
+
|
|
187
|
+
## Documentation
|
|
188
|
+
|
|
189
|
+
- [Getting Started Guide](docs/getting-started.md)
|
|
190
|
+
- [API Reference](docs/api.md)
|
|
191
|
+
- [Architecture Overview](docs/ARCHITECTURE.md)
|
|
192
|
+
- [Examples](docs/examples.md)
|
|
193
|
+
- [Troubleshooting](docs/troubleshooting.md)
|
|
194
|
+
|
|
195
|
+
## Benchmarks
|
|
196
|
+
|
|
197
|
+
Migration overhead is minimal:
|
|
198
|
+
|
|
199
|
+
| Model Size | Checkpoint Time | Total Migration |
|
|
200
|
+
|------------|-----------------|-----------------|
|
|
201
|
+
| 100 MB | ~200 ms | ~700 ms |
|
|
202
|
+
| 500 MB | ~800 ms | ~2 sec |
|
|
203
|
+
| 1 GB | ~1.5 sec | ~4 sec |
|
|
204
|
+
|
|
205
|
+
Runtime overhead during normal training: **< 2%**
|
|
206
|
+
|
|
207
|
+
## Configuration
|
|
208
|
+
|
|
209
|
+
### Environment Variables
|
|
210
|
+
```bash
|
|
211
|
+
export GPU_ORCHESTRATOR=localhost:50051
|
|
212
|
+
export GPU_DEVICE=cuda:0
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Config File (`~/.flexiumrc`)
|
|
216
|
+
```yaml
|
|
217
|
+
orchestrator: localhost:50051
|
|
218
|
+
device: cuda:0
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### Inline Parameters
|
|
222
|
+
```python
|
|
223
|
+
with flexium.auto.run(orchestrator="server:50051", device="cuda:1"):
|
|
224
|
+
...
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## CLI Reference
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
# Server management
|
|
231
|
+
flexium-ctl server [--port PORT] [--dashboard]
|
|
232
|
+
|
|
233
|
+
# Process management
|
|
234
|
+
flexium-ctl list # List all processes
|
|
235
|
+
flexium-ctl status <process-id> # Get process details
|
|
236
|
+
flexium-ctl migrate <process-id> <device> # Migrate to device
|
|
237
|
+
flexium-ctl pause <process-id> # Pause (free GPU)
|
|
238
|
+
flexium-ctl resume <process-id> [device] # Resume training
|
|
239
|
+
|
|
240
|
+
# GPU management
|
|
241
|
+
flexium-ctl devices # List all GPUs
|
|
242
|
+
flexium-ctl device <gpu-uuid> # Get GPU details
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## Contributing
|
|
246
|
+
|
|
247
|
+
We welcome contributions! Please see our contributing guidelines (coming soon).
|
|
248
|
+
|
|
249
|
+
```bash
|
|
250
|
+
# Development setup
|
|
251
|
+
git clone https://github.com/flexiumai/flexium.git
|
|
252
|
+
cd flexium
|
|
253
|
+
pip install -e ".[dev]"
|
|
254
|
+
|
|
255
|
+
# Run tests
|
|
256
|
+
pytest tests/ -v
|
|
257
|
+
|
|
258
|
+
# Run linting
|
|
259
|
+
ruff check flexium/
|
|
260
|
+
mypy flexium/
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## License
|
|
264
|
+
|
|
265
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
266
|
+
|
|
267
|
+
## Acknowledgments
|
|
268
|
+
|
|
269
|
+
Flexium.AI was inspired by the challenges of managing shared GPU clusters in research environments. Special thanks to everyone who provided feedback and tested early versions.
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
<p align="center">
|
|
274
|
+
<img src="logo_with_text.png" alt="Flexium.AI" width="150"><br>
|
|
275
|
+
<strong>Flexium.AI</strong> - Flexible Resource Allocation<br>
|
|
276
|
+
<em>Making GPU management effortless</em>
|
|
277
|
+
</p>
|
|
278
|
+
|
|
279
|
+
**Note:** Flexium.AI is currently in alpha. APIs may change. Please report issues on GitHub.
|
flexium-0.1.0/flexium/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Flexium - Dynamic GPU orchestration for PyTorch training.
|
|
2
|
+
|
|
3
|
+
This package provides GPU orchestration with live migration support using
|
|
4
|
+
driver-level migration for zero-residue GPU migration (requires NVIDIA driver 580+).
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
|
|
8
|
+
import flexium.auto
|
|
9
|
+
|
|
10
|
+
with flexium.auto.run():
|
|
11
|
+
model = nn.Linear(784, 10).cuda()
|
|
12
|
+
optimizer = torch.optim.Adam(model.parameters())
|
|
13
|
+
|
|
14
|
+
for epoch in range(100):
|
|
15
|
+
for batch in dataloader:
|
|
16
|
+
data, target = batch[0].cuda(), batch[1].cuda()
|
|
17
|
+
loss = model(data).sum()
|
|
18
|
+
loss.backward()
|
|
19
|
+
optimizer.step()
|
|
20
|
+
|
|
21
|
+
Configuration:
|
|
22
|
+
- Set GPU_ORCHESTRATOR environment variable to orchestrator address
|
|
23
|
+
- Or pass orchestrator= parameter to flexium.auto.run()
|
|
24
|
+
|
|
25
|
+
Migration is transparent - training continues in the same process,
|
|
26
|
+
same loop iteration, just on a different GPU.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
__version__ = "0.1.0"
|
|
32
|
+
|
|
33
|
+
# The main API is flexium.auto.run()
|
|
34
|
+
# Import with: import flexium.auto
|
|
35
|
+
# Usage: with flexium.auto.run(): ...
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"__version__",
|
|
39
|
+
]
|