flexium 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. flexium-0.1.0/LICENSE +21 -0
  2. flexium-0.1.0/PKG-INFO +328 -0
  3. flexium-0.1.0/README.md +279 -0
  4. flexium-0.1.0/flexium/__init__.py +39 -0
  5. flexium-0.1.0/flexium/auto.py +1418 -0
  6. flexium-0.1.0/flexium/cli/__init__.py +8 -0
  7. flexium-0.1.0/flexium/cli/flexium_ctl.py +431 -0
  8. flexium-0.1.0/flexium/config.py +200 -0
  9. flexium-0.1.0/flexium/dashboard/__init__.py +5 -0
  10. flexium-0.1.0/flexium/dashboard/app.py +94 -0
  11. flexium-0.1.0/flexium/dashboard/routes.py +565 -0
  12. flexium-0.1.0/flexium/dashboard/template.py +15 -0
  13. flexium-0.1.0/flexium/dashboard/templates/dashboard.html +1161 -0
  14. flexium-0.1.0/flexium/lightning/__init__.py +41 -0
  15. flexium-0.1.0/flexium/lightning/callback.py +223 -0
  16. flexium-0.1.0/flexium/lightning/utils.py +80 -0
  17. flexium-0.1.0/flexium/orchestrator/__init__.py +32 -0
  18. flexium-0.1.0/flexium/orchestrator/client.py +818 -0
  19. flexium-0.1.0/flexium/orchestrator/device_manager.py +173 -0
  20. flexium-0.1.0/flexium/orchestrator/device_registry.py +400 -0
  21. flexium-0.1.0/flexium/orchestrator/registry.py +967 -0
  22. flexium-0.1.0/flexium/orchestrator/server.py +834 -0
  23. flexium-0.1.0/flexium/proto/__init__.py +22 -0
  24. flexium-0.1.0/flexium/proto/orchestrator.proto +308 -0
  25. flexium-0.1.0/flexium/proto/orchestrator_pb2.py +108 -0
  26. flexium-0.1.0/flexium/proto/orchestrator_pb2_grpc.py +673 -0
  27. flexium-0.1.0/flexium/timing.py +145 -0
  28. flexium-0.1.0/flexium/utils/__init__.py +5 -0
  29. flexium-0.1.0/flexium/utils/gpu_errors.py +269 -0
  30. flexium-0.1.0/flexium/utils/gpu_info.py +908 -0
  31. flexium-0.1.0/flexium/utils/logging.py +97 -0
  32. flexium-0.1.0/flexium.egg-info/PKG-INFO +328 -0
  33. flexium-0.1.0/flexium.egg-info/SOURCES.txt +58 -0
  34. flexium-0.1.0/flexium.egg-info/dependency_links.txt +1 -0
  35. flexium-0.1.0/flexium.egg-info/entry_points.txt +2 -0
  36. flexium-0.1.0/flexium.egg-info/requires.txt +27 -0
  37. flexium-0.1.0/flexium.egg-info/top_level.txt +1 -0
  38. flexium-0.1.0/pyproject.toml +137 -0
  39. flexium-0.1.0/setup.cfg +4 -0
  40. flexium-0.1.0/tests/test_auto.py +1090 -0
  41. flexium-0.1.0/tests/test_cli.py +259 -0
  42. flexium-0.1.0/tests/test_config.py +246 -0
  43. flexium-0.1.0/tests/test_connection_manager.py +380 -0
  44. flexium-0.1.0/tests/test_dashboard.py +1263 -0
  45. flexium-0.1.0/tests/test_dataloader_after_fork.py +169 -0
  46. flexium-0.1.0/tests/test_device_manager.py +124 -0
  47. flexium-0.1.0/tests/test_device_registry.py +838 -0
  48. flexium-0.1.0/tests/test_device_reporting_integration.py +441 -0
  49. flexium-0.1.0/tests/test_error_recovery_integration.py +193 -0
  50. flexium-0.1.0/tests/test_gpu_errors.py +257 -0
  51. flexium-0.1.0/tests/test_gpu_info.py +395 -0
  52. flexium-0.1.0/tests/test_imports.py +43 -0
  53. flexium-0.1.0/tests/test_lightning.py +403 -0
  54. flexium-0.1.0/tests/test_logging.py +128 -0
  55. flexium-0.1.0/tests/test_orchestrator_client.py +592 -0
  56. flexium-0.1.0/tests/test_orchestrator_recovery.py +274 -0
  57. flexium-0.1.0/tests/test_pause_resume.py +660 -0
  58. flexium-0.1.0/tests/test_registry.py +1500 -0
  59. flexium-0.1.0/tests/test_resource_requirements.py +327 -0
  60. flexium-0.1.0/tests/test_timing.py +352 -0
flexium-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Flexium Authors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
flexium-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,328 @@
1
+ Metadata-Version: 2.4
2
+ Name: flexium
3
+ Version: 0.1.0
4
+ Summary: Flexium.AI - Flexible Resource Allocation for GPU training with live migration
5
+ Author: Flexium.AI
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/flexiumai/flexium
8
+ Project-URL: Documentation, https://github.com/flexiumai/flexium#readme
9
+ Project-URL: Repository, https://github.com/flexiumai/flexium
10
+ Keywords: gpu,pytorch,training,orchestration,migration,cuda,flexible-resource-allocation
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: POSIX :: Linux
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: System :: Distributed Computing
24
+ Requires-Python: >=3.8
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: grpcio>=1.50.0
28
+ Requires-Dist: protobuf>=4.0.0
29
+ Requires-Dist: pynvml>=11.0.0
30
+ Requires-Dist: flask>=2.0.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
33
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
34
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
35
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
36
+ Requires-Dist: grpcio-tools>=1.50.0; extra == "dev"
37
+ Provides-Extra: torch
38
+ Requires-Dist: torch>=1.9.0; extra == "torch"
39
+ Requires-Dist: torchvision>=0.10.0; extra == "torch"
40
+ Provides-Extra: torch-cuda
41
+ Requires-Dist: pytorch-lightning>=2.0.0; extra == "torch-cuda"
42
+ Provides-Extra: lightning
43
+ Requires-Dist: pytorch-lightning>=2.0.0; extra == "lightning"
44
+ Provides-Extra: debug
45
+ Requires-Dist: debugpy>=1.6.0; extra == "debug"
46
+ Provides-Extra: all
47
+ Requires-Dist: flexium[debug,dev,torch]; extra == "all"
48
+ Dynamic: license-file
49
+
50
+ <p align="center">
51
+ <img src="logo_with_text.png" alt="Flexium.AI Logo" width="400">
52
+ </p>
53
+
54
+ <h1 align="center">Flexium.AI</h1>
55
+
56
+ <p align="center">
57
+ <strong>Flexible Resource Allocation for GPU Training</strong><br>
58
+ Live GPU migration for PyTorch - zero code changes required.
59
+ </p>
60
+
61
+ <p align="center">
62
+ <a href="#installation">Installation</a> •
63
+ <a href="#quick-start">Quick Start</a> •
64
+ <a href="#features">Features</a> •
65
+ <a href="#documentation">Documentation</a>
66
+ </p>
67
+
68
+ ---
69
+
70
+ Flexium.AI allows you to move running PyTorch training jobs between GPUs without stopping them. Free up GPUs for colleagues, recover from hardware errors, and manage shared GPU clusters - all while your training continues seamlessly.
71
+
72
+ ## The Problem
73
+
74
+ When you need to free a GPU during training, your options are bad:
75
+
76
+ ```python
77
+ # You're training on cuda:0, colleague needs it urgently
78
+ model = model.to("cuda:1") # Doesn't work - memory stays on cuda:0!
79
+ torch.cuda.empty_cache() # Still doesn't free it
80
+ # Your colleague still can't use the GPU
81
+ ```
82
+
83
+ PyTorch's CUDA memory allocator holds onto GPU memory even after `model.to()`. The only way to truly free a GPU is to kill the process - losing your training progress.
84
+
85
+ ## The Solution
86
+
87
+ Flexium guarantees complete GPU release through driver-level migration (requires NVIDIA driver 580+):
88
+
89
+ ```python
90
+ import flexium.auto # Add this import
91
+
92
+ with flexium.auto.run(): # Wrap your training
93
+ model = Net().cuda() # Standard PyTorch - no changes needed!
94
+ for epoch in range(100):
95
+ for batch in dataloader:
96
+ # ... your normal training code ...
97
+ ```
98
+
99
+ Now you can migrate anytime:
100
+
101
+ ```bash
102
+ flexium-ctl migrate <process-id> cuda:1 # Training continues on new GPU
103
+ ```
104
+
105
+ The GPU is **100% freed** - your colleague can use it immediately.
106
+
107
+ ## Features
108
+
109
+ - **Zero code changes** - Just wrap your training in `flexium.auto.run()`
110
+ - **True GPU freedom** - Guarantees zero memory residue on source GPU
111
+ - **Live migration** - Move training between GPUs without losing progress
112
+ - **Automatic state management** - Training state preserved transparently
113
+ - **Web dashboard** - Visual GPU management at `http://localhost:8080`
114
+ - **CLI control** - `flexium-ctl` for scripting and automation
115
+ - **Pause/Resume** - Free GPU completely, resume later on any available GPU
116
+ - **Error recovery** - Auto-migrate on OOM, ECC errors, or hardware failures
117
+ - **Graceful degradation** - Training continues even if orchestrator dies
118
+
119
+ ## Installation
120
+
121
+ ```bash
122
+ # From PyPI (coming soon)
123
+ pip install flexium
124
+
125
+ # From source
126
+ git clone https://github.com/flexiumai/flexium.git
127
+ cd flexium
128
+ pip install -e .
129
+ ```
130
+
131
+ ### Requirements
132
+
133
+ - Python 3.8+
134
+ - PyTorch 2.0+ with CUDA support ([install guide](https://pytorch.org/get-started/locally/))
135
+ - NVIDIA GPU with CUDA support
136
+ - **NVIDIA Driver 580+** (required for zero-residue migration)
137
+ - Linux x86_64 (Windows/macOS not yet supported)
138
+
139
+ ## Quick Start
140
+
141
+ ### 1. Start the Orchestrator
142
+
143
+ ```bash
144
+ flexium-ctl server --dashboard
145
+ ```
146
+
147
+ This starts:
148
+ - gRPC server on port 50051 (coordinates migrations)
149
+ - Web dashboard at http://localhost:8080
150
+
151
+ ### 2. Add Flexium to Your Training
152
+
153
+ ```python
154
+ import flexium.auto
155
+ import torch
156
+
157
+ with flexium.auto.run():
158
+ model = MyModel().cuda()
159
+ optimizer = torch.optim.Adam(model.parameters())
160
+
161
+ for epoch in range(100):
162
+ for batch in dataloader:
163
+ loss = model(batch.cuda()).mean()
164
+ loss.backward()
165
+ optimizer.step()
166
+ ```
167
+
168
+ That's it! Your training is now migratable.
169
+
170
+ ### 3. Migrate When Needed
171
+
172
+ **Via Dashboard:**
173
+ Open http://localhost:8080, find your process, click "Migrate", select target GPU.
174
+
175
+ **Via CLI:**
176
+ ```bash
177
+ # List running processes
178
+ flexium-ctl list
179
+
180
+ # Migrate to different GPU
181
+ flexium-ctl migrate <process-id> cuda:1
182
+
183
+ # Pause (free GPU completely)
184
+ flexium-ctl pause <process-id>
185
+
186
+ # Resume on any available GPU
187
+ flexium-ctl resume <process-id>
188
+ ```
189
+
190
+ ## Real-World Scenarios
191
+
192
+ ### GPU Contention
193
+ ```bash
194
+ # Alice is training on cuda:0, Bob needs it urgently
195
+ flexium-ctl migrate alice-abc123 cuda:2
196
+ # Alice's training continues on cuda:2, cuda:0 is free for Bob
197
+ ```
198
+
199
+ ### OOM Recovery
200
+ ```python
201
+ with flexium.auto.run():
202
+ # If OOM occurs, flexium auto-migrates to GPU with more VRAM
203
+ model = LargeModel().cuda()
204
+ train(model)
205
+ ```
206
+
207
+ ### Shared Cluster Management
208
+ ```bash
209
+ # See all jobs across the cluster
210
+ flexium-ctl list
211
+
212
+ # Dashboard shows: who's using what, progress, VRAM usage
213
+ # One-click migration when someone needs a GPU
214
+ ```
215
+
216
+ ### Hardware Failure
217
+ ```python
218
+ with flexium.auto.run():
219
+ # ECC error? Driver crash? Auto-migrate to healthy GPU
220
+ # Bad GPU marked unhealthy, won't be used again
221
+ train(model)
222
+ ```
223
+
224
+ ## How It Works
225
+
226
+ Flexium uses **driver-level GPU migration** (requires NVIDIA driver 580+) that guarantees complete GPU memory release - something that's impossible with standard PyTorch memory management.
227
+
228
+ When you request a migration:
229
+ 1. GPU state is checkpointed at the driver level
230
+ 2. Source GPU memory is completely released (0 MB residue)
231
+ 3. State is restored on the target GPU
232
+ 4. Training continues seamlessly
233
+
234
+ The orchestrator coordinates all GPU resources across your cluster, tracking utilization, health status, and process state to enable intelligent scheduling and automatic recovery.
235
+
236
+ ## Documentation
237
+
238
+ - [Getting Started Guide](docs/getting-started.md)
239
+ - [API Reference](docs/api.md)
240
+ - [Architecture Overview](docs/ARCHITECTURE.md)
241
+ - [Examples](docs/examples.md)
242
+ - [Troubleshooting](docs/troubleshooting.md)
243
+
244
+ ## Benchmarks
245
+
246
+ Migration overhead is minimal:
247
+
248
+ | Model Size | Checkpoint Time | Total Migration |
249
+ |------------|-----------------|-----------------|
250
+ | 100 MB | ~200 ms | ~700 ms |
251
+ | 500 MB | ~800 ms | ~2 sec |
252
+ | 1 GB | ~1.5 sec | ~4 sec |
253
+
254
+ Runtime overhead during normal training: **< 2%**
255
+
256
+ ## Configuration
257
+
258
+ ### Environment Variables
259
+ ```bash
260
+ export GPU_ORCHESTRATOR=localhost:50051
261
+ export GPU_DEVICE=cuda:0
262
+ ```
263
+
264
+ ### Config File (`~/.flexiumrc`)
265
+ ```yaml
266
+ orchestrator: localhost:50051
267
+ device: cuda:0
268
+ ```
269
+
270
+ ### Inline Parameters
271
+ ```python
272
+ with flexium.auto.run(orchestrator="server:50051", device="cuda:1"):
273
+ ...
274
+ ```
275
+
276
+ ## CLI Reference
277
+
278
+ ```bash
279
+ # Server management
280
+ flexium-ctl server [--port PORT] [--dashboard]
281
+
282
+ # Process management
283
+ flexium-ctl list # List all processes
284
+ flexium-ctl status <process-id> # Get process details
285
+ flexium-ctl migrate <process-id> <device> # Migrate to device
286
+ flexium-ctl pause <process-id> # Pause (free GPU)
287
+ flexium-ctl resume <process-id> [device] # Resume training
288
+
289
+ # GPU management
290
+ flexium-ctl devices # List all GPUs
291
+ flexium-ctl device <gpu-uuid> # Get GPU details
292
+ ```
293
+
294
+ ## Contributing
295
+
296
+ We welcome contributions! Please see our contributing guidelines (coming soon).
297
+
298
+ ```bash
299
+ # Development setup
300
+ git clone https://github.com/flexiumai/flexium.git
301
+ cd flexium
302
+ pip install -e ".[dev]"
303
+
304
+ # Run tests
305
+ pytest tests/ -v
306
+
307
+ # Run linting
308
+ ruff check flexium/
309
+ mypy flexium/
310
+ ```
311
+
312
+ ## License
313
+
314
+ MIT License - see [LICENSE](LICENSE) for details.
315
+
316
+ ## Acknowledgments
317
+
318
+ Flexium.AI was inspired by the challenges of managing shared GPU clusters in research environments. Special thanks to everyone who provided feedback and tested early versions.
319
+
320
+ ---
321
+
322
+ <p align="center">
323
+ <img src="logo_with_text.png" alt="Flexium.AI" width="150"><br>
324
+ <strong>Flexium.AI</strong> - Flexible Resource Allocation<br>
325
+ <em>Making GPU management effortless</em>
326
+ </p>
327
+
328
+ **Note:** Flexium.AI is currently in alpha. APIs may change. Please report issues on GitHub.
@@ -0,0 +1,279 @@
1
+ <p align="center">
2
+ <img src="logo_with_text.png" alt="Flexium.AI Logo" width="400">
3
+ </p>
4
+
5
+ <h1 align="center">Flexium.AI</h1>
6
+
7
+ <p align="center">
8
+ <strong>Flexible Resource Allocation for GPU Training</strong><br>
9
+ Live GPU migration for PyTorch - zero code changes required.
10
+ </p>
11
+
12
+ <p align="center">
13
+ <a href="#installation">Installation</a> •
14
+ <a href="#quick-start">Quick Start</a> •
15
+ <a href="#features">Features</a> •
16
+ <a href="#documentation">Documentation</a>
17
+ </p>
18
+
19
+ ---
20
+
21
+ Flexium.AI allows you to move running PyTorch training jobs between GPUs without stopping them. Free up GPUs for colleagues, recover from hardware errors, and manage shared GPU clusters - all while your training continues seamlessly.
22
+
23
+ ## The Problem
24
+
25
+ When you need to free a GPU during training, your options are bad:
26
+
27
+ ```python
28
+ # You're training on cuda:0, colleague needs it urgently
29
+ model = model.to("cuda:1") # Doesn't work - memory stays on cuda:0!
30
+ torch.cuda.empty_cache() # Still doesn't free it
31
+ # Your colleague still can't use the GPU
32
+ ```
33
+
34
+ PyTorch's CUDA memory allocator holds onto GPU memory even after `model.to()`. The only way to truly free a GPU is to kill the process - losing your training progress.
35
+
36
+ ## The Solution
37
+
38
+ Flexium guarantees complete GPU release through driver-level migration (requires NVIDIA driver 580+):
39
+
40
+ ```python
41
+ import flexium.auto # Add this import
42
+
43
+ with flexium.auto.run(): # Wrap your training
44
+ model = Net().cuda() # Standard PyTorch - no changes needed!
45
+ for epoch in range(100):
46
+ for batch in dataloader:
47
+ # ... your normal training code ...
48
+ ```
49
+
50
+ Now you can migrate anytime:
51
+
52
+ ```bash
53
+ flexium-ctl migrate <process-id> cuda:1 # Training continues on new GPU
54
+ ```
55
+
56
+ The GPU is **100% freed** - your colleague can use it immediately.
57
+
58
+ ## Features
59
+
60
+ - **Zero code changes** - Just wrap your training in `flexium.auto.run()`
61
+ - **True GPU freedom** - Guarantees zero memory residue on source GPU
62
+ - **Live migration** - Move training between GPUs without losing progress
63
+ - **Automatic state management** - Training state preserved transparently
64
+ - **Web dashboard** - Visual GPU management at `http://localhost:8080`
65
+ - **CLI control** - `flexium-ctl` for scripting and automation
66
+ - **Pause/Resume** - Free GPU completely, resume later on any available GPU
67
+ - **Error recovery** - Auto-migrate on OOM, ECC errors, or hardware failures
68
+ - **Graceful degradation** - Training continues even if orchestrator dies
69
+
70
+ ## Installation
71
+
72
+ ```bash
73
+ # From PyPI (coming soon)
74
+ pip install flexium
75
+
76
+ # From source
77
+ git clone https://github.com/your-org/flexium.git
78
+ cd flexium
79
+ pip install -e .
80
+ ```
81
+
82
+ ### Requirements
83
+
84
+ - Python 3.8+
85
+ - PyTorch 2.0+ with CUDA support ([install guide](https://pytorch.org/get-started/locally/))
86
+ - NVIDIA GPU with CUDA support
87
+ - **NVIDIA Driver 580+** (required for zero-residue migration)
88
+ - Linux x86_64 (Windows/macOS not yet supported)
89
+
90
+ ## Quick Start
91
+
92
+ ### 1. Start the Orchestrator
93
+
94
+ ```bash
95
+ flexium-ctl server --dashboard
96
+ ```
97
+
98
+ This starts:
99
+ - gRPC server on port 50051 (coordinates migrations)
100
+ - Web dashboard at http://localhost:8080
101
+
102
+ ### 2. Add Flexium to Your Training
103
+
104
+ ```python
105
+ import flexium.auto
106
+ import torch
107
+
108
+ with flexium.auto.run():
109
+ model = MyModel().cuda()
110
+ optimizer = torch.optim.Adam(model.parameters())
111
+
112
+ for epoch in range(100):
113
+ for batch in dataloader:
114
+ loss = model(batch.cuda()).mean()
115
+ loss.backward()
116
+ optimizer.step()
117
+ ```
118
+
119
+ That's it! Your training is now migratable.
120
+
121
+ ### 3. Migrate When Needed
122
+
123
+ **Via Dashboard:**
124
+ Open http://localhost:8080, find your process, click "Migrate", select target GPU.
125
+
126
+ **Via CLI:**
127
+ ```bash
128
+ # List running processes
129
+ flexium-ctl list
130
+
131
+ # Migrate to different GPU
132
+ flexium-ctl migrate <process-id> cuda:1
133
+
134
+ # Pause (free GPU completely)
135
+ flexium-ctl pause <process-id>
136
+
137
+ # Resume on any available GPU
138
+ flexium-ctl resume <process-id>
139
+ ```
140
+
141
+ ## Real-World Scenarios
142
+
143
+ ### GPU Contention
144
+ ```bash
145
+ # Alice is training on cuda:0, Bob needs it urgently
146
+ flexium-ctl migrate alice-abc123 cuda:2
147
+ # Alice's training continues on cuda:2, cuda:0 is free for Bob
148
+ ```
149
+
150
+ ### OOM Recovery
151
+ ```python
152
+ with flexium.auto.run():
153
+ # If OOM occurs, flexium auto-migrates to GPU with more VRAM
154
+ model = LargeModel().cuda()
155
+ train(model)
156
+ ```
157
+
158
+ ### Shared Cluster Management
159
+ ```bash
160
+ # See all jobs across the cluster
161
+ flexium-ctl list
162
+
163
+ # Dashboard shows: who's using what, progress, VRAM usage
164
+ # One-click migration when someone needs a GPU
165
+ ```
166
+
167
+ ### Hardware Failure
168
+ ```python
169
+ with flexium.auto.run():
170
+ # ECC error? Driver crash? Auto-migrate to healthy GPU
171
+ # Bad GPU marked unhealthy, won't be used again
172
+ train(model)
173
+ ```
174
+
175
+ ## How It Works
176
+
177
+ Flexium uses **driver-level GPU migration** (requires NVIDIA driver 580+) that guarantees complete GPU memory release - something that's impossible with standard PyTorch memory management.
178
+
179
+ When you request a migration:
180
+ 1. GPU state is checkpointed at the driver level
181
+ 2. Source GPU memory is completely released (0 MB residue)
182
+ 3. State is restored on the target GPU
183
+ 4. Training continues seamlessly
184
+
185
+ The orchestrator coordinates all GPU resources across your cluster, tracking utilization, health status, and process state to enable intelligent scheduling and automatic recovery.
186
+
187
+ ## Documentation
188
+
189
+ - [Getting Started Guide](docs/getting-started.md)
190
+ - [API Reference](docs/api.md)
191
+ - [Architecture Overview](docs/ARCHITECTURE.md)
192
+ - [Examples](docs/examples.md)
193
+ - [Troubleshooting](docs/troubleshooting.md)
194
+
195
+ ## Benchmarks
196
+
197
+ Migration overhead is minimal:
198
+
199
+ | Model Size | Checkpoint Time | Total Migration |
200
+ |------------|-----------------|-----------------|
201
+ | 100 MB | ~200 ms | ~700 ms |
202
+ | 500 MB | ~800 ms | ~2 sec |
203
+ | 1 GB | ~1.5 sec | ~4 sec |
204
+
205
+ Runtime overhead during normal training: **< 2%**
206
+
207
+ ## Configuration
208
+
209
+ ### Environment Variables
210
+ ```bash
211
+ export GPU_ORCHESTRATOR=localhost:50051
212
+ export GPU_DEVICE=cuda:0
213
+ ```
214
+
215
+ ### Config File (`~/.flexiumrc`)
216
+ ```yaml
217
+ orchestrator: localhost:50051
218
+ device: cuda:0
219
+ ```
220
+
221
+ ### Inline Parameters
222
+ ```python
223
+ with flexium.auto.run(orchestrator="server:50051", device="cuda:1"):
224
+ ...
225
+ ```
226
+
227
+ ## CLI Reference
228
+
229
+ ```bash
230
+ # Server management
231
+ flexium-ctl server [--port PORT] [--dashboard]
232
+
233
+ # Process management
234
+ flexium-ctl list # List all processes
235
+ flexium-ctl status <process-id> # Get process details
236
+ flexium-ctl migrate <process-id> <device> # Migrate to device
237
+ flexium-ctl pause <process-id> # Pause (free GPU)
238
+ flexium-ctl resume <process-id> [device] # Resume training
239
+
240
+ # GPU management
241
+ flexium-ctl devices # List all GPUs
242
+ flexium-ctl device <gpu-uuid> # Get GPU details
243
+ ```
244
+
245
+ ## Contributing
246
+
247
+ We welcome contributions! Please see our contributing guidelines (coming soon).
248
+
249
+ ```bash
250
+ # Development setup
251
+ git clone https://github.com/flexiumai/flexium.git
252
+ cd flexium
253
+ pip install -e ".[dev]"
254
+
255
+ # Run tests
256
+ pytest tests/ -v
257
+
258
+ # Run linting
259
+ ruff check flexium/
260
+ mypy flexium/
261
+ ```
262
+
263
+ ## License
264
+
265
+ MIT License - see [LICENSE](LICENSE) for details.
266
+
267
+ ## Acknowledgments
268
+
269
+ Flexium.AI was inspired by the challenges of managing shared GPU clusters in research environments. Special thanks to everyone who provided feedback and tested early versions.
270
+
271
+ ---
272
+
273
+ <p align="center">
274
+ <img src="logo_with_text.png" alt="Flexium.AI" width="150"><br>
275
+ <strong>Flexium.AI</strong> - Flexible Resource Allocation<br>
276
+ <em>Making GPU management effortless</em>
277
+ </p>
278
+
279
+ **Note:** Flexium.AI is currently in alpha. APIs may change. Please report issues on GitHub.
@@ -0,0 +1,39 @@
1
+ """Flexium - Dynamic GPU orchestration for PyTorch training.
2
+
3
+ This package provides GPU orchestration with live migration support using
4
+ driver-level migration for zero-residue GPU migration (requires driver 580+).
5
+
6
+ Usage:
7
+
8
+ import flexium.auto
9
+
10
+ with flexium.auto.run():
11
+ model = nn.Linear(784, 10).cuda()
12
+ optimizer = torch.optim.Adam(model.parameters())
13
+
14
+ for epoch in range(100):
15
+ for batch in dataloader:
16
+ data, target = batch[0].cuda(), batch[1].cuda()
17
+ loss = model(data).sum()
18
+ loss.backward()
19
+ optimizer.step()
20
+
21
+ Configuration:
22
+ - Set GPU_ORCHESTRATOR environment variable to orchestrator address
23
+ - Or pass orchestrator= parameter to flexium.auto.run()
24
+
25
+ Migration is transparent - training continues in the same process,
26
+ same loop iteration, just on a different GPU.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ __version__ = "0.1.0"
32
+
33
+ # The main API is flexium.auto.run()
34
+ # Import with: import flexium.auto
35
+ # Usage: with flexium.auto.run(): ...
36
+
37
+ __all__ = [
38
+ "__version__",
39
+ ]