alloc 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alloc-0.0.1/PKG-INFO +256 -0
- alloc-0.0.1/README.md +226 -0
- alloc-0.0.1/pyproject.toml +47 -0
- alloc-0.0.1/setup.cfg +4 -0
- alloc-0.0.1/src/alloc/__init__.py +11 -0
- alloc-0.0.1/src/alloc/artifact_writer.py +67 -0
- alloc-0.0.1/src/alloc/callbacks.py +342 -0
- alloc-0.0.1/src/alloc/catalog/__init__.py +138 -0
- alloc-0.0.1/src/alloc/catalog/default_rate_card.json +18 -0
- alloc-0.0.1/src/alloc/catalog/gpus.v1.json +174 -0
- alloc-0.0.1/src/alloc/cli.py +1341 -0
- alloc-0.0.1/src/alloc/config.py +124 -0
- alloc-0.0.1/src/alloc/context.py +191 -0
- alloc-0.0.1/src/alloc/display.py +580 -0
- alloc-0.0.1/src/alloc/extractor_runner.py +141 -0
- alloc-0.0.1/src/alloc/ghost.py +167 -0
- alloc-0.0.1/src/alloc/model_extractor.py +170 -0
- alloc-0.0.1/src/alloc/model_registry.py +138 -0
- alloc-0.0.1/src/alloc/probe.py +461 -0
- alloc-0.0.1/src/alloc/stability.py +144 -0
- alloc-0.0.1/src/alloc/upload.py +138 -0
- alloc-0.0.1/src/alloc/yaml_config.py +287 -0
- alloc-0.0.1/src/alloc.egg-info/PKG-INFO +256 -0
- alloc-0.0.1/src/alloc.egg-info/SOURCES.txt +41 -0
- alloc-0.0.1/src/alloc.egg-info/dependency_links.txt +1 -0
- alloc-0.0.1/src/alloc.egg-info/entry_points.txt +2 -0
- alloc-0.0.1/src/alloc.egg-info/requires.txt +12 -0
- alloc-0.0.1/src/alloc.egg-info/top_level.txt +1 -0
- alloc-0.0.1/tests/test_artifact.py +128 -0
- alloc-0.0.1/tests/test_auth.py +155 -0
- alloc-0.0.1/tests/test_callbacks.py +330 -0
- alloc-0.0.1/tests/test_catalog.py +83 -0
- alloc-0.0.1/tests/test_cli.py +205 -0
- alloc-0.0.1/tests/test_context.py +135 -0
- alloc-0.0.1/tests/test_ghost.py +82 -0
- alloc-0.0.1/tests/test_init_from_org.py +98 -0
- alloc-0.0.1/tests/test_model_extractor.py +232 -0
- alloc-0.0.1/tests/test_probe_hw.py +83 -0
- alloc-0.0.1/tests/test_probe_multi.py +114 -0
- alloc-0.0.1/tests/test_stability.py +173 -0
- alloc-0.0.1/tests/test_upload.py +157 -0
- alloc-0.0.1/tests/test_verdict.py +187 -0
- alloc-0.0.1/tests/test_yaml_config.py +215 -0
alloc-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: alloc
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
5
|
+
Author-email: Alloc Labs <hello@alloclabs.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://alloclabs.com
|
|
8
|
+
Project-URL: Repository, https://github.com/alloc-labs/alloc
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.8
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: typer>=0.9.0
|
|
21
|
+
Requires-Dist: rich>=13.0.0
|
|
22
|
+
Requires-Dist: httpx>=0.24.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Requires-Dist: pyyaml>=6.0
|
|
25
|
+
Provides-Extra: gpu
|
|
26
|
+
Requires-Dist: pynvml>=11.5.0; extra == "gpu"
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
30
|
+
|
|
31
|
+
# alloc (by [Alloc Labs](https://www.alloclabs.com))
|
|
32
|
+
|
|
33
|
+
Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
34
|
+
|
|
35
|
+
[](https://www.alloclabs.com)
|
|
36
|
+
[](https://pypi.org/project/alloc/)
|
|
37
|
+
[](LICENSE)
|
|
38
|
+
|
|
39
|
+
> Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.
|
|
40
|
+
|
|
41
|
+
## What Alloc Does
|
|
42
|
+
|
|
43
|
+
Most ML teams waste spend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:
|
|
44
|
+
|
|
45
|
+
- **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
|
|
46
|
+
- **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
|
|
47
|
+
- **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)
|
|
48
|
+
|
|
49
|
+
Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.
|
|
50
|
+
|
|
51
|
+
## Who This Is For
|
|
52
|
+
|
|
53
|
+
- **Solo engineers** who want a fast sanity check before burning GPU time
|
|
54
|
+
- **ML teams** who need repeatable right-sizing and bottleneck visibility
|
|
55
|
+
- **Platform/infra leads** who want budget-aware controls without rewriting training code
|
|
56
|
+
|
|
57
|
+
## Why It Is Low Friction
|
|
58
|
+
|
|
59
|
+
- **No code changes required** for baseline value (`alloc run`)
|
|
60
|
+
- **Optional deeper integration** via callbacks when you want richer timing signals
|
|
61
|
+
- **Local-first artifacts** so users still get value without cloud connectivity
|
|
62
|
+
- **Progressive adoption** from local CLI to team workflows and governance
|
|
63
|
+
|
|
64
|
+
## Install
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install alloc
|
|
68
|
+
|
|
69
|
+
# With GPU monitoring support (NVML via pynvml)
|
|
70
|
+
pip install alloc[gpu]
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Notes:
|
|
74
|
+
- `alloc` does not depend on torch. If you want `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment, otherwise use `--param-count-b`.
|
|
75
|
+
- `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.
|
|
76
|
+
|
|
77
|
+
## Commands
|
|
78
|
+
|
|
79
|
+
### `alloc scan`: Remote Ghost Scan (no GPU needed)
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
alloc scan --model llama-3-70b --gpu A100-80GB
|
|
83
|
+
alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
|
|
84
|
+
alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16
|
|
85
|
+
|
|
86
|
+
# Objective + budget constraints
|
|
87
|
+
alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12
|
|
88
|
+
|
|
89
|
+
# Topology hints (optional, improves planner quality)
|
|
90
|
+
alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### `alloc ghost`: Local VRAM estimation
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
alloc ghost train.py --dtype bf16 --batch-size 32
|
|
97
|
+
alloc ghost train.py --param-count-b 7.0 # manual override
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.
|
|
101
|
+
|
|
102
|
+
### `alloc run`: Training with GPU monitoring
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
alloc run python train.py # calibrate and exit (default)
|
|
106
|
+
alloc run --full python train.py # monitor full training run
|
|
107
|
+
alloc run torchrun --nproc_per_node=4 train.py
|
|
108
|
+
alloc run -- python train.py --epochs 10
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.
|
|
112
|
+
|
|
113
|
+
**Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.
|
|
114
|
+
|
|
115
|
+
**Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).
|
|
116
|
+
|
|
117
|
+
**Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.
|
|
118
|
+
|
|
119
|
+
### `alloc login`: Authenticate with dashboard
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
alloc login
|
|
123
|
+
# Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
|
|
124
|
+
|
|
125
|
+
alloc login --token <ACCESS_TOKEN>
|
|
126
|
+
# Paste an access token from the dashboard (no password prompt)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### `alloc whoami`: Show current auth + org context
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
alloc whoami
|
|
133
|
+
alloc whoami --json
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
|
|
137
|
+
|
|
138
|
+
### `alloc logout`: Clear local session
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
alloc logout
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
|
|
145
|
+
|
|
146
|
+
### `alloc upload`: Upload artifact to dashboard
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
alloc upload alloc_artifact.json.gz
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
|
|
153
|
+
|
|
154
|
+
If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
|
|
155
|
+
|
|
156
|
+
### `alloc catalog`: Browse GPU hardware catalog
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
alloc catalog list # list all 13 GPUs (sorted by VRAM)
|
|
160
|
+
alloc catalog list --sort cost # sort by $/hr
|
|
161
|
+
alloc catalog list --sort tflops # sort by BF16 TFLOPS
|
|
162
|
+
alloc catalog show H100 # detailed specs for H100
|
|
163
|
+
alloc catalog show nvidia-a100-sxm-80gb # lookup by stable ID
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.
|
|
167
|
+
|
|
168
|
+
### `alloc init`: Configure GPU fleet and budget
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
alloc init # interactive wizard
|
|
172
|
+
alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
|
|
173
|
+
alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
|
|
177
|
+
|
|
178
|
+
### `alloc version`
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
alloc version
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## Python API
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
import alloc
|
|
188
|
+
|
|
189
|
+
# Static VRAM analysis (never crashes your training)
|
|
190
|
+
report = alloc.ghost(model)
|
|
191
|
+
print(report.total_gb) # e.g., 115.42
|
|
192
|
+
|
|
193
|
+
# Or from param count (no torch needed)
|
|
194
|
+
report = alloc.ghost(param_count_b=7.0, dtype="bf16")
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## Framework Callbacks
|
|
198
|
+
|
|
199
|
+
Optional callbacks for deeper profiling. Captures step-level timing, throughput, and dataloader wait estimates.
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
# HuggingFace Transformers
|
|
203
|
+
from alloc import HuggingFaceCallback
|
|
204
|
+
trainer = Trainer(..., callbacks=[HuggingFaceCallback()])
|
|
205
|
+
|
|
206
|
+
# PyTorch Lightning
|
|
207
|
+
from alloc import LightningCallback
|
|
208
|
+
trainer = Trainer(..., callbacks=[LightningCallback()])
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), samples/sec, and estimated dataloader wait %. This unlocks higher confidence analysis and dataloader bottleneck detection.
|
|
212
|
+
|
|
213
|
+
## Configuration
|
|
214
|
+
|
|
215
|
+
Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.
|
|
216
|
+
|
|
217
|
+
| Variable | Default | Description |
|
|
218
|
+
|----------|---------|-------------|
|
|
219
|
+
| `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
|
|
220
|
+
| `ALLOC_TOKEN` | (empty) | Auth token for API calls |
|
|
221
|
+
| `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
|
|
222
|
+
| `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
|
|
223
|
+
| `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |
|
|
224
|
+
|
|
225
|
+
## Architecture
|
|
226
|
+
|
|
227
|
+
| Module | Purpose |
|
|
228
|
+
|--------|---------|
|
|
229
|
+
| `ghost.py` | VRAM estimation from parameter count. Computes weights + gradients + optimizer + activations + buffer breakdown. |
|
|
230
|
+
| `model_extractor.py` | Three-method model discovery: subprocess execution (`nn.Module` finder), AST parsing (`from_pretrained`), manual override. |
|
|
231
|
+
| `probe.py` | External GPU monitoring via `pynvml`. Process-tree aware multi-GPU discovery. Captures hardware context (driver, CUDA, SM version). |
|
|
232
|
+
| `stability.py` | Multi-signal stability detection for calibrate-and-exit (VRAM plateau + util std dev + power std dev). |
|
|
233
|
+
| `catalog/` | Bundled GPU hardware catalog (13 GPUs) with specs and pricing. Powers `alloc catalog` commands. |
|
|
234
|
+
| `context.py` | Context autodiscovery: git (SHA, branch, repo), container (Docker/Podman), Ray (job ID, cluster). |
|
|
235
|
+
| `artifact_writer.py` | Artifact Writer: writes `alloc_artifact.json.gz` with probe, ghost, hardware, and context sections. |
|
|
236
|
+
| `cli.py` | Typer CLI with `ghost`, `run`, `scan`, `login`, `upload`, `init`, `catalog`, `version` commands. |
|
|
237
|
+
| `yaml_config.py` | `.alloc.yaml` parser: fleet, explore, priority, budget. Loaded automatically by `ghost`, `run`, `scan`. |
|
|
238
|
+
| `callbacks.py` | Framework callbacks: HuggingFace `TrainerCallback` and Lightning `Callback` with step timing (p50/p90), throughput, and dataloader wait estimation. |
|
|
239
|
+
| `upload.py` | Artifact uploader: POSTs `.json.gz` to `POST /runs/ingest`. |
|
|
240
|
+
| `display.py` | Rich terminal formatting for reports. |
|
|
241
|
+
| `config.py` | Env-var-only configuration (API URL, Supabase URL, token storage). |
|
|
242
|
+
|
|
243
|
+
## Design Principles
|
|
244
|
+
|
|
245
|
+
1. **Zero config**: `alloc run python train.py` works out of the box
|
|
246
|
+
2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
|
|
247
|
+
3. **Never crash user's training**: All Alloc failures are caught and training continues
|
|
248
|
+
4. **Progressive disclosure**: Individual use first, team governance later
|
|
249
|
+
|
|
250
|
+
## Telemetry Levels
|
|
251
|
+
|
|
252
|
+
Alloc intentionally starts non-invasive and adds richer signals only when you opt in.
|
|
253
|
+
|
|
254
|
+
- **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
|
|
255
|
+
- **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
|
|
256
|
+
- **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.
|
alloc-0.0.1/README.md
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# alloc (by [Alloc Labs](https://www.alloclabs.com))
|
|
2
|
+
|
|
3
|
+
Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
4
|
+
|
|
5
|
+
[](https://www.alloclabs.com)
|
|
6
|
+
[](https://pypi.org/project/alloc/)
|
|
7
|
+
[](LICENSE)
|
|
8
|
+
|
|
9
|
+
> Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.
|
|
10
|
+
|
|
11
|
+
## What Alloc Does
|
|
12
|
+
|
|
13
|
+
Most ML teams waste spend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:
|
|
14
|
+
|
|
15
|
+
- **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
|
|
16
|
+
- **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
|
|
17
|
+
- **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)
|
|
18
|
+
|
|
19
|
+
Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.
|
|
20
|
+
|
|
21
|
+
## Who This Is For
|
|
22
|
+
|
|
23
|
+
- **Solo engineers** who want a fast sanity check before burning GPU time
|
|
24
|
+
- **ML teams** who need repeatable right-sizing and bottleneck visibility
|
|
25
|
+
- **Platform/infra leads** who want budget-aware controls without rewriting training code
|
|
26
|
+
|
|
27
|
+
## Why It Is Low Friction
|
|
28
|
+
|
|
29
|
+
- **No code changes required** for baseline value (`alloc run`)
|
|
30
|
+
- **Optional deeper integration** via callbacks when you want richer timing signals
|
|
31
|
+
- **Local-first artifacts** so users still get value without cloud connectivity
|
|
32
|
+
- **Progressive adoption** from local CLI to team workflows and governance
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install alloc
|
|
38
|
+
|
|
39
|
+
# With GPU monitoring support (NVML via pynvml)
|
|
40
|
+
pip install alloc[gpu]
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Notes:
|
|
44
|
+
- `alloc` does not depend on torch. If you want `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment, otherwise use `--param-count-b`.
|
|
45
|
+
- `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.
|
|
46
|
+
|
|
47
|
+
## Commands
|
|
48
|
+
|
|
49
|
+
### `alloc scan`: Remote Ghost Scan (no GPU needed)
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
alloc scan --model llama-3-70b --gpu A100-80GB
|
|
53
|
+
alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
|
|
54
|
+
alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16
|
|
55
|
+
|
|
56
|
+
# Objective + budget constraints
|
|
57
|
+
alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12
|
|
58
|
+
|
|
59
|
+
# Topology hints (optional, improves planner quality)
|
|
60
|
+
alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### `alloc ghost`: Local VRAM estimation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
alloc ghost train.py --dtype bf16 --batch-size 32
|
|
67
|
+
alloc ghost train.py --param-count-b 7.0 # manual override
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.
|
|
71
|
+
|
|
72
|
+
### `alloc run`: Training with GPU monitoring
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
alloc run python train.py # calibrate and exit (default)
|
|
76
|
+
alloc run --full python train.py # monitor full training run
|
|
77
|
+
alloc run torchrun --nproc_per_node=4 train.py
|
|
78
|
+
alloc run -- python train.py --epochs 10
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.
|
|
82
|
+
|
|
83
|
+
**Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.
|
|
84
|
+
|
|
85
|
+
**Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).
|
|
86
|
+
|
|
87
|
+
**Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.
|
|
88
|
+
|
|
89
|
+
### `alloc login`: Authenticate with dashboard
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
alloc login
|
|
93
|
+
# Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
|
|
94
|
+
|
|
95
|
+
alloc login --token <ACCESS_TOKEN>
|
|
96
|
+
# Paste an access token from the dashboard (no password prompt)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### `alloc whoami`: Show current auth + org context
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
alloc whoami
|
|
103
|
+
alloc whoami --json
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
|
|
107
|
+
|
|
108
|
+
### `alloc logout`: Clear local session
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
alloc logout
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
|
|
115
|
+
|
|
116
|
+
### `alloc upload`: Upload artifact to dashboard
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
alloc upload alloc_artifact.json.gz
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
|
|
123
|
+
|
|
124
|
+
If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
|
|
125
|
+
|
|
126
|
+
### `alloc catalog`: Browse GPU hardware catalog
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
alloc catalog list # list all 13 GPUs (sorted by VRAM)
|
|
130
|
+
alloc catalog list --sort cost # sort by $/hr
|
|
131
|
+
alloc catalog list --sort tflops # sort by BF16 TFLOPS
|
|
132
|
+
alloc catalog show H100 # detailed specs for H100
|
|
133
|
+
alloc catalog show nvidia-a100-sxm-80gb # lookup by stable ID
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.
|
|
137
|
+
|
|
138
|
+
### `alloc init`: Configure GPU fleet and budget
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
alloc init # interactive wizard
|
|
142
|
+
alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
|
|
143
|
+
alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
|
|
147
|
+
|
|
148
|
+
### `alloc version`
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
alloc version
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Python API
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
import alloc
|
|
158
|
+
|
|
159
|
+
# Static VRAM analysis (never crashes your training)
|
|
160
|
+
report = alloc.ghost(model)
|
|
161
|
+
print(report.total_gb) # e.g., 115.42
|
|
162
|
+
|
|
163
|
+
# Or from param count (no torch needed)
|
|
164
|
+
report = alloc.ghost(param_count_b=7.0, dtype="bf16")
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Framework Callbacks
|
|
168
|
+
|
|
169
|
+
Optional callbacks for deeper profiling. Captures step-level timing, throughput, and dataloader wait estimates.
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
# HuggingFace Transformers
|
|
173
|
+
from alloc import HuggingFaceCallback
|
|
174
|
+
trainer = Trainer(..., callbacks=[HuggingFaceCallback()])
|
|
175
|
+
|
|
176
|
+
# PyTorch Lightning
|
|
177
|
+
from alloc import LightningCallback
|
|
178
|
+
trainer = Trainer(..., callbacks=[LightningCallback()])
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), samples/sec, and estimated dataloader wait %. This unlocks higher confidence analysis and dataloader bottleneck detection.
|
|
182
|
+
|
|
183
|
+
## Configuration
|
|
184
|
+
|
|
185
|
+
Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.
|
|
186
|
+
|
|
187
|
+
| Variable | Default | Description |
|
|
188
|
+
|----------|---------|-------------|
|
|
189
|
+
| `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
|
|
190
|
+
| `ALLOC_TOKEN` | (empty) | Auth token for API calls |
|
|
191
|
+
| `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
|
|
192
|
+
| `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
|
|
193
|
+
| `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |
|
|
194
|
+
|
|
195
|
+
## Architecture
|
|
196
|
+
|
|
197
|
+
| Module | Purpose |
|
|
198
|
+
|--------|---------|
|
|
199
|
+
| `ghost.py` | VRAM estimation from parameter count. Computes weights + gradients + optimizer + activations + buffer breakdown. |
|
|
200
|
+
| `model_extractor.py` | Three-method model discovery: subprocess execution (`nn.Module` finder), AST parsing (`from_pretrained`), manual override. |
|
|
201
|
+
| `probe.py` | External GPU monitoring via `pynvml`. Process-tree aware multi-GPU discovery. Captures hardware context (driver, CUDA, SM version). |
|
|
202
|
+
| `stability.py` | Multi-signal stability detection for calibrate-and-exit (VRAM plateau + util std dev + power std dev). |
|
|
203
|
+
| `catalog/` | Bundled GPU hardware catalog (13 GPUs) with specs and pricing. Powers `alloc catalog` commands. |
|
|
204
|
+
| `context.py` | Context autodiscovery: git (SHA, branch, repo), container (Docker/Podman), Ray (job ID, cluster). |
|
|
205
|
+
| `artifact_writer.py` | Artifact Writer: writes `alloc_artifact.json.gz` with probe, ghost, hardware, and context sections. |
|
|
206
|
+
| `cli.py` | Typer CLI with `ghost`, `run`, `scan`, `login`, `upload`, `init`, `catalog`, `version` commands. |
|
|
207
|
+
| `yaml_config.py` | `.alloc.yaml` parser: fleet, explore, priority, budget. Loaded automatically by `ghost`, `run`, `scan`. |
|
|
208
|
+
| `callbacks.py` | Framework callbacks: HuggingFace `TrainerCallback` and Lightning `Callback` with step timing (p50/p90), throughput, and dataloader wait estimation. |
|
|
209
|
+
| `upload.py` | Artifact uploader: POSTs `.json.gz` to `POST /runs/ingest`. |
|
|
210
|
+
| `display.py` | Rich terminal formatting for reports. |
|
|
211
|
+
| `config.py` | Env-var-only configuration (API URL, Supabase URL, token storage). |
|
|
212
|
+
|
|
213
|
+
## Design Principles
|
|
214
|
+
|
|
215
|
+
1. **Zero config**: `alloc run python train.py` works out of the box
|
|
216
|
+
2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
|
|
217
|
+
3. **Never crash user's training**: All Alloc failures are caught and training continues
|
|
218
|
+
4. **Progressive disclosure**: Individual use first, team governance later
|
|
219
|
+
|
|
220
|
+
## Telemetry Levels
|
|
221
|
+
|
|
222
|
+
Alloc intentionally starts non-invasive and adds richer signals only when you opt in.
|
|
223
|
+
|
|
224
|
+
- **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
|
|
225
|
+
- **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
|
|
226
|
+
- **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.
|
|
alloc-0.0.1/pyproject.toml
ADDED
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "alloc"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
authors = [{name = "Alloc Labs", email = "hello@alloclabs.com"}]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 3 - Alpha",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.8",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"typer>=0.9.0",
|
|
26
|
+
"rich>=13.0.0",
|
|
27
|
+
"httpx>=0.24.0",
|
|
28
|
+
"pydantic>=2.0.0",
|
|
29
|
+
"pyyaml>=6.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
gpu = ["pynvml>=11.5.0"]
|
|
34
|
+
dev = ["pytest>=7.0.0", "pytest-cov>=4.0.0"]
|
|
35
|
+
|
|
36
|
+
[project.scripts]
|
|
37
|
+
alloc = "alloc.cli:app"
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
Homepage = "https://alloclabs.com"
|
|
41
|
+
Repository = "https://github.com/alloc-labs/alloc"
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.packages.find]
|
|
44
|
+
where = ["src"]
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.package-data]
|
|
47
|
+
"alloc.catalog" = ["*.json"]
|
alloc-0.0.1/src/alloc/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Alloc — GPU intelligence for ML training."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__version__ = "0.0.1"
|
|
6
|
+
|
|
7
|
+
from alloc.ghost import ghost, GhostReport
|
|
8
|
+
from alloc.callbacks import AllocCallback as HuggingFaceCallback
|
|
9
|
+
from alloc.callbacks import AllocLightningCallback as LightningCallback
|
|
10
|
+
|
|
11
|
+
__all__ = ["ghost", "GhostReport", "HuggingFaceCallback", "LightningCallback", "__version__"]
|
|
alloc-0.0.1/src/alloc/artifact_writer.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Artifact Writer — write alloc_artifact.json.gz.
|
|
2
|
+
|
|
3
|
+
Optionally uploads to W&B if wandb is active.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import gzip
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def write_report(
|
|
16
|
+
ghost_report: Optional[dict] = None,
|
|
17
|
+
probe_result: Optional[dict] = None,
|
|
18
|
+
output_path: Optional[str] = None,
|
|
19
|
+
hardware_context: Optional[dict] = None,
|
|
20
|
+
context: Optional[dict] = None,
|
|
21
|
+
) -> str:
|
|
22
|
+
"""Write an artifact to disk.
|
|
23
|
+
|
|
24
|
+
Resolution order for output path:
|
|
25
|
+
1. Explicit output_path parameter
|
|
26
|
+
2. ALLOC_OUT env var
|
|
27
|
+
3. ./alloc_artifact.json.gz
|
|
28
|
+
|
|
29
|
+
Returns the path written to. Never raises.
|
|
30
|
+
"""
|
|
31
|
+
try:
|
|
32
|
+
resolved_path = (
|
|
33
|
+
output_path
|
|
34
|
+
or os.environ.get("ALLOC_OUT", "")
|
|
35
|
+
or "alloc_artifact.json.gz"
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
report = {
|
|
39
|
+
"version": "0.0.1",
|
|
40
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
41
|
+
"ghost": ghost_report,
|
|
42
|
+
"probe": probe_result,
|
|
43
|
+
"hardware": hardware_context,
|
|
44
|
+
"context": context if context else None,
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
with gzip.open(resolved_path, "wt", encoding="utf-8") as f:
|
|
48
|
+
json.dump(report, f, indent=2)
|
|
49
|
+
|
|
50
|
+
_try_wandb_upload(resolved_path)
|
|
51
|
+
return resolved_path
|
|
52
|
+
except Exception:
|
|
53
|
+
return ""
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _try_wandb_upload(path: str) -> None:
|
|
57
|
+
"""Upload to W&B if wandb is active. Silent no-op otherwise."""
|
|
58
|
+
if not os.environ.get("WANDB_RUN_ID"):
|
|
59
|
+
return
|
|
60
|
+
try:
|
|
61
|
+
import wandb
|
|
62
|
+
if wandb.run is not None:
|
|
63
|
+
artifact = wandb.Artifact("alloc-profile", type="profile")
|
|
64
|
+
artifact.add_file(path)
|
|
65
|
+
wandb.run.log_artifact(artifact)
|
|
66
|
+
except Exception:
|
|
67
|
+
pass
|