alloc 0.3.1__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {alloc-0.3.1 → alloc-0.5.0}/PKG-INFO +70 -41
- {alloc-0.3.1 → alloc-0.5.0}/README.md +67 -38
- {alloc-0.3.1 → alloc-0.5.0}/pyproject.toml +3 -3
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/__init__.py +1 -1
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/callbacks.py +69 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/catalog/__init__.py +29 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/cli.py +576 -89
- alloc-0.5.0/src/alloc/config.py +124 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/display.py +33 -7
- alloc-0.5.0/src/alloc/extractor_runner.py +141 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/ghost.py +9 -2
- alloc-0.5.0/src/alloc/model_extractor.py +170 -0
- alloc-0.5.0/src/alloc/model_registry.py +138 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/probe.py +53 -41
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/upload.py +8 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/yaml_config.py +51 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/PKG-INFO +70 -41
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/SOURCES.txt +4 -0
- alloc-0.5.0/tests/test_auth.py +155 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_callbacks.py +103 -6
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_cli.py +3 -5
- alloc-0.5.0/tests/test_init_from_org.py +98 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_verdict.py +1 -1
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_yaml_config.py +2 -0
- alloc-0.3.1/src/alloc/config.py +0 -65
- alloc-0.3.1/src/alloc/model_extractor.py +0 -332
- {alloc-0.3.1 → alloc-0.5.0}/setup.cfg +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/artifact_writer.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/catalog/default_rate_card.json +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/catalog/gpus.v1.json +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/context.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc/stability.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/dependency_links.txt +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/entry_points.txt +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/requires.txt +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/top_level.txt +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_artifact.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_catalog.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_context.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_ghost.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_model_extractor.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_probe_hw.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_probe_multi.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_stability.py +0 -0
- {alloc-0.3.1 → alloc-0.5.0}/tests/test_upload.py +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: alloc
|
|
3
|
-
Version: 0.
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
5
5
|
Author-email: Alloc Labs <hello@alloclabs.com>
|
|
6
|
-
License: Apache-2.0
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
7
|
Project-URL: Homepage, https://alloclabs.com
|
|
8
8
|
Project-URL: Repository, https://github.com/alloc-labs/alloc
|
|
9
9
|
Classifier: Development Status :: 3 - Alpha
|
|
@@ -28,28 +28,25 @@ Provides-Extra: dev
|
|
|
28
28
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
29
29
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
30
30
|
|
|
31
|
-
#
|
|
31
|
+
# alloc (by [Alloc Labs](https://www.alloclabs.com))
|
|
32
32
|
|
|
33
|
-
|
|
33
|
+
Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
34
34
|
|
|
35
35
|
[](https://www.alloclabs.com)
|
|
36
36
|
[](https://pypi.org/project/alloc/)
|
|
37
37
|
[](LICENSE)
|
|
38
38
|
|
|
39
|
-
> Built by [Alloc Labs](https://www.alloclabs.com)
|
|
39
|
+
> Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.
|
|
40
40
|
|
|
41
41
|
## What Alloc Does
|
|
42
42
|
|
|
43
|
-
Most ML teams
|
|
43
|
+
Most ML teams waste spend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:
|
|
44
44
|
|
|
45
|
-
|
|
45
|
+
- **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
|
|
46
|
+
- **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
|
|
47
|
+
- **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)
|
|
46
48
|
|
|
47
|
-
|
|
48
|
-
- **Live calibration** — run briefly, collect real utilization/timing signals, then stop (`alloc run`)
|
|
49
|
-
- **Run intelligence** — upload artifacts for cost-aware analysis and proposals (`alloc upload`)
|
|
50
|
-
- **Policy path** — move from single-user optimization to team budget/governance over time
|
|
51
|
-
|
|
52
|
-
Works with PyTorch, HuggingFace, Lightning, and launcher flows such as `python`, `torchrun`, and `accelerate`. Local profiling works without outbound internet.
|
|
49
|
+
Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.
|
|
53
50
|
|
|
54
51
|
## Who This Is For
|
|
55
52
|
|
|
@@ -69,21 +66,31 @@ Works with PyTorch, HuggingFace, Lightning, and launcher flows such as `python`,
|
|
|
69
66
|
```bash
|
|
70
67
|
pip install alloc
|
|
71
68
|
|
|
72
|
-
# With GPU monitoring support
|
|
69
|
+
# With GPU monitoring support (NVML via pynvml)
|
|
73
70
|
pip install alloc[gpu]
|
|
74
71
|
```
|
|
75
72
|
|
|
73
|
+
Notes:
|
|
74
|
+
- `alloc` does not depend on torch. For `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment; otherwise, use `--param-count-b`.
|
|
75
|
+
- `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.
|
|
76
|
+
|
|
76
77
|
## Commands
|
|
77
78
|
|
|
78
|
-
### `alloc scan
|
|
79
|
+
### `alloc scan`: Remote Ghost Scan (no GPU needed)
|
|
79
80
|
|
|
80
81
|
```bash
|
|
81
82
|
alloc scan --model llama-3-70b --gpu A100-80GB
|
|
82
83
|
alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
|
|
83
84
|
alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16
|
|
85
|
+
|
|
86
|
+
# Objective + budget constraints
|
|
87
|
+
alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12
|
|
88
|
+
|
|
89
|
+
# Topology hints (optional, improves planner quality)
|
|
90
|
+
alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
|
|
84
91
|
```
|
|
85
92
|
|
|
86
|
-
### `alloc ghost
|
|
93
|
+
### `alloc ghost`: Local VRAM estimation
|
|
87
94
|
|
|
88
95
|
```bash
|
|
89
96
|
alloc ghost train.py --dtype bf16 --batch-size 32
|
|
@@ -92,7 +99,7 @@ alloc ghost train.py --param-count-b 7.0 # manual override
|
|
|
92
99
|
|
|
93
100
|
Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.
|
|
94
101
|
|
|
95
|
-
### `alloc run
|
|
102
|
+
### `alloc run`: Training with GPU monitoring
|
|
96
103
|
|
|
97
104
|
```bash
|
|
98
105
|
alloc run python train.py # calibrate and exit (default)
|
|
@@ -103,20 +110,40 @@ alloc run -- python train.py --epochs 10
|
|
|
103
110
|
|
|
104
111
|
Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.
|
|
105
112
|
|
|
106
|
-
**Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize
|
|
113
|
+
**Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.
|
|
107
114
|
|
|
108
115
|
**Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).
|
|
109
116
|
|
|
110
117
|
**Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.
|
|
111
118
|
|
|
112
|
-
### `alloc login
|
|
119
|
+
### `alloc login`: Authenticate with dashboard
|
|
113
120
|
|
|
114
121
|
```bash
|
|
115
122
|
alloc login
|
|
116
|
-
# Prompts for email + password, stores token in ~/.alloc/config.json
|
|
123
|
+
# Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
|
|
124
|
+
|
|
125
|
+
alloc login --token <ACCESS_TOKEN>
|
|
126
|
+
# Paste an access token from the dashboard (no password prompt)
|
|
117
127
|
```
|
|
118
128
|
|
|
119
|
-
### `alloc
|
|
129
|
+
### `alloc whoami`: Show current auth + org context
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
alloc whoami
|
|
133
|
+
alloc whoami --json
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
|
|
137
|
+
|
|
138
|
+
### `alloc logout`: Clear local session
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
alloc logout
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
|
|
145
|
+
|
|
146
|
+
### `alloc upload`: Upload artifact to dashboard
|
|
120
147
|
|
|
121
148
|
```bash
|
|
122
149
|
alloc upload alloc_artifact.json.gz
|
|
@@ -124,7 +151,9 @@ alloc upload alloc_artifact.json.gz
|
|
|
124
151
|
|
|
125
152
|
Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
|
|
126
153
|
|
|
127
|
-
|
|
154
|
+
If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
|
|
155
|
+
|
|
156
|
+
### `alloc catalog`: Browse GPU hardware catalog
|
|
128
157
|
|
|
129
158
|
```bash
|
|
130
159
|
alloc catalog list # list all 13 GPUs (sorted by VRAM)
|
|
@@ -136,11 +165,12 @@ alloc catalog show nvidia-a100-sxm-80gb # lookup by stable ID
|
|
|
136
165
|
|
|
137
166
|
Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.
|
|
138
167
|
|
|
139
|
-
### `alloc init
|
|
168
|
+
### `alloc init`: Configure GPU fleet and budget
|
|
140
169
|
|
|
141
170
|
```bash
|
|
142
171
|
alloc init # interactive wizard
|
|
143
172
|
alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
|
|
173
|
+
alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
|
|
144
174
|
```
|
|
145
175
|
|
|
146
176
|
Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
|
|
@@ -182,13 +212,15 @@ Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), sampl
|
|
|
182
212
|
|
|
183
213
|
## Configuration
|
|
184
214
|
|
|
185
|
-
|
|
215
|
+
Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.
|
|
186
216
|
|
|
187
217
|
| Variable | Default | Description |
|
|
188
218
|
|----------|---------|-------------|
|
|
189
219
|
| `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
|
|
190
220
|
| `ALLOC_TOKEN` | (empty) | Auth token for API calls |
|
|
191
|
-
| `ALLOC_UPLOAD` | `false` | Upload results to dashboard |
|
|
221
|
+
| `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
|
|
222
|
+
| `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
|
|
223
|
+
| `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |
|
|
192
224
|
|
|
193
225
|
## Architecture
|
|
194
226
|
|
|
@@ -210,18 +242,15 @@ All config via environment variables. Zero config files required.
|
|
|
210
242
|
|
|
211
243
|
## Design Principles
|
|
212
244
|
|
|
213
|
-
1. **Zero config
|
|
214
|
-
2. **No monkey-patching
|
|
215
|
-
3. **Never crash user's training
|
|
216
|
-
4. **Progressive disclosure
|
|
217
|
-
|
|
218
|
-
##
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
| PCIe/NVLink transfer rates | Communication bottlenecks in multi-GPU setups |
|
|
226
|
-
| Compute throughput (TFLOPS) | Actual vs theoretical — feeds cost-efficiency analysis |
|
|
227
|
-
| Power draw | Thermal throttling detection |
|
|
245
|
+
1. **Zero config**: `alloc run python train.py` works out of the box
|
|
246
|
+
2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
|
|
247
|
+
3. **Never crash user's training**: All Alloc failures are caught and training continues
|
|
248
|
+
4. **Progressive disclosure**: Individual use first, team governance later
|
|
249
|
+
|
|
250
|
+
## Telemetry Levels
|
|
251
|
+
|
|
252
|
+
Alloc intentionally starts non-invasive and adds richer signals only when you opt in.
|
|
253
|
+
|
|
254
|
+
- **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
|
|
255
|
+
- **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
|
|
256
|
+
- **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.
|
|
@@ -1,25 +1,22 @@
|
|
|
1
|
-
#
|
|
1
|
+
# alloc (by [Alloc Labs](https://www.alloclabs.com))
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
4
4
|
|
|
5
5
|
[](https://www.alloclabs.com)
|
|
6
6
|
[](https://pypi.org/project/alloc/)
|
|
7
7
|
[](LICENSE)
|
|
8
8
|
|
|
9
|
-
> Built by [Alloc Labs](https://www.alloclabs.com)
|
|
9
|
+
> Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.
|
|
10
10
|
|
|
11
11
|
## What Alloc Does
|
|
12
12
|
|
|
13
|
-
Most ML teams
|
|
13
|
+
Most ML teams waste spend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
- **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
|
|
16
|
+
- **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
|
|
17
|
+
- **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)
|
|
16
18
|
|
|
17
|
-
|
|
18
|
-
- **Live calibration** — run briefly, collect real utilization/timing signals, then stop (`alloc run`)
|
|
19
|
-
- **Run intelligence** — upload artifacts for cost-aware analysis and proposals (`alloc upload`)
|
|
20
|
-
- **Policy path** — move from single-user optimization to team budget/governance over time
|
|
21
|
-
|
|
22
|
-
Works with PyTorch, HuggingFace, Lightning, and launcher flows such as `python`, `torchrun`, and `accelerate`. Local profiling works without outbound internet.
|
|
19
|
+
Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.
|
|
23
20
|
|
|
24
21
|
## Who This Is For
|
|
25
22
|
|
|
@@ -39,21 +36,31 @@ Works with PyTorch, HuggingFace, Lightning, and launcher flows such as `python`,
|
|
|
39
36
|
```bash
|
|
40
37
|
pip install alloc
|
|
41
38
|
|
|
42
|
-
# With GPU monitoring support
|
|
39
|
+
# With GPU monitoring support (NVML via pynvml)
|
|
43
40
|
pip install alloc[gpu]
|
|
44
41
|
```
|
|
45
42
|
|
|
43
|
+
Notes:
|
|
44
|
+
- `alloc` does not depend on torch. For `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment; otherwise, use `--param-count-b`.
|
|
45
|
+
- `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.
|
|
46
|
+
|
|
46
47
|
## Commands
|
|
47
48
|
|
|
48
|
-
### `alloc scan
|
|
49
|
+
### `alloc scan`: Remote Ghost Scan (no GPU needed)
|
|
49
50
|
|
|
50
51
|
```bash
|
|
51
52
|
alloc scan --model llama-3-70b --gpu A100-80GB
|
|
52
53
|
alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
|
|
53
54
|
alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16
|
|
55
|
+
|
|
56
|
+
# Objective + budget constraints
|
|
57
|
+
alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12
|
|
58
|
+
|
|
59
|
+
# Topology hints (optional, improves planner quality)
|
|
60
|
+
alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
|
|
54
61
|
```
|
|
55
62
|
|
|
56
|
-
### `alloc ghost
|
|
63
|
+
### `alloc ghost`: Local VRAM estimation
|
|
57
64
|
|
|
58
65
|
```bash
|
|
59
66
|
alloc ghost train.py --dtype bf16 --batch-size 32
|
|
@@ -62,7 +69,7 @@ alloc ghost train.py --param-count-b 7.0 # manual override
|
|
|
62
69
|
|
|
63
70
|
Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.
|
|
64
71
|
|
|
65
|
-
### `alloc run
|
|
72
|
+
### `alloc run`: Training with GPU monitoring
|
|
66
73
|
|
|
67
74
|
```bash
|
|
68
75
|
alloc run python train.py # calibrate and exit (default)
|
|
@@ -73,20 +80,40 @@ alloc run -- python train.py --epochs 10
|
|
|
73
80
|
|
|
74
81
|
Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.
|
|
75
82
|
|
|
76
|
-
**Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize
|
|
83
|
+
**Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.
|
|
77
84
|
|
|
78
85
|
**Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).
|
|
79
86
|
|
|
80
87
|
**Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.
|
|
81
88
|
|
|
82
|
-
### `alloc login
|
|
89
|
+
### `alloc login`: Authenticate with dashboard
|
|
83
90
|
|
|
84
91
|
```bash
|
|
85
92
|
alloc login
|
|
86
|
-
# Prompts for email + password, stores token in ~/.alloc/config.json
|
|
93
|
+
# Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
|
|
94
|
+
|
|
95
|
+
alloc login --token <ACCESS_TOKEN>
|
|
96
|
+
# Paste an access token from the dashboard (no password prompt)
|
|
87
97
|
```
|
|
88
98
|
|
|
89
|
-
### `alloc
|
|
99
|
+
### `alloc whoami`: Show current auth + org context
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
alloc whoami
|
|
103
|
+
alloc whoami --json
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
|
|
107
|
+
|
|
108
|
+
### `alloc logout`: Clear local session
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
alloc logout
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
|
|
115
|
+
|
|
116
|
+
### `alloc upload`: Upload artifact to dashboard
|
|
90
117
|
|
|
91
118
|
```bash
|
|
92
119
|
alloc upload alloc_artifact.json.gz
|
|
@@ -94,7 +121,9 @@ alloc upload alloc_artifact.json.gz
|
|
|
94
121
|
|
|
95
122
|
Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
|
|
96
123
|
|
|
97
|
-
|
|
124
|
+
If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
|
|
125
|
+
|
|
126
|
+
### `alloc catalog`: Browse GPU hardware catalog
|
|
98
127
|
|
|
99
128
|
```bash
|
|
100
129
|
alloc catalog list # list all 13 GPUs (sorted by VRAM)
|
|
@@ -106,11 +135,12 @@ alloc catalog show nvidia-a100-sxm-80gb # lookup by stable ID
|
|
|
106
135
|
|
|
107
136
|
Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.
|
|
108
137
|
|
|
109
|
-
### `alloc init
|
|
138
|
+
### `alloc init`: Configure GPU fleet and budget
|
|
110
139
|
|
|
111
140
|
```bash
|
|
112
141
|
alloc init # interactive wizard
|
|
113
142
|
alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
|
|
143
|
+
alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
|
|
114
144
|
```
|
|
115
145
|
|
|
116
146
|
Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
|
|
@@ -152,13 +182,15 @@ Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), sampl
|
|
|
152
182
|
|
|
153
183
|
## Configuration
|
|
154
184
|
|
|
155
|
-
|
|
185
|
+
Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.
|
|
156
186
|
|
|
157
187
|
| Variable | Default | Description |
|
|
158
188
|
|----------|---------|-------------|
|
|
159
189
|
| `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
|
|
160
190
|
| `ALLOC_TOKEN` | (empty) | Auth token for API calls |
|
|
161
|
-
| `ALLOC_UPLOAD` | `false` | Upload results to dashboard |
|
|
191
|
+
| `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
|
|
192
|
+
| `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
|
|
193
|
+
| `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |
|
|
162
194
|
|
|
163
195
|
## Architecture
|
|
164
196
|
|
|
@@ -180,18 +212,15 @@ All config via environment variables. Zero config files required.
|
|
|
180
212
|
|
|
181
213
|
## Design Principles
|
|
182
214
|
|
|
183
|
-
1. **Zero config
|
|
184
|
-
2. **No monkey-patching
|
|
185
|
-
3. **Never crash user's training
|
|
186
|
-
4. **Progressive disclosure
|
|
187
|
-
|
|
188
|
-
##
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
| PCIe/NVLink transfer rates | Communication bottlenecks in multi-GPU setups |
|
|
196
|
-
| Compute throughput (TFLOPS) | Actual vs theoretical — feeds cost-efficiency analysis |
|
|
197
|
-
| Power draw | Thermal throttling detection |
|
|
215
|
+
1. **Zero config**: `alloc run python train.py` works out of the box
|
|
216
|
+
2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
|
|
217
|
+
3. **Never crash user's training**: All Alloc failures are caught and training continues
|
|
218
|
+
4. **Progressive disclosure**: Individual use first, team governance later
|
|
219
|
+
|
|
220
|
+
## Telemetry Levels
|
|
221
|
+
|
|
222
|
+
Alloc intentionally starts non-invasive and adds richer signals only when you opt in.
|
|
223
|
+
|
|
224
|
+
- **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
|
|
225
|
+
- **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
|
|
226
|
+
- **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.
|
|
@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "alloc"
|
|
7
|
-
version = "0.
|
|
8
|
-
description = "
|
|
7
|
+
version = "0.5.0"
|
|
8
|
+
description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
license =
|
|
10
|
+
license = "Apache-2.0"
|
|
11
11
|
requires-python = ">=3.8"
|
|
12
12
|
authors = [{name = "Alloc Labs", email = "hello@alloclabs.com"}]
|
|
13
13
|
classifiers = [
|
|
@@ -81,6 +81,42 @@ def _estimate_dataloader_wait(cv):
|
|
|
81
81
|
return round((cv - 0.1) / 0.4 * 30.0, 1)
|
|
82
82
|
|
|
83
83
|
|
|
84
|
+
def _detect_distributed():
|
|
85
|
+
# type: () -> tuple
|
|
86
|
+
"""Detect if running inside a torch.distributed process group.
|
|
87
|
+
|
|
88
|
+
Returns (is_distributed, rank, world_size). Fail-safe: returns
|
|
89
|
+
(False, 0, 1) if torch.distributed is unavailable or not initialized.
|
|
90
|
+
"""
|
|
91
|
+
try:
|
|
92
|
+
import torch.distributed as dist
|
|
93
|
+
if dist.is_initialized():
|
|
94
|
+
return True, dist.get_rank(), dist.get_world_size()
|
|
95
|
+
except Exception:
|
|
96
|
+
pass
|
|
97
|
+
return False, 0, 1
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _estimate_comm_overhead(step_times_ms, dataloader_wait_pct=0.0):
|
|
101
|
+
# type: (List[float], float) -> Optional[float]
|
|
102
|
+
"""Estimate communication overhead % for distributed training.
|
|
103
|
+
|
|
104
|
+
Uses the p90/p50 spread as a proxy for sync barrier delays.
|
|
105
|
+
Subtracts estimated dataloader contribution to avoid double-counting.
|
|
106
|
+
Returns None if insufficient data.
|
|
107
|
+
"""
|
|
108
|
+
if len(step_times_ms) < 10:
|
|
109
|
+
return None
|
|
110
|
+
sorted_vals = sorted(step_times_ms)
|
|
111
|
+
p50 = _compute_percentile(sorted_vals, 50)
|
|
112
|
+
p90 = _compute_percentile(sorted_vals, 90)
|
|
113
|
+
if p50 <= 0:
|
|
114
|
+
return None
|
|
115
|
+
raw_pct = ((p90 - p50) / p50) * 100
|
|
116
|
+
comm_pct = max(0.0, raw_pct - dataloader_wait_pct)
|
|
117
|
+
return round(min(40.0, comm_pct), 1)
|
|
118
|
+
|
|
119
|
+
|
|
84
120
|
def _write_callback_data(data):
|
|
85
121
|
# type: (Dict[str, Any]) -> None
|
|
86
122
|
"""Write callback data to the alloc sidecar file.
|
|
@@ -101,6 +137,9 @@ def _build_sidecar(
|
|
|
101
137
|
step_count, # type: int
|
|
102
138
|
step_times_ms, # type: List[float]
|
|
103
139
|
batch_size, # type: Optional[int]
|
|
140
|
+
is_distributed=False, # type: bool
|
|
141
|
+
rank=0, # type: int
|
|
142
|
+
world_size=1, # type: int
|
|
104
143
|
):
|
|
105
144
|
# type: (...) -> Dict[str, Any]
|
|
106
145
|
"""Build the sidecar dict from collected timing data."""
|
|
@@ -124,6 +163,15 @@ def _build_sidecar(
|
|
|
124
163
|
"batch_size": batch_size,
|
|
125
164
|
"dataloader_wait_pct": dataloader_wait_pct,
|
|
126
165
|
}
|
|
166
|
+
|
|
167
|
+
if is_distributed:
|
|
168
|
+
data["is_distributed"] = True
|
|
169
|
+
data["rank"] = rank
|
|
170
|
+
data["world_size"] = world_size
|
|
171
|
+
comm = _estimate_comm_overhead(step_times_ms, dataloader_wait_pct)
|
|
172
|
+
if comm is not None:
|
|
173
|
+
data["comm_overhead_pct"] = comm
|
|
174
|
+
|
|
127
175
|
return data
|
|
128
176
|
|
|
129
177
|
|
|
@@ -142,9 +190,17 @@ try:
|
|
|
142
190
|
self._step_start = None # type: Optional[float]
|
|
143
191
|
self._batch_size = None # type: Optional[int]
|
|
144
192
|
self._last_write_step = 0 # type: int
|
|
193
|
+
self._dist_checked = False # type: bool
|
|
194
|
+
self._is_distributed = False # type: bool
|
|
195
|
+
self._rank = 0 # type: int
|
|
196
|
+
self._world_size = 1 # type: int
|
|
145
197
|
|
|
146
198
|
def on_step_begin(self, args, state, control, **kwargs):
|
|
147
199
|
self._step_start = time.monotonic()
|
|
200
|
+
# Detect distributed once after process group is initialized
|
|
201
|
+
if not self._dist_checked:
|
|
202
|
+
self._is_distributed, self._rank, self._world_size = _detect_distributed()
|
|
203
|
+
self._dist_checked = True
|
|
148
204
|
|
|
149
205
|
def on_step_end(self, args, state, control, **kwargs):
|
|
150
206
|
self.step_count = state.global_step
|
|
@@ -183,6 +239,9 @@ try:
|
|
|
183
239
|
step_count=self.step_count,
|
|
184
240
|
step_times_ms=self._step_times_ms,
|
|
185
241
|
batch_size=self._batch_size,
|
|
242
|
+
is_distributed=self._is_distributed,
|
|
243
|
+
rank=self._rank,
|
|
244
|
+
world_size=self._world_size,
|
|
186
245
|
)
|
|
187
246
|
_write_callback_data(data)
|
|
188
247
|
|
|
@@ -214,9 +273,16 @@ try:
|
|
|
214
273
|
self._step_start = None # type: Optional[float]
|
|
215
274
|
self._batch_size = None # type: Optional[int]
|
|
216
275
|
self._last_write_step = 0 # type: int
|
|
276
|
+
self._dist_checked = False # type: bool
|
|
277
|
+
self._is_distributed = False # type: bool
|
|
278
|
+
self._rank = 0 # type: int
|
|
279
|
+
self._world_size = 1 # type: int
|
|
217
280
|
|
|
218
281
|
def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
|
|
219
282
|
self._step_start = time.monotonic()
|
|
283
|
+
if not self._dist_checked:
|
|
284
|
+
self._is_distributed, self._rank, self._world_size = _detect_distributed()
|
|
285
|
+
self._dist_checked = True
|
|
220
286
|
|
|
221
287
|
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
|
|
222
288
|
self.step_count = trainer.global_step
|
|
@@ -259,6 +325,9 @@ try:
|
|
|
259
325
|
step_count=self.step_count,
|
|
260
326
|
step_times_ms=self._step_times_ms,
|
|
261
327
|
batch_size=self._batch_size,
|
|
328
|
+
is_distributed=self._is_distributed,
|
|
329
|
+
rank=self._rank,
|
|
330
|
+
world_size=self._world_size,
|
|
262
331
|
)
|
|
263
332
|
_write_callback_data(data)
|
|
264
333
|
|
|
@@ -76,6 +76,35 @@ def list_gpus() -> List[dict]:
|
|
|
76
76
|
return sorted(result, key=lambda x: x["vram_gb"], reverse=True)
|
|
77
77
|
|
|
78
78
|
|
|
79
|
+
def get_default_rate(gpu_name: str) -> Optional[float]:
    """Look up the average default $/hr for a GPU by name or alias.

    The name is compared case-insensitively, first against the rate card's
    display names (substring match in either direction), then against the
    known aliases, which are resolved to a catalog display name.

    Args:
        gpu_name: GPU name to look up, e.g. as reported by the probe.

    Returns:
        The mean of the numeric per-cloud rates of the first matching entry,
        or None if the name is empty/blank, nothing matches, or the matching
        entry has no numeric rates.
    """
    # An empty string is a substring of every display name, so without this
    # guard a missing GPU name would silently get the first rate in the card.
    if not gpu_name or not gpu_name.strip():
        return None
    needle = gpu_name.strip().lower()

    def _mean_rate(cloud_rates):
        # Average only the numeric entries; non-numeric values are ignored.
        vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
        return sum(vals) / len(vals) if vals else None

    rates = _load_rate_card().get("rates", {})

    # Direct match by display name
    for display_name, cloud_rates in rates.items():
        shown = display_name.lower()
        if shown in needle or needle in shown:
            return _mean_rate(cloud_rates)

    # Try aliases -> display name; the catalog is loaded at most once and
    # only if some alias actually matches.
    gpus = None
    for alias, stable_id in _ALIASES.items():
        if alias.lower() not in needle:
            continue
        if gpus is None:
            gpus = _load_catalog().get("gpus", {})
        spec = gpus.get(stable_id)
        if spec:
            return _mean_rate(rates.get(spec.get("display_name", ""), {}))

    return None
|
|
106
|
+
|
|
107
|
+
|
|
79
108
|
def get_gpu(gpu_id: str) -> Optional[dict]:
|
|
80
109
|
"""Look up a GPU by stable ID or alias.
|
|
81
110
|
|