alloc 0.3.1.tar.gz → 0.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {alloc-0.3.1 → alloc-0.5.0}/PKG-INFO +70 -41
  2. {alloc-0.3.1 → alloc-0.5.0}/README.md +67 -38
  3. {alloc-0.3.1 → alloc-0.5.0}/pyproject.toml +3 -3
  4. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/__init__.py +1 -1
  5. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/callbacks.py +69 -0
  6. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/catalog/__init__.py +29 -0
  7. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/cli.py +576 -89
  8. alloc-0.5.0/src/alloc/config.py +124 -0
  9. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/display.py +33 -7
  10. alloc-0.5.0/src/alloc/extractor_runner.py +141 -0
  11. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/ghost.py +9 -2
  12. alloc-0.5.0/src/alloc/model_extractor.py +170 -0
  13. alloc-0.5.0/src/alloc/model_registry.py +138 -0
  14. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/probe.py +53 -41
  15. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/upload.py +8 -0
  16. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/yaml_config.py +51 -0
  17. {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/PKG-INFO +70 -41
  18. {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/SOURCES.txt +4 -0
  19. alloc-0.5.0/tests/test_auth.py +155 -0
  20. {alloc-0.3.1 → alloc-0.5.0}/tests/test_callbacks.py +103 -6
  21. {alloc-0.3.1 → alloc-0.5.0}/tests/test_cli.py +3 -5
  22. alloc-0.5.0/tests/test_init_from_org.py +98 -0
  23. {alloc-0.3.1 → alloc-0.5.0}/tests/test_verdict.py +1 -1
  24. {alloc-0.3.1 → alloc-0.5.0}/tests/test_yaml_config.py +2 -0
  25. alloc-0.3.1/src/alloc/config.py +0 -65
  26. alloc-0.3.1/src/alloc/model_extractor.py +0 -332
  27. {alloc-0.3.1 → alloc-0.5.0}/setup.cfg +0 -0
  28. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/artifact_writer.py +0 -0
  29. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/catalog/default_rate_card.json +0 -0
  30. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/catalog/gpus.v1.json +0 -0
  31. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/context.py +0 -0
  32. {alloc-0.3.1 → alloc-0.5.0}/src/alloc/stability.py +0 -0
  33. {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/dependency_links.txt +0 -0
  34. {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/entry_points.txt +0 -0
  35. {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/requires.txt +0 -0
  36. {alloc-0.3.1 → alloc-0.5.0}/src/alloc.egg-info/top_level.txt +0 -0
  37. {alloc-0.3.1 → alloc-0.5.0}/tests/test_artifact.py +0 -0
  38. {alloc-0.3.1 → alloc-0.5.0}/tests/test_catalog.py +0 -0
  39. {alloc-0.3.1 → alloc-0.5.0}/tests/test_context.py +0 -0
  40. {alloc-0.3.1 → alloc-0.5.0}/tests/test_ghost.py +0 -0
  41. {alloc-0.3.1 → alloc-0.5.0}/tests/test_model_extractor.py +0 -0
  42. {alloc-0.3.1 → alloc-0.5.0}/tests/test_probe_hw.py +0 -0
  43. {alloc-0.3.1 → alloc-0.5.0}/tests/test_probe_multi.py +0 -0
  44. {alloc-0.3.1 → alloc-0.5.0}/tests/test_stability.py +0 -0
  45. {alloc-0.3.1 → alloc-0.5.0}/tests/test_upload.py +0 -0
{alloc-0.3.1 → alloc-0.5.0}/PKG-INFO
@@ -1,9 +1,9 @@
  Metadata-Version: 2.4
  Name: alloc
- Version: 0.3.1
- Summary: Training cost intelligence for ML workloads: estimate fit, profile runs, and ship budget-aware GPU decisions.
+ Version: 0.5.0
+ Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
  Author-email: Alloc Labs <hello@alloclabs.com>
- License: Apache-2.0
+ License-Expression: Apache-2.0
  Project-URL: Homepage, https://alloclabs.com
  Project-URL: Repository, https://github.com/alloc-labs/alloc
  Classifier: Development Status :: 3 - Alpha
@@ -28,28 +28,25 @@ Provides-Extra: dev
  Requires-Dist: pytest>=7.0.0; extra == "dev"
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
 
- # Alloc by [Alloc Labs](https://www.alloclabs.com)
+ # alloc (by [Alloc Labs](https://www.alloclabs.com))
 
- **Training cost intelligence for ML workloads.** Alloc helps you decide what to run, where to run it, and whether the run should continue, before expensive mistakes hit your cloud bill.
+ Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 
  [![Website](https://img.shields.io/badge/alloclabs.com-website-22c55e)](https://www.alloclabs.com)
  [![PyPI](https://img.shields.io/pypi/v/alloc)](https://pypi.org/project/alloc/)
  [![License](https://img.shields.io/badge/license-Apache--2.0-blue)](LICENSE)
 
- > Built by [Alloc Labs](https://www.alloclabs.com) GPU cost optimization for ML training.
+ > Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.
 
  ## What Alloc Does
 
- Most ML teams overpay because resource decisions are guesswork and feedback arrives too late. A single mis-sized run can burn thousands before anyone notices.
+ Most ML teams waste spend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:
 
- Alloc gives you a progressive workflow:
+ - **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
+ - **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
+ - **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)
 
- - **Pre-flight feasibility** estimate fit and strategy risk before launch (`alloc ghost`, `alloc scan`)
- - **Live calibration** — run briefly, collect real utilization/timing signals, then stop (`alloc run`)
- - **Run intelligence** — upload artifacts for cost-aware analysis and proposals (`alloc upload`)
- - **Policy path** — move from single-user optimization to team budget/governance over time
-
- Works with PyTorch, HuggingFace, Lightning, and launcher flows such as `python`, `torchrun`, and `accelerate`. Local profiling works without outbound internet.
+ Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.
 
  ## Who This Is For
 
@@ -69,21 +66,31 @@ Works with PyTorch, HuggingFace, Lightning, and launcher flows such as `python`,
  ```bash
  pip install alloc
 
- # With GPU monitoring support
+ # With GPU monitoring support (NVML via pynvml)
  pip install alloc[gpu]
  ```
 
+ Notes:
+ - `alloc` does not depend on torch. If you want `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment, otherwise use `--param-count-b`.
+ - `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.
+
  ## Commands
 
- ### `alloc scan` Remote Ghost Scan (no GPU needed)
+ ### `alloc scan`: Remote Ghost Scan (no GPU needed)
 
  ```bash
  alloc scan --model llama-3-70b --gpu A100-80GB
  alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
  alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16
+
+ # Objective + budget constraints
+ alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12
+
+ # Topology hints (optional, improves planner quality)
+ alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
  ```
 
- ### `alloc ghost` Local VRAM estimation
+ ### `alloc ghost`: Local VRAM estimation
 
  ```bash
  alloc ghost train.py --dtype bf16 --batch-size 32
@@ -92,7 +99,7 @@ alloc ghost train.py --param-count-b 7.0 # manual override
 
  Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.
 
- ### `alloc run` Training with GPU monitoring
+ ### `alloc run`: Training with GPU monitoring
 
  ```bash
  alloc run python train.py # calibrate and exit (default)
@@ -103,20 +110,40 @@ alloc run -- python train.py --epochs 10
 
  Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.
 
- **Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize (~30-60s), prints a verdict with bottleneck classification and recommendation, then exits. Use `--full` to monitor the entire run. Use `--timeout N` to adjust max calibration time (default 120s).
+ **Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.
 
  **Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).
 
  **Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.
 
- ### `alloc login` Authenticate with dashboard
+ ### `alloc login`: Authenticate with dashboard
 
  ```bash
  alloc login
- # Prompts for email + password, stores token in ~/.alloc/config.json
+ # Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
+
+ alloc login --token <ACCESS_TOKEN>
+ # Paste an access token from the dashboard (no password prompt)
  ```
 
- ### `alloc upload` Upload artifact to dashboard
+ ### `alloc whoami`: Show current auth + org context
+
+ ```bash
+ alloc whoami
+ alloc whoami --json
+ ```
+
+ Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
+
+ ### `alloc logout`: Clear local session
+
+ ```bash
+ alloc logout
+ ```
+
+ Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
+
+ ### `alloc upload`: Upload artifact to dashboard
 
  ```bash
  alloc upload alloc_artifact.json.gz
@@ -124,7 +151,9 @@ alloc upload alloc_artifact.json.gz
 
  Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
 
- ### `alloc catalog` Browse GPU hardware catalog
+ If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
+
+ ### `alloc catalog`: Browse GPU hardware catalog
 
  ```bash
  alloc catalog list # list all 13 GPUs (sorted by VRAM)
@@ -136,11 +165,12 @@ alloc catalog show nvidia-a100-sxm-80gb # lookup by stable ID
 
  Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.
 
- ### `alloc init` Configure GPU fleet & budget
+ ### `alloc init`: Configure GPU fleet and budget
 
  ```bash
  alloc init # interactive wizard
  alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
+ alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
  ```
 
  Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
@@ -182,13 +212,15 @@ Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), sampl
 
  ## Configuration
 
- All config via environment variables. Zero config files required.
+ Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.
 
  | Variable | Default | Description |
  |----------|---------|-------------|
  | `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
  | `ALLOC_TOKEN` | (empty) | Auth token for API calls |
- | `ALLOC_UPLOAD` | `false` | Upload results to dashboard |
+ | `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
+ | `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
+ | `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |
 
  ## Architecture
 
@@ -210,18 +242,15 @@ All config via environment variables. Zero config files required.
 
  ## Design Principles
 
- 1. **Zero config** `alloc run python train.py` works out of the box
- 2. **No monkey-patching** External monitoring only, explicit opt-in API
- 3. **Never crash user's training** All Alloc failures are caught and silenced
- 4. **Progressive disclosure** Individual use first, team governance later
-
- ## Deep GPU Metrics (via Probe)
-
- | Metric | Why It Matters |
- |--------|---------------|
- | Memory bandwidth utilization | Identifies memory-bandwidth-bound workloads |
- | Tensor core vs CUDA core utilization | Reveals if workload uses tensor cores (FP16/BF16) |
- | SM occupancy | Low occupancy = kernel launch overhead or small batches |
- | PCIe/NVLink transfer rates | Communication bottlenecks in multi-GPU setups |
- | Compute throughput (TFLOPS) | Actual vs theoretical — feeds cost-efficiency analysis |
- | Power draw | Thermal throttling detection |
+ 1. **Zero config**: `alloc run python train.py` works out of the box
+ 2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
+ 3. **Never crash user's training**: All Alloc failures are caught and training continues
+ 4. **Progressive disclosure**: Individual use first, team governance later
+
+ ## Telemetry Levels
+
+ Alloc intentionally starts non-invasive and adds richer signals only when you opt in.
+
+ - **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
+ - **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
+ - **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.
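
The `alloc run` behavior described above is, at its core, NVML polling. A minimal sketch of that sampling pattern, assuming the `alloc[gpu]` extra (pynvml) is installed; alloc's real monitor also discovers every GPU in the wrapped process tree and applies a stability heuristic, so treat this as an illustration rather than the package's implementation:

```python
# Minimal NVML polling sketch (illustrative; not alloc's actual monitor).
import time

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # single GPU for simplicity

peak_used_mib = 0.0
for _ in range(30):  # ~30 s at 1 Hz, in the spirit of calibrate-and-exit
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)         # bytes
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # percent
    power_w = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # mW -> W
    peak_used_mib = max(peak_used_mib, mem.used / 1024**2)
    print(f"util={util.gpu}% mem={mem.used / 1024**2:.0f}MiB power={power_w:.0f}W")
    time.sleep(1.0)

print(f"peak VRAM: {peak_used_mib:.0f} MiB")
pynvml.nvmlShutdown()
```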

{alloc-0.3.1 → alloc-0.5.0}/README.md
@@ -1,25 +1,22 @@
- # Alloc by [Alloc Labs](https://www.alloclabs.com)
+ # alloc (by [Alloc Labs](https://www.alloclabs.com))
 
- **Training cost intelligence for ML workloads.** Alloc helps you decide what to run, where to run it, and whether the run should continue, before expensive mistakes hit your cloud bill.
+ Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 
  [![Website](https://img.shields.io/badge/alloclabs.com-website-22c55e)](https://www.alloclabs.com)
  [![PyPI](https://img.shields.io/pypi/v/alloc)](https://pypi.org/project/alloc/)
  [![License](https://img.shields.io/badge/license-Apache--2.0-blue)](LICENSE)
 
- > Built by [Alloc Labs](https://www.alloclabs.com) GPU cost optimization for ML training.
+ > Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.
 
  ## What Alloc Does
 
- Most ML teams overpay because resource decisions are guesswork and feedback arrives too late. A single mis-sized run can burn thousands before anyone notices.
+ Most ML teams waste spend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:
 
- Alloc gives you a progressive workflow:
+ - **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
+ - **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
+ - **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)
 
- - **Pre-flight feasibility** estimate fit and strategy risk before launch (`alloc ghost`, `alloc scan`)
- - **Live calibration** — run briefly, collect real utilization/timing signals, then stop (`alloc run`)
- - **Run intelligence** — upload artifacts for cost-aware analysis and proposals (`alloc upload`)
- - **Policy path** — move from single-user optimization to team budget/governance over time
-
- Works with PyTorch, HuggingFace, Lightning, and launcher flows such as `python`, `torchrun`, and `accelerate`. Local profiling works without outbound internet.
+ Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.
 
  ## Who This Is For
 
@@ -39,21 +36,31 @@ Works with PyTorch, HuggingFace, Lightning, and launcher flows such as `python`,
  ```bash
  pip install alloc
 
- # With GPU monitoring support
+ # With GPU monitoring support (NVML via pynvml)
  pip install alloc[gpu]
  ```
 
+ Notes:
+ - `alloc` does not depend on torch. If you want `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment, otherwise use `--param-count-b`.
+ - `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.
+
  ## Commands
 
- ### `alloc scan` Remote Ghost Scan (no GPU needed)
+ ### `alloc scan`: Remote Ghost Scan (no GPU needed)
 
  ```bash
  alloc scan --model llama-3-70b --gpu A100-80GB
  alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
  alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16
+
+ # Objective + budget constraints
+ alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12
+
+ # Topology hints (optional, improves planner quality)
+ alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
  ```
 
- ### `alloc ghost` Local VRAM estimation
+ ### `alloc ghost`: Local VRAM estimation
 
  ```bash
  alloc ghost train.py --dtype bf16 --batch-size 32
@@ -62,7 +69,7 @@ alloc ghost train.py --param-count-b 7.0 # manual override
 
  Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.
 
- ### `alloc run` Training with GPU monitoring
+ ### `alloc run`: Training with GPU monitoring
 
  ```bash
  alloc run python train.py # calibrate and exit (default)
@@ -73,20 +80,40 @@ alloc run -- python train.py --epochs 10
 
  Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.
 
- **Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize (~30-60s), prints a verdict with bottleneck classification and recommendation, then exits. Use `--full` to monitor the entire run. Use `--timeout N` to adjust max calibration time (default 120s).
+ **Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.
 
  **Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).
 
  **Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.
 
- ### `alloc login` Authenticate with dashboard
+ ### `alloc login`: Authenticate with dashboard
 
  ```bash
  alloc login
- # Prompts for email + password, stores token in ~/.alloc/config.json
+ # Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
+
+ alloc login --token <ACCESS_TOKEN>
+ # Paste an access token from the dashboard (no password prompt)
  ```
 
- ### `alloc upload` Upload artifact to dashboard
+ ### `alloc whoami`: Show current auth + org context
+
+ ```bash
+ alloc whoami
+ alloc whoami --json
+ ```
+
+ Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
+
+ ### `alloc logout`: Clear local session
+
+ ```bash
+ alloc logout
+ ```
+
+ Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
+
+ ### `alloc upload`: Upload artifact to dashboard
 
  ```bash
  alloc upload alloc_artifact.json.gz
@@ -94,7 +121,9 @@ alloc upload alloc_artifact.json.gz
 
  Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
 
- ### `alloc catalog` Browse GPU hardware catalog
+ If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
+
+ ### `alloc catalog`: Browse GPU hardware catalog
 
  ```bash
  alloc catalog list # list all 13 GPUs (sorted by VRAM)
@@ -106,11 +135,12 @@ alloc catalog show nvidia-a100-sxm-80gb # lookup by stable ID
 
  Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.
 
- ### `alloc init` Configure GPU fleet & budget
+ ### `alloc init`: Configure GPU fleet and budget
 
  ```bash
  alloc init # interactive wizard
  alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
+ alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
  ```
 
  Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
@@ -152,13 +182,15 @@ Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), sampl
 
  ## Configuration
 
- All config via environment variables. Zero config files required.
+ Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.
 
  | Variable | Default | Description |
  |----------|---------|-------------|
  | `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
  | `ALLOC_TOKEN` | (empty) | Auth token for API calls |
- | `ALLOC_UPLOAD` | `false` | Upload results to dashboard |
+ | `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
+ | `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
+ | `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |
 
  ## Architecture
 
@@ -180,18 +212,15 @@ All config via environment variables. Zero config files required.
 
  ## Design Principles
 
- 1. **Zero config** `alloc run python train.py` works out of the box
- 2. **No monkey-patching** External monitoring only, explicit opt-in API
- 3. **Never crash user's training** All Alloc failures are caught and silenced
- 4. **Progressive disclosure** Individual use first, team governance later
-
- ## Deep GPU Metrics (via Probe)
-
- | Metric | Why It Matters |
- |--------|---------------|
- | Memory bandwidth utilization | Identifies memory-bandwidth-bound workloads |
- | Tensor core vs CUDA core utilization | Reveals if workload uses tensor cores (FP16/BF16) |
- | SM occupancy | Low occupancy = kernel launch overhead or small batches |
- | PCIe/NVLink transfer rates | Communication bottlenecks in multi-GPU setups |
- | Compute throughput (TFLOPS) | Actual vs theoretical — feeds cost-efficiency analysis |
- | Power draw | Thermal throttling detection |
+ 1. **Zero config**: `alloc run python train.py` works out of the box
+ 2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
+ 3. **Never crash user's training**: All Alloc failures are caught and training continues
+ 4. **Progressive disclosure**: Individual use first, team governance later
+
+ ## Telemetry Levels
+
+ Alloc intentionally starts non-invasive and adds richer signals only when you opt in.
+
+ - **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
+ - **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
+ - **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.
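
The `alloc init` sections above reference the generated `.alloc.yaml` (fleet, explore list, budget, priority weights), but the file itself never appears in this diff. A hypothetical sketch of what such a file could contain; every key name below is an assumption, not the package's documented schema, so generate the real file with `alloc init`:

```yaml
# Hypothetical .alloc.yaml sketch; key names are assumptions, not the
# package schema. Run `alloc init` to generate the authoritative file.
fleet:
  - gpu: H100-80GB
    count: 8
explore:
  - A100-80GB
  - A10G
budget:
  max_hourly_usd: 12
priority:
  cost: 0.5   # 50/50 split, mirroring `alloc init --yes` defaults
  speed: 0.5
```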

{alloc-0.3.1 → alloc-0.5.0}/pyproject.toml
@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
 
  [project]
  name = "alloc"
- version = "0.3.1"
- description = "Training cost intelligence for ML workloads: estimate fit, profile runs, and ship budget-aware GPU decisions."
+ version = "0.5.0"
+ description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
  readme = "README.md"
- license = {text = "Apache-2.0"}
+ license = "Apache-2.0"
  requires-python = ">=3.8"
  authors = [{name = "Alloc Labs", email = "hello@alloclabs.com"}]
  classifiers = [

{alloc-0.3.1 → alloc-0.5.0}/src/alloc/__init__.py
@@ -2,7 +2,7 @@
 
  from __future__ import annotations
 
- __version__ = "0.3.1"
+ __version__ = "0.5.0"
 
  from alloc.ghost import ghost, GhostReport
  from alloc.callbacks import AllocCallback as HuggingFaceCallback
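
Since `__init__.py` exports `AllocCallback` under the `HuggingFaceCallback` alias, wiring it into a `transformers` Trainer is one line. A usage sketch with model and dataset construction elided (those placeholders are not working values); the callback itself comes from this diff's `callbacks.py`:

```python
# Attach alloc's HF callback so runs emit the .alloc_callback.json sidecar
# (step time p50/p90, samples/sec, and, as of this release, rank/world_size
# plus comm_overhead_pct when torch.distributed is initialized).
from transformers import Trainer, TrainingArguments
from alloc import HuggingFaceCallback

trainer = Trainer(
    model=model,                        # elided: any PreTrainedModel
    args=TrainingArguments(output_dir="out"),
    train_dataset=train_dataset,        # elided: any torch Dataset
    callbacks=[HuggingFaceCallback()],  # timing only; no monkey-patching
)
trainer.train()
```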

{alloc-0.3.1 → alloc-0.5.0}/src/alloc/callbacks.py
@@ -81,6 +81,42 @@ def _estimate_dataloader_wait(cv):
      return round((cv - 0.1) / 0.4 * 30.0, 1)
 
 
+ def _detect_distributed():
+     # type: () -> tuple
+     """Detect if running inside a torch.distributed process group.
+
+     Returns (is_distributed, rank, world_size). Fail-safe: returns
+     (False, 0, 1) if torch.distributed is unavailable or not initialized.
+     """
+     try:
+         import torch.distributed as dist
+         if dist.is_initialized():
+             return True, dist.get_rank(), dist.get_world_size()
+     except Exception:
+         pass
+     return False, 0, 1
+
+
+ def _estimate_comm_overhead(step_times_ms, dataloader_wait_pct=0.0):
+     # type: (List[float], float) -> Optional[float]
+     """Estimate communication overhead % for distributed training.
+
+     Uses the p90/p50 spread as a proxy for sync barrier delays.
+     Subtracts estimated dataloader contribution to avoid double-counting.
+     Returns None if insufficient data.
+     """
+     if len(step_times_ms) < 10:
+         return None
+     sorted_vals = sorted(step_times_ms)
+     p50 = _compute_percentile(sorted_vals, 50)
+     p90 = _compute_percentile(sorted_vals, 90)
+     if p50 <= 0:
+         return None
+     raw_pct = ((p90 - p50) / p50) * 100
+     comm_pct = max(0.0, raw_pct - dataloader_wait_pct)
+     return round(min(40.0, comm_pct), 1)
+
+
  def _write_callback_data(data):
      # type: (Dict[str, Any]) -> None
      """Write callback data to the alloc sidecar file.
@@ -101,6 +137,9 @@ def _build_sidecar(
      step_count, # type: int
      step_times_ms, # type: List[float]
      batch_size, # type: Optional[int]
+     is_distributed=False, # type: bool
+     rank=0, # type: int
+     world_size=1, # type: int
  ):
      # type: (...) -> Dict[str, Any]
      """Build the sidecar dict from collected timing data."""
@@ -124,6 +163,15 @@ def _build_sidecar(
          "batch_size": batch_size,
          "dataloader_wait_pct": dataloader_wait_pct,
      }
+
+     if is_distributed:
+         data["is_distributed"] = True
+         data["rank"] = rank
+         data["world_size"] = world_size
+         comm = _estimate_comm_overhead(step_times_ms, dataloader_wait_pct)
+         if comm is not None:
+             data["comm_overhead_pct"] = comm
+
      return data
 
 
@@ -142,9 +190,17 @@ try:
              self._step_start = None # type: Optional[float]
              self._batch_size = None # type: Optional[int]
              self._last_write_step = 0 # type: int
+             self._dist_checked = False # type: bool
+             self._is_distributed = False # type: bool
+             self._rank = 0 # type: int
+             self._world_size = 1 # type: int
 
          def on_step_begin(self, args, state, control, **kwargs):
              self._step_start = time.monotonic()
+             # Detect distributed once after process group is initialized
+             if not self._dist_checked:
+                 self._is_distributed, self._rank, self._world_size = _detect_distributed()
+                 self._dist_checked = True
 
          def on_step_end(self, args, state, control, **kwargs):
              self.step_count = state.global_step
@@ -183,6 +239,9 @@ try:
                  step_count=self.step_count,
                  step_times_ms=self._step_times_ms,
                  batch_size=self._batch_size,
+                 is_distributed=self._is_distributed,
+                 rank=self._rank,
+                 world_size=self._world_size,
              )
              _write_callback_data(data)
 
@@ -214,9 +273,16 @@ try:
              self._step_start = None # type: Optional[float]
              self._batch_size = None # type: Optional[int]
              self._last_write_step = 0 # type: int
+             self._dist_checked = False # type: bool
+             self._is_distributed = False # type: bool
+             self._rank = 0 # type: int
+             self._world_size = 1 # type: int
 
          def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
              self._step_start = time.monotonic()
+             if not self._dist_checked:
+                 self._is_distributed, self._rank, self._world_size = _detect_distributed()
+                 self._dist_checked = True
 
          def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
              self.step_count = trainer.global_step
@@ -259,6 +325,9 @@ try:
                  step_count=self.step_count,
                  step_times_ms=self._step_times_ms,
                  batch_size=self._batch_size,
+                 is_distributed=self._is_distributed,
+                 rank=self._rank,
+                 world_size=self._world_size,
              )
              _write_callback_data(data)
 
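
The new `comm_overhead_pct` heuristic above is easy to sanity-check by hand. A self-contained rehearsal of the same arithmetic; `_percentile` is a stand-in for the package's `_compute_percentile`, whose implementation is not shown in this diff, so the exact percentile method is an assumption:

```python
# Rehearses the p90/p50 spread heuristic from callbacks.py with a local
# nearest-rank percentile (assumed; _compute_percentile is not in this diff).
def _percentile(sorted_vals, pct):
    idx = int(round(pct / 100.0 * (len(sorted_vals) - 1)))
    return sorted_vals[min(idx, len(sorted_vals) - 1)]

def estimate_comm_overhead(step_times_ms, dataloader_wait_pct=0.0):
    if len(step_times_ms) < 10:
        return None  # insufficient data, as in the real function
    s = sorted(step_times_ms)
    p50, p90 = _percentile(s, 50), _percentile(s, 90)
    if p50 <= 0:
        return None
    raw_pct = ((p90 - p50) / p50) * 100
    return round(min(40.0, max(0.0, raw_pct - dataloader_wait_pct)), 1)

# Ten steady 100 ms steps plus two 130 ms stragglers: p50 = 100, p90 = 130,
# a 30% spread; subtracting a 10% dataloader estimate leaves 20.0.
times = [100.0] * 10 + [130.0, 130.0]
print(estimate_comm_overhead(times, dataloader_wait_pct=10.0))  # -> 20.0
```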

{alloc-0.3.1 → alloc-0.5.0}/src/alloc/catalog/__init__.py
@@ -76,6 +76,35 @@ def list_gpus() -> List[dict]:
      return sorted(result, key=lambda x: x["vram_gb"], reverse=True)
 
 
+ def get_default_rate(gpu_name: str) -> Optional[float]:
+     """Look up the average default $/hr for a GPU by name or alias.
+
+     Tries to match the probe-reported GPU name against catalog display names.
+     Returns the average across clouds, or None if not found.
+     """
+     rate_card = _load_rate_card()
+     rates = rate_card.get("rates", {})
+
+     # Direct match by display name
+     for display_name, cloud_rates in rates.items():
+         if display_name.lower() in gpu_name.lower() or gpu_name.lower() in display_name.lower():
+             vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
+             return sum(vals) / len(vals) if vals else None
+
+     # Try aliases → display name
+     for alias, stable_id in _ALIASES.items():
+         if alias.lower() in gpu_name.lower():
+             catalog = _load_catalog()
+             spec = catalog.get("gpus", {}).get(stable_id)
+             if spec:
+                 dn = spec.get("display_name", "")
+                 cloud_rates = rates.get(dn, {})
+                 vals = [v for v in cloud_rates.values() if isinstance(v, (int, float))]
+                 return sum(vals) / len(vals) if vals else None
+
+     return None
+
+
  def get_gpu(gpu_id: str) -> Optional[dict]:
      """Look up a GPU by stable ID or alias.
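
A sketch of how the new rate helper might be called, assuming alloc >= 0.5.0 is installed; the example GPU string mimics an NVML-reported name, and whether it matches depends on the bundled rate card:

```python
from alloc.catalog import get_default_rate

# Substring matching lets a probe-reported NVML name line up with a
# catalog display name or alias (e.g. the "A100" alias).
rate = get_default_rate("NVIDIA A100-SXM4-80GB")
if rate is None:
    print("no rate-card entry matched")
else:
    print(f"average on-demand rate: ${rate:.2f}/hr")
```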