alloc 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. alloc-0.0.1/PKG-INFO +256 -0
  2. alloc-0.0.1/README.md +226 -0
  3. alloc-0.0.1/pyproject.toml +47 -0
  4. alloc-0.0.1/setup.cfg +4 -0
  5. alloc-0.0.1/src/alloc/__init__.py +11 -0
  6. alloc-0.0.1/src/alloc/artifact_writer.py +67 -0
  7. alloc-0.0.1/src/alloc/callbacks.py +342 -0
  8. alloc-0.0.1/src/alloc/catalog/__init__.py +138 -0
  9. alloc-0.0.1/src/alloc/catalog/default_rate_card.json +18 -0
  10. alloc-0.0.1/src/alloc/catalog/gpus.v1.json +174 -0
  11. alloc-0.0.1/src/alloc/cli.py +1341 -0
  12. alloc-0.0.1/src/alloc/config.py +124 -0
  13. alloc-0.0.1/src/alloc/context.py +191 -0
  14. alloc-0.0.1/src/alloc/display.py +580 -0
  15. alloc-0.0.1/src/alloc/extractor_runner.py +141 -0
  16. alloc-0.0.1/src/alloc/ghost.py +167 -0
  17. alloc-0.0.1/src/alloc/model_extractor.py +170 -0
  18. alloc-0.0.1/src/alloc/model_registry.py +138 -0
  19. alloc-0.0.1/src/alloc/probe.py +461 -0
  20. alloc-0.0.1/src/alloc/stability.py +144 -0
  21. alloc-0.0.1/src/alloc/upload.py +138 -0
  22. alloc-0.0.1/src/alloc/yaml_config.py +287 -0
  23. alloc-0.0.1/src/alloc.egg-info/PKG-INFO +256 -0
  24. alloc-0.0.1/src/alloc.egg-info/SOURCES.txt +41 -0
  25. alloc-0.0.1/src/alloc.egg-info/dependency_links.txt +1 -0
  26. alloc-0.0.1/src/alloc.egg-info/entry_points.txt +2 -0
  27. alloc-0.0.1/src/alloc.egg-info/requires.txt +12 -0
  28. alloc-0.0.1/src/alloc.egg-info/top_level.txt +1 -0
  29. alloc-0.0.1/tests/test_artifact.py +128 -0
  30. alloc-0.0.1/tests/test_auth.py +155 -0
  31. alloc-0.0.1/tests/test_callbacks.py +330 -0
  32. alloc-0.0.1/tests/test_catalog.py +83 -0
  33. alloc-0.0.1/tests/test_cli.py +205 -0
  34. alloc-0.0.1/tests/test_context.py +135 -0
  35. alloc-0.0.1/tests/test_ghost.py +82 -0
  36. alloc-0.0.1/tests/test_init_from_org.py +98 -0
  37. alloc-0.0.1/tests/test_model_extractor.py +232 -0
  38. alloc-0.0.1/tests/test_probe_hw.py +83 -0
  39. alloc-0.0.1/tests/test_probe_multi.py +114 -0
  40. alloc-0.0.1/tests/test_stability.py +173 -0
  41. alloc-0.0.1/tests/test_upload.py +157 -0
  42. alloc-0.0.1/tests/test_verdict.py +187 -0
  43. alloc-0.0.1/tests/test_yaml_config.py +215 -0
alloc-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: alloc
3
+ Version: 0.0.1
4
+ Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
+ Author-email: Alloc Labs <hello@alloclabs.com>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://alloclabs.com
8
+ Project-URL: Repository, https://github.com/alloc-labs/alloc
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.8
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: typer>=0.9.0
21
+ Requires-Dist: rich>=13.0.0
22
+ Requires-Dist: httpx>=0.24.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Requires-Dist: pyyaml>=6.0
25
+ Provides-Extra: gpu
26
+ Requires-Dist: pynvml>=11.5.0; extra == "gpu"
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
29
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
30
+
31
+ # alloc (by [Alloc Labs](https://www.alloclabs.com))
32
+
33
+ Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
34
+
35
+ [![Website](https://img.shields.io/badge/alloclabs.com-website-22c55e)](https://www.alloclabs.com)
36
+ [![PyPI](https://img.shields.io/pypi/v/alloc)](https://pypi.org/project/alloc/)
37
+ [![License](https://img.shields.io/badge/license-Apache--2.0-blue)](LICENSE)
38
+
39
+ > Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.
40
+
41
+ ## What Alloc Does
42
+
43
+ Most ML teams waste spend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:
44
+
45
+ - **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
46
+ - **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
47
+ - **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)
48
+
49
+ Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.
50
+
51
+ ## Who This Is For
52
+
53
+ - **Solo engineers** who want a fast sanity check before burning GPU time
54
+ - **ML teams** who need repeatable right-sizing and bottleneck visibility
55
+ - **Platform/infra leads** who want budget-aware controls without rewriting training code
56
+
57
+ ## Why It Is Low Friction
58
+
59
+ - **No code changes required** for baseline value (`alloc run`)
60
+ - **Optional deeper integration** via callbacks when you want richer timing signals
61
+ - **Local-first artifacts** so users still get value without cloud connectivity
62
+ - **Progressive adoption** from local CLI to team workflows and governance
63
+
64
+ ## Install
65
+
66
+ ```bash
67
+ pip install alloc
68
+
69
+ # With GPU monitoring support (NVML via pynvml)
70
+ pip install alloc[gpu]
71
+ ```
72
+
73
+ Notes:
74
+ - `alloc` does not depend on torch. If you want `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment; otherwise, use `--param-count-b`.
75
+ - `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.
76
+
77
+ ## Commands
78
+
79
+ ### `alloc scan`: Remote Ghost Scan (no GPU needed)
80
+
81
+ ```bash
82
+ alloc scan --model llama-3-70b --gpu A100-80GB
83
+ alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
84
+ alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16
85
+
86
+ # Objective + budget constraints
87
+ alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12
88
+
89
+ # Topology hints (optional, improves planner quality)
90
+ alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
91
+ ```
92
+
93
+ ### `alloc ghost`: Local VRAM estimation
94
+
95
+ ```bash
96
+ alloc ghost train.py --dtype bf16 --batch-size 32
97
+ alloc ghost train.py --param-count-b 7.0 # manual override
98
+ ```
99
+
100
+ Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.
101
+
102
+ ### `alloc run`: Training with GPU monitoring
103
+
104
+ ```bash
105
+ alloc run python train.py # calibrate and exit (default)
106
+ alloc run --full python train.py # monitor full training run
107
+ alloc run torchrun --nproc_per_node=4 train.py
108
+ alloc run -- python train.py --epochs 10
109
+ ```
110
+
111
+ Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.
112
+
113
+ **Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.
114
+
115
+ **Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).
116
+
117
+ **Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.
118
+
119
+ ### `alloc login`: Authenticate with dashboard
120
+
121
+ ```bash
122
+ alloc login
123
+ # Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
124
+
125
+ alloc login --token <ACCESS_TOKEN>
126
+ # Paste an access token from the dashboard (no password prompt)
127
+ ```
128
+
129
+ ### `alloc whoami`: Show current auth + org context
130
+
131
+ ```bash
132
+ alloc whoami
133
+ alloc whoami --json
134
+ ```
135
+
136
+ Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
137
+
138
+ ### `alloc logout`: Clear local session
139
+
140
+ ```bash
141
+ alloc logout
142
+ ```
143
+
144
+ Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
145
+
146
+ ### `alloc upload`: Upload artifact to dashboard
147
+
148
+ ```bash
149
+ alloc upload alloc_artifact.json.gz
150
+ ```
151
+
152
+ Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
153
+
154
+ If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
155
+
156
+ ### `alloc catalog`: Browse GPU hardware catalog
157
+
158
+ ```bash
159
+ alloc catalog list # list all 13 GPUs (sorted by VRAM)
160
+ alloc catalog list --sort cost # sort by $/hr
161
+ alloc catalog list --sort tflops # sort by BF16 TFLOPS
162
+ alloc catalog show H100 # detailed specs for H100
163
+ alloc catalog show nvidia-a100-sxm-80gb # lookup by stable ID
164
+ ```
165
+
166
+ Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.
167
+
168
+ ### `alloc init`: Configure GPU fleet and budget
169
+
170
+ ```bash
171
+ alloc init # interactive wizard
172
+ alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
173
+ alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
174
+ ```
175
+
176
+ Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
177
+
178
+ ### `alloc version`
179
+
180
+ ```bash
181
+ alloc version
182
+ ```
183
+
184
+ ## Python API
185
+
186
+ ```python
187
+ import alloc
188
+
189
+ # Static VRAM analysis (never crashes your training)
190
+ report = alloc.ghost(model)
191
+ print(report.total_gb) # e.g., 115.42
192
+
193
+ # Or from param count (no torch needed)
194
+ report = alloc.ghost(param_count_b=7.0, dtype="bf16")
195
+ ```
196
+
197
+ ## Framework Callbacks
198
+
199
+ Optional callbacks for deeper profiling. Captures step-level timing, throughput, and dataloader wait estimates.
200
+
201
+ ```python
202
+ # HuggingFace Transformers
203
+ from alloc import HuggingFaceCallback
204
+ trainer = Trainer(..., callbacks=[HuggingFaceCallback()])
205
+
206
+ # PyTorch Lightning
207
+ from alloc import LightningCallback
208
+ trainer = Trainer(..., callbacks=[LightningCallback()])
209
+ ```
210
+
211
+ Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), samples/sec, and estimated dataloader wait %. This unlocks higher confidence analysis and dataloader bottleneck detection.
212
+
213
+ ## Configuration
214
+
215
+ Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.
216
+
217
+ | Variable | Default | Description |
218
+ |----------|---------|-------------|
219
+ | `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
220
+ | `ALLOC_TOKEN` | (empty) | Auth token for API calls |
221
+ | `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
222
+ | `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
223
+ | `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |
224
+
225
+ ## Architecture
226
+
227
+ | Module | Purpose |
228
+ |--------|---------|
229
+ | `ghost.py` | VRAM estimation from parameter count. Computes weights + gradients + optimizer + activations + buffer breakdown. |
230
+ | `model_extractor.py` | Three-method model discovery: subprocess execution (`nn.Module` finder), AST parsing (`from_pretrained`), manual override. |
231
+ | `probe.py` | External GPU monitoring via `pynvml`. Process-tree aware multi-GPU discovery. Captures hardware context (driver, CUDA, SM version). |
232
+ | `stability.py` | Multi-signal stability detection for calibrate-and-exit (VRAM plateau + util std dev + power std dev). |
233
+ | `catalog/` | Bundled GPU hardware catalog (13 GPUs) with specs and pricing. Powers `alloc catalog` commands. |
234
+ | `context.py` | Context autodiscovery: git (SHA, branch, repo), container (Docker/Podman), Ray (job ID, cluster). |
235
+ | `artifact_writer.py` | Artifact Writer: writes `alloc_artifact.json.gz` with probe, ghost, hardware, and context sections. |
236
+ | `cli.py` | Typer CLI with `ghost`, `run`, `scan`, `login`, `upload`, `init`, `catalog`, `version` commands. |
237
+ | `yaml_config.py` | `.alloc.yaml` parser: fleet, explore, priority, budget. Loaded automatically by `ghost`, `run`, `scan`. |
238
+ | `callbacks.py` | Framework callbacks: HuggingFace `TrainerCallback` and Lightning `Callback` with step timing (p50/p90), throughput, and dataloader wait estimation. |
239
+ | `upload.py` | Artifact uploader: POSTs `.json.gz` to `POST /runs/ingest`. |
240
+ | `display.py` | Rich terminal formatting for reports. |
241
+ | `config.py` | Env-var-only configuration (API URL, Supabase URL, token storage). |
242
+
243
+ ## Design Principles
244
+
245
+ 1. **Zero config**: `alloc run python train.py` works out of the box
246
+ 2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
247
+ 3. **Never crash user's training**: All Alloc failures are caught and training continues
248
+ 4. **Progressive disclosure**: Individual use first, team governance later
249
+
250
+ ## Telemetry Levels
251
+
252
+ Alloc intentionally starts non-invasive and adds richer signals only when you opt in.
253
+
254
+ - **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
255
+ - **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
256
+ - **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.
alloc-0.0.1/README.md ADDED
@@ -0,0 +1,226 @@
1
+ # alloc (by [Alloc Labs](https://www.alloclabs.com))
2
+
3
+ Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
4
+
5
+ [![Website](https://img.shields.io/badge/alloclabs.com-website-22c55e)](https://www.alloclabs.com)
6
+ [![PyPI](https://img.shields.io/pypi/v/alloc)](https://pypi.org/project/alloc/)
7
+ [![License](https://img.shields.io/badge/license-Apache--2.0-blue)](LICENSE)
8
+
9
+ > Built by [Alloc Labs](https://www.alloclabs.com): reduce ML training costs with better pre-flight decisions and faster feedback loops.
10
+
11
+ ## What Alloc Does
12
+
13
+ Most ML teams waste spend because resource decisions are guesswork and feedback arrives too late. Alloc gives you a progressive workflow:
14
+
15
+ - **Pre-flight**: estimate VRAM fit and rank feasible configs by objective (`alloc scan`, `alloc ghost`)
16
+ - **Calibration run**: measure peak VRAM + utilization (and optionally step timing) from a short run (`alloc run`)
17
+ - **Run history**: upload artifacts for team visibility and budget-aware proposals (`alloc upload`)
18
+
19
+ Alloc is launcher-first. It works with `python`, `torchrun`, `accelerate`, and cluster entrypoints (Slurm, Ray, Kubernetes) because it does not require framework-specific wrappers for baseline value.
20
+
21
+ ## Who This Is For
22
+
23
+ - **Solo engineers** who want a fast sanity check before burning GPU time
24
+ - **ML teams** who need repeatable right-sizing and bottleneck visibility
25
+ - **Platform/infra leads** who want budget-aware controls without rewriting training code
26
+
27
+ ## Why It Is Low Friction
28
+
29
+ - **No code changes required** for baseline value (`alloc run`)
30
+ - **Optional deeper integration** via callbacks when you want richer timing signals
31
+ - **Local-first artifacts** so users still get value without cloud connectivity
32
+ - **Progressive adoption** from local CLI to team workflows and governance
33
+
34
+ ## Install
35
+
36
+ ```bash
37
+ pip install alloc
38
+
39
+ # With GPU monitoring support (NVML via pynvml)
40
+ pip install alloc[gpu]
41
+ ```
42
+
43
+ Notes:
44
+ - `alloc` does not depend on torch. If you want `alloc ghost train.py` to infer param counts from a script, torch must be installed in that environment; otherwise, use `--param-count-b`.
45
+ - `alloc run` will still execute your command without `alloc[gpu]`, but it cannot collect GPU metrics.
46
+
47
+ ## Commands
48
+
49
+ ### `alloc scan`: Remote Ghost Scan (no GPU needed)
50
+
51
+ ```bash
52
+ alloc scan --model llama-3-70b --gpu A100-80GB
53
+ alloc scan --model mistral-7b --gpu A10G --strategy fsdp --num-gpus 4
54
+ alloc scan --param-count-b 13.0 --gpu H100-80GB --dtype bf16
55
+
56
+ # Objective + budget constraints
57
+ alloc scan --model llama-3-70b --gpu H100-80GB --objective fastest_within_budget --max-budget-hourly 12
58
+
59
+ # Topology hints (optional, improves planner quality)
60
+ alloc scan --param-count-b 70 --gpu H100-80GB --num-gpus 64 --num-nodes 8 --gpus-per-node 8 --interconnect infiniband
61
+ ```
62
+
63
+ ### `alloc ghost`: Local VRAM estimation
64
+
65
+ ```bash
66
+ alloc ghost train.py --dtype bf16 --batch-size 32
67
+ alloc ghost train.py --param-count-b 7.0 # manual override
68
+ ```
69
+
70
+ Analyzes your training script to discover model parameters and computes a VRAM breakdown. Uses a three-method fallback: (1) `--param-count-b` manual override, (2) subprocess execution to find `nn.Module` classes and count parameters, (3) AST parsing for `from_pretrained()` calls.
71
+
72
+ ### `alloc run`: Training with GPU monitoring
73
+
74
+ ```bash
75
+ alloc run python train.py # calibrate and exit (default)
76
+ alloc run --full python train.py # monitor full training run
77
+ alloc run torchrun --nproc_per_node=4 train.py
78
+ alloc run -- python train.py --epochs 10
79
+ ```
80
+
81
+ Wraps your command, monitors GPU memory/utilization/power via `pynvml`, and writes an artifact.
82
+
83
+ **Default: calibrate-and-exit.** Auto-stops when GPU metrics stabilize, prints a verdict with bottleneck classification and a top recommendation, then exits. Use `--timeout N` to adjust max calibration time (default 120s). Use `--full` to monitor the entire run.
84
+
85
+ **Multi-GPU:** Automatically discovers all GPUs used by the process tree (works with `torchrun`, `accelerate launch`, etc.).
86
+
87
+ **Hardware context:** Captures driver version, CUDA version, and SM compute capability from NVML.
88
+
89
+ ### `alloc login`: Authenticate with dashboard
90
+
91
+ ```bash
92
+ alloc login
93
+ # Prompts for email + password, stores token + refresh_token in ~/.alloc/config.json
94
+
95
+ alloc login --token <ACCESS_TOKEN>
96
+ # Paste an access token from the dashboard (no password prompt)
97
+ ```
98
+
99
+ ### `alloc whoami`: Show current auth + org context
100
+
101
+ ```bash
102
+ alloc whoami
103
+ alloc whoami --json
104
+ ```
105
+
106
+ Prints the current identity (when logged in), plus objective, effective budget cap, and fleet counts.
107
+
108
+ ### `alloc logout`: Clear local session
109
+
110
+ ```bash
111
+ alloc logout
112
+ ```
113
+
114
+ Clears saved `token`/`refresh_token` from `~/.alloc/config.json`.
115
+
116
+ ### `alloc upload`: Upload artifact to dashboard
117
+
118
+ ```bash
119
+ alloc upload alloc_artifact.json.gz
120
+ ```
121
+
122
+ Uploads a previously saved `.json.gz` artifact to the dashboard via `POST /runs/ingest`. Requires authentication (`alloc login` first).
123
+
124
+ If your session token has expired and a `refresh_token` is available (password login flow), `alloc upload` refreshes once and retries automatically.
125
+
126
+ ### `alloc catalog`: Browse GPU hardware catalog
127
+
128
+ ```bash
129
+ alloc catalog list # list all 13 GPUs (sorted by VRAM)
130
+ alloc catalog list --sort cost # sort by $/hr
131
+ alloc catalog list --sort tflops # sort by BF16 TFLOPS
132
+ alloc catalog show H100 # detailed specs for H100
133
+ alloc catalog show nvidia-a100-sxm-80gb # lookup by stable ID
134
+ ```
135
+
136
+ Offline reference for GPU specs, interconnect details, and cloud pricing. Supports aliases (H100, A100, T4) and stable IDs.
137
+
138
+ ### `alloc init`: Configure GPU fleet and budget
139
+
140
+ ```bash
141
+ alloc init # interactive wizard
142
+ alloc init --yes # non-interactive defaults (full catalog, 50/50 priority)
143
+ alloc init --from-org --yes # pull fleet/budget/objective from your org (requires alloc login)
144
+ ```
145
+
146
+ Creates a `.alloc.yaml` file in the current directory with your GPU fleet, explore list, budget, and priority weights. When present, `ghost`, `run`, and `scan` automatically use fleet context for recommendations. Use `--no-config` on any command to skip it.
147
+
148
+ ### `alloc version`
149
+
150
+ ```bash
151
+ alloc version
152
+ ```
153
+
154
+ ## Python API
155
+
156
+ ```python
157
+ import alloc
158
+
159
+ # Static VRAM analysis (never crashes your training)
160
+ report = alloc.ghost(model)
161
+ print(report.total_gb) # e.g., 115.42
162
+
163
+ # Or from param count (no torch needed)
164
+ report = alloc.ghost(param_count_b=7.0, dtype="bf16")
165
+ ```
166
+
167
+ ## Framework Callbacks
168
+
169
+ Optional callbacks for deeper profiling. Captures step-level timing, throughput, and dataloader wait estimates.
170
+
171
+ ```python
172
+ # HuggingFace Transformers
173
+ from alloc import HuggingFaceCallback
174
+ trainer = Trainer(..., callbacks=[HuggingFaceCallback()])
175
+
176
+ # PyTorch Lightning
177
+ from alloc import LightningCallback
178
+ trainer = Trainer(..., callbacks=[LightningCallback()])
179
+ ```
180
+
181
+ Callbacks write a `.alloc_callback.json` sidecar with step time (p50/p90), samples/sec, and estimated dataloader wait %. This unlocks higher confidence analysis and dataloader bottleneck detection.
182
+
183
+ ## Configuration
184
+
185
+ Alloc works with zero config. You can optionally configure it with environment variables and/or a `.alloc.yaml` in your repo.
186
+
187
+ | Variable | Default | Description |
188
+ |----------|---------|-------------|
189
+ | `ALLOC_API_URL` | `https://alloc-production-ffc2.up.railway.app` | API endpoint for remote scans |
190
+ | `ALLOC_TOKEN` | (empty) | Auth token for API calls |
191
+ | `ALLOC_UPLOAD` | `false` | Upload results to dashboard (`alloc run --upload` also works) |
192
+ | `ALLOC_OUT` | `alloc_artifact.json.gz` | Artifact output path |
193
+ | `ALLOC_GPU_COUNT_CANDIDATES` | (empty) | Override GPU-count candidates for ranking (comma-separated ints) |
194
+
195
+ ## Architecture
196
+
197
+ | Module | Purpose |
198
+ |--------|---------|
199
+ | `ghost.py` | VRAM estimation from parameter count. Computes weights + gradients + optimizer + activations + buffer breakdown. |
200
+ | `model_extractor.py` | Three-method model discovery: subprocess execution (`nn.Module` finder), AST parsing (`from_pretrained`), manual override. |
201
+ | `probe.py` | External GPU monitoring via `pynvml`. Process-tree aware multi-GPU discovery. Captures hardware context (driver, CUDA, SM version). |
202
+ | `stability.py` | Multi-signal stability detection for calibrate-and-exit (VRAM plateau + util std dev + power std dev). |
203
+ | `catalog/` | Bundled GPU hardware catalog (13 GPUs) with specs and pricing. Powers `alloc catalog` commands. |
204
+ | `context.py` | Context autodiscovery: git (SHA, branch, repo), container (Docker/Podman), Ray (job ID, cluster). |
205
+ | `artifact_writer.py` | Artifact Writer: writes `alloc_artifact.json.gz` with probe, ghost, hardware, and context sections. |
206
+ | `cli.py` | Typer CLI with `ghost`, `run`, `scan`, `login`, `upload`, `init`, `catalog`, `version` commands. |
207
+ | `yaml_config.py` | `.alloc.yaml` parser: fleet, explore, priority, budget. Loaded automatically by `ghost`, `run`, `scan`. |
208
+ | `callbacks.py` | Framework callbacks: HuggingFace `TrainerCallback` and Lightning `Callback` with step timing (p50/p90), throughput, and dataloader wait estimation. |
209
+ | `upload.py` | Artifact uploader: POSTs `.json.gz` to `POST /runs/ingest`. |
210
+ | `display.py` | Rich terminal formatting for reports. |
211
+ | `config.py` | Env-var-only configuration (API URL, Supabase URL, token storage). |
212
+
213
+ ## Design Principles
214
+
215
+ 1. **Zero config**: `alloc run python train.py` works out of the box
216
+ 2. **No monkey-patching**: External monitoring only; deeper signals are opt-in
217
+ 3. **Never crash user's training**: All Alloc failures are caught and training continues
218
+ 4. **Progressive disclosure**: Individual use first, team governance later
219
+
220
+ ## Telemetry Levels
221
+
222
+ Alloc intentionally starts non-invasive and adds richer signals only when you opt in.
223
+
224
+ - **NVML (today)**: peak VRAM, GPU utilization, power draw, basic hardware context (driver/CUDA/SM), multi-GPU discovery from the process tree.
225
+ - **Framework timing (today, opt-in)**: step time p50/p90, samples/sec, estimated dataloader wait percentage via HF/Lightning callbacks.
226
+ - **Distributed timing (planned, opt-in)**: per-rank timing skew, communication overhead, stronger interconnect-aware recommendations.
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "alloc"
7
+ version = "0.0.1"
8
+ description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
9
+ readme = "README.md"
10
+ license = "Apache-2.0"
11
+ requires-python = ">=3.8"
12
+ authors = [{name = "Alloc Labs", email = "hello@alloclabs.com"}]
13
+ classifiers = [
14
+ "Development Status :: 3 - Alpha",
15
+ "Intended Audience :: Developers",
16
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.8",
19
+ "Programming Language :: Python :: 3.9",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ ]
24
+ dependencies = [
25
+ "typer>=0.9.0",
26
+ "rich>=13.0.0",
27
+ "httpx>=0.24.0",
28
+ "pydantic>=2.0.0",
29
+ "pyyaml>=6.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ gpu = ["pynvml>=11.5.0"]
34
+ dev = ["pytest>=7.0.0", "pytest-cov>=4.0.0"]
35
+
36
+ [project.scripts]
37
+ alloc = "alloc.cli:app"
38
+
39
+ [project.urls]
40
+ Homepage = "https://alloclabs.com"
41
+ Repository = "https://github.com/alloc-labs/alloc"
42
+
43
+ [tool.setuptools.packages.find]
44
+ where = ["src"]
45
+
46
+ [tool.setuptools.package-data]
47
+ "alloc.catalog" = ["*.json"]
alloc-0.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,11 @@
1
+ """Alloc — GPU intelligence for ML training."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __version__ = "0.0.1"
6
+
7
+ from alloc.ghost import ghost, GhostReport
8
+ from alloc.callbacks import AllocCallback as HuggingFaceCallback
9
+ from alloc.callbacks import AllocLightningCallback as LightningCallback
10
+
11
+ __all__ = ["ghost", "GhostReport", "HuggingFaceCallback", "LightningCallback", "__version__"]
@@ -0,0 +1,67 @@
1
+ """Artifact Writer — write alloc_artifact.json.gz.
2
+
3
+ Optionally uploads to W&B if wandb is active.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import gzip
9
+ import json
10
+ import os
11
+ from datetime import datetime, timezone
12
+ from typing import Optional
13
+
14
+
15
def write_report(
    ghost_report: Optional[dict] = None,
    probe_result: Optional[dict] = None,
    output_path: Optional[str] = None,
    hardware_context: Optional[dict] = None,
    context: Optional[dict] = None,
) -> str:
    """Serialize a run artifact to a gzipped JSON file on disk.

    The destination is resolved in this order:
      1. the explicit ``output_path`` argument,
      2. the ``ALLOC_OUT`` environment variable,
      3. ``./alloc_artifact.json.gz``.

    Returns the path that was written, or ``""`` on any failure — this
    function deliberately never raises, so it cannot crash a training run.
    """
    try:
        target = (
            output_path
            or os.environ.get("ALLOC_OUT", "")
            or "alloc_artifact.json.gz"
        )

        payload = {
            "version": "0.0.1",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "ghost": ghost_report,
            "probe": probe_result,
            "hardware": hardware_context,
            # An empty context dict is normalized to null in the artifact.
            "context": context or None,
        }

        with gzip.open(target, "wt", encoding="utf-8") as fh:
            json.dump(payload, fh, indent=2)

        _try_wandb_upload(target)
        return target
    except Exception:
        # Best-effort by design: artifact writing must never break the caller.
        return ""


def _try_wandb_upload(path: str) -> None:
    """Attach the artifact to the active W&B run, if any; silent no-op otherwise."""
    if not os.environ.get("WANDB_RUN_ID"):
        # No W&B session advertised in the environment — nothing to do.
        return
    try:
        import wandb

        if wandb.run is None:
            return
        artifact = wandb.Artifact("alloc-profile", type="profile")
        artifact.add_file(path)
        wandb.run.log_artifact(artifact)
    except Exception:
        # Upload is opportunistic; failures are intentionally swallowed.
        pass
+ pass