alloc 0.0.1__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alloc-0.0.3/PKG-INFO +190 -0
- alloc-0.0.3/README.md +162 -0
- {alloc-0.0.1 → alloc-0.0.3}/pyproject.toml +3 -4
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/__init__.py +1 -1
- alloc-0.0.3/src/alloc/artifact_loader.py +179 -0
- alloc-0.0.3/src/alloc/browser_auth.py +189 -0
- alloc-0.0.3/src/alloc/callbacks.py +617 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/catalog/__init__.py +1 -2
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/catalog/gpus.v1.json +17 -16
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/cli.py +296 -16
- alloc-0.0.3/src/alloc/code_analyzer.py +882 -0
- alloc-0.0.3/src/alloc/diagnosis_display.py +677 -0
- alloc-0.0.3/src/alloc/diagnosis_engine.py +496 -0
- alloc-0.0.3/src/alloc/diagnosis_rules.py +1419 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/display.py +16 -222
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/ghost.py +3 -3
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/probe.py +3 -16
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/stability.py +9 -20
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/upload.py +5 -0
- alloc-0.0.3/src/alloc.egg-info/PKG-INFO +190 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/SOURCES.txt +11 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/requires.txt +1 -3
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_artifact.py +19 -0
- alloc-0.0.3/tests/test_artifact_loader.py +251 -0
- alloc-0.0.3/tests/test_auth.py +307 -0
- alloc-0.0.3/tests/test_callbacks.py +583 -0
- alloc-0.0.3/tests/test_code_analyzer.py +612 -0
- alloc-0.0.3/tests/test_diagnose_cli.py +464 -0
- alloc-0.0.3/tests/test_diagnosis_engine.py +280 -0
- alloc-0.0.3/tests/test_diagnosis_rules.py +869 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_upload.py +68 -0
- alloc-0.0.3/tests/test_verdict.py +78 -0
- alloc-0.0.1/PKG-INFO +0 -256
- alloc-0.0.1/README.md +0 -226
- alloc-0.0.1/src/alloc/callbacks.py +0 -342
- alloc-0.0.1/src/alloc.egg-info/PKG-INFO +0 -256
- alloc-0.0.1/tests/test_auth.py +0 -155
- alloc-0.0.1/tests/test_callbacks.py +0 -330
- alloc-0.0.1/tests/test_verdict.py +0 -187
- {alloc-0.0.1 → alloc-0.0.3}/setup.cfg +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/artifact_writer.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/catalog/default_rate_card.json +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/config.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/context.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/extractor_runner.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/model_extractor.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/model_registry.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc/yaml_config.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/dependency_links.txt +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/entry_points.txt +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/top_level.txt +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_catalog.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_cli.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_context.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_ghost.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_init_from_org.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_model_extractor.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_probe_hw.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_probe_multi.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_stability.py +0 -0
- {alloc-0.0.1 → alloc-0.0.3}/tests/test_yaml_config.py +0 -0
alloc-0.0.3/PKG-INFO
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: alloc
|
|
3
|
+
Version: 0.0.3
|
|
4
|
+
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
5
|
+
Author-email: Alloc Labs <hello@alloclabs.com>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://alloclabs.com
|
|
8
|
+
Project-URL: Repository, https://github.com/alloc-labs/alloc
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: typer>=0.9.0
|
|
20
|
+
Requires-Dist: rich>=13.0.0
|
|
21
|
+
Requires-Dist: httpx>=0.24.0
|
|
22
|
+
Requires-Dist: pydantic>=2.0.0
|
|
23
|
+
Requires-Dist: pyyaml>=6.0
|
|
24
|
+
Requires-Dist: pynvml>=11.5.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
28
|
+
|
|
29
|
+
# alloc
|
|
30
|
+
|
|
31
|
+
**Find and fix training bottlenecks. Zero code changes.**
|
|
32
|
+
|
|
33
|
+
[](https://pypi.org/project/alloc/)
|
|
34
|
+
[](https://pypi.org/project/alloc/)
|
|
35
|
+
[](LICENSE)
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install alloc
|
|
39
|
+
alloc run python train.py
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
alloc v0.0.2 — Calibrate
|
|
44
|
+
|
|
45
|
+
Run Summary
|
|
46
|
+
Peak VRAM 31.2 GB / 40.0 GB (A100)
|
|
47
|
+
VRAM used 78.0%
|
|
48
|
+
Avg GPU util 72.3%
|
|
49
|
+
Avg power 287 W
|
|
50
|
+
Duration 24.1s (auto-stopped: metrics stable at 18.2s)
|
|
51
|
+
Step time 148.5 ms (p50) / 152.1 ms (p90)
|
|
52
|
+
Throughput 42.3 samples/sec
|
|
53
|
+
|
|
54
|
+
Artifact: alloc_artifact.json.gz
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
That's it. No decorators, no config files, no code changes. Alloc wraps your command, profiles GPU usage, and tells you what's wrong.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## What you get
|
|
62
|
+
|
|
63
|
+
**`alloc diagnose`** reads your training script and tells you exactly what to change:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
alloc diagnose train.py
|
|
67
|
+
```
|
|
68
|
+
```
|
|
69
|
+
alloc diagnose — 3 findings in train.py
|
|
70
|
+
|
|
71
|
+
CRITICAL DL005 — DataLoader running in main thread
|
|
72
|
+
train.py:47 num_workers=0 → num_workers=8
|
|
73
|
+
num_workers=0 loads data in the main thread, blocking GPU computation entirely.
|
|
74
|
+
Expected impact: ~30-50% faster training with parallel data loading
|
|
75
|
+
|
|
76
|
+
WARNING PREC002 — Using fp16, consider bf16
|
|
77
|
+
train.py:56 dtype: float16 → dtype: bfloat16
|
|
78
|
+
H100 supports bf16 natively — eliminates loss scaling overhead.
|
|
79
|
+
Expected impact: ~5-10% speedup, eliminates GradScaler complexity
|
|
80
|
+
|
|
81
|
+
INFO THRU001 — cudnn.benchmark not enabled
|
|
82
|
+
Add: torch.backends.cudnn.benchmark = True
|
|
83
|
+
Expected impact: ~5-10% speedup for fixed-size inputs
|
|
84
|
+
|
|
85
|
+
Summary: 1 critical, 1 warning, 1 info
|
|
86
|
+
Run with --diff to generate patches | --json for CI output
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
**`alloc ghost`** estimates VRAM before you launch:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
alloc ghost train.py --dtype bf16
|
|
93
|
+
```
|
|
94
|
+
```
|
|
95
|
+
Ghost Scan — 7.0B params (bf16)
|
|
96
|
+
|
|
97
|
+
Model weights 13.04 GB
|
|
98
|
+
Gradients 13.04 GB
|
|
99
|
+
Optimizer (Adam) 78.23 GB
|
|
100
|
+
Activations (est.) 0.50 GB
|
|
101
|
+
Buffer (10%) 10.48 GB
|
|
102
|
+
|
|
103
|
+
Total VRAM 115.28 GB
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**`alloc scan`** ranks GPU configs without a GPU:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
alloc scan --model llama-3-70b --gpu H100-80GB --num-gpus 8
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Works with everything
|
|
115
|
+
|
|
116
|
+
Alloc wraps your launch command. No framework-specific setup required.
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
alloc run python train.py
|
|
120
|
+
alloc run torchrun --nproc_per_node=4 train.py
|
|
121
|
+
alloc run accelerate launch train.py
|
|
122
|
+
alloc run srun python train.py # Slurm
|
|
123
|
+
alloc run ray job submit -- python train.py
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Multi-GPU detection is automatic (discovers all GPUs in the process tree).
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Deeper signals (optional)
|
|
131
|
+
|
|
132
|
+
Add a one-line callback for step-level timing:
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
# HuggingFace
|
|
136
|
+
from alloc import HuggingFaceCallback
|
|
137
|
+
trainer = Trainer(..., callbacks=[HuggingFaceCallback()])
|
|
138
|
+
|
|
139
|
+
# Lightning
|
|
140
|
+
from alloc import LightningCallback
|
|
141
|
+
trainer = Trainer(..., callbacks=[LightningCallback()])
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
This unlocks step time p50/p90, throughput, and dataloader bottleneck detection.
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## All commands
|
|
149
|
+
|
|
150
|
+
| Command | What it does |
|
|
151
|
+
|---------|-------------|
|
|
152
|
+
| `alloc run <cmd>` | Profile a training run (auto-stops when stable) |
|
|
153
|
+
| `alloc diagnose <script>` | AST analysis with specific fix suggestions |
|
|
154
|
+
| `alloc ghost <script>` | Estimate VRAM before launching |
|
|
155
|
+
| `alloc scan --model <name>` | Rank GPU configs remotely (no GPU needed) |
|
|
156
|
+
| `alloc catalog list` | Browse 13 GPUs with specs and pricing |
|
|
157
|
+
| `alloc init` | Configure GPU fleet and budget (`.alloc.yaml`) |
|
|
158
|
+
| `alloc login` | Authenticate for dashboard + auto-upload |
|
|
159
|
+
|
|
160
|
+
Every command supports `--json` for CI/CD integration.
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## Dashboard
|
|
165
|
+
|
|
166
|
+
Log in to get team visibility, budget tracking, and optimization proposals:
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
alloc login --browser
|
|
170
|
+
alloc run python train.py # auto-uploads when logged in
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Dashboard at [alloclabs.com](https://www.alloclabs.com)
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Design principles
|
|
178
|
+
|
|
179
|
+
1. **Zero config** — `alloc run python train.py` works out of the box
|
|
180
|
+
2. **Never crash training** — all Alloc failures are caught silently
|
|
181
|
+
3. **No monkey-patching** — external monitoring only, deeper signals opt-in
|
|
182
|
+
4. **Local-first** — works in air-gapped environments, no internet required
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## Links
|
|
187
|
+
|
|
188
|
+
- [Website](https://www.alloclabs.com)
|
|
189
|
+
- [Documentation](https://www.alloclabs.com/docs)
|
|
190
|
+
- [PyPI](https://pypi.org/project/alloc/)
|
alloc-0.0.3/README.md
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# alloc
|
|
2
|
+
|
|
3
|
+
**Find and fix training bottlenecks. Zero code changes.**
|
|
4
|
+
|
|
5
|
+
[](https://pypi.org/project/alloc/)
|
|
6
|
+
[](https://pypi.org/project/alloc/)
|
|
7
|
+
[](LICENSE)
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install alloc
|
|
11
|
+
alloc run python train.py
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
alloc v0.0.2 — Calibrate
|
|
16
|
+
|
|
17
|
+
Run Summary
|
|
18
|
+
Peak VRAM 31.2 GB / 40.0 GB (A100)
|
|
19
|
+
VRAM used 78.0%
|
|
20
|
+
Avg GPU util 72.3%
|
|
21
|
+
Avg power 287 W
|
|
22
|
+
Duration 24.1s (auto-stopped: metrics stable at 18.2s)
|
|
23
|
+
Step time 148.5 ms (p50) / 152.1 ms (p90)
|
|
24
|
+
Throughput 42.3 samples/sec
|
|
25
|
+
|
|
26
|
+
Artifact: alloc_artifact.json.gz
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
That's it. No decorators, no config files, no code changes. Alloc wraps your command, profiles GPU usage, and tells you what's wrong.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## What you get
|
|
34
|
+
|
|
35
|
+
**`alloc diagnose`** reads your training script and tells you exactly what to change:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
alloc diagnose train.py
|
|
39
|
+
```
|
|
40
|
+
```
|
|
41
|
+
alloc diagnose — 3 findings in train.py
|
|
42
|
+
|
|
43
|
+
CRITICAL DL005 — DataLoader running in main thread
|
|
44
|
+
train.py:47 num_workers=0 → num_workers=8
|
|
45
|
+
num_workers=0 loads data in the main thread, blocking GPU computation entirely.
|
|
46
|
+
Expected impact: ~30-50% faster training with parallel data loading
|
|
47
|
+
|
|
48
|
+
WARNING PREC002 — Using fp16, consider bf16
|
|
49
|
+
train.py:56 dtype: float16 → dtype: bfloat16
|
|
50
|
+
H100 supports bf16 natively — eliminates loss scaling overhead.
|
|
51
|
+
Expected impact: ~5-10% speedup, eliminates GradScaler complexity
|
|
52
|
+
|
|
53
|
+
INFO THRU001 — cudnn.benchmark not enabled
|
|
54
|
+
Add: torch.backends.cudnn.benchmark = True
|
|
55
|
+
Expected impact: ~5-10% speedup for fixed-size inputs
|
|
56
|
+
|
|
57
|
+
Summary: 1 critical, 1 warning, 1 info
|
|
58
|
+
Run with --diff to generate patches | --json for CI output
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**`alloc ghost`** estimates VRAM before you launch:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
alloc ghost train.py --dtype bf16
|
|
65
|
+
```
|
|
66
|
+
```
|
|
67
|
+
Ghost Scan — 7.0B params (bf16)
|
|
68
|
+
|
|
69
|
+
Model weights 13.04 GB
|
|
70
|
+
Gradients 13.04 GB
|
|
71
|
+
Optimizer (Adam) 78.23 GB
|
|
72
|
+
Activations (est.) 0.50 GB
|
|
73
|
+
Buffer (10%) 10.48 GB
|
|
74
|
+
|
|
75
|
+
Total VRAM 115.28 GB
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
**`alloc scan`** ranks GPU configs without a GPU:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
alloc scan --model llama-3-70b --gpu H100-80GB --num-gpus 8
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Works with everything
|
|
87
|
+
|
|
88
|
+
Alloc wraps your launch command. No framework-specific setup required.
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
alloc run python train.py
|
|
92
|
+
alloc run torchrun --nproc_per_node=4 train.py
|
|
93
|
+
alloc run accelerate launch train.py
|
|
94
|
+
alloc run srun python train.py # Slurm
|
|
95
|
+
alloc run ray job submit -- python train.py
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Multi-GPU detection is automatic (discovers all GPUs in the process tree).
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Deeper signals (optional)
|
|
103
|
+
|
|
104
|
+
Add a one-line callback for step-level timing:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
# HuggingFace
|
|
108
|
+
from alloc import HuggingFaceCallback
|
|
109
|
+
trainer = Trainer(..., callbacks=[HuggingFaceCallback()])
|
|
110
|
+
|
|
111
|
+
# Lightning
|
|
112
|
+
from alloc import LightningCallback
|
|
113
|
+
trainer = Trainer(..., callbacks=[LightningCallback()])
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
This unlocks step time p50/p90, throughput, and dataloader bottleneck detection.
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## All commands
|
|
121
|
+
|
|
122
|
+
| Command | What it does |
|
|
123
|
+
|---------|-------------|
|
|
124
|
+
| `alloc run <cmd>` | Profile a training run (auto-stops when stable) |
|
|
125
|
+
| `alloc diagnose <script>` | AST analysis with specific fix suggestions |
|
|
126
|
+
| `alloc ghost <script>` | Estimate VRAM before launching |
|
|
127
|
+
| `alloc scan --model <name>` | Rank GPU configs remotely (no GPU needed) |
|
|
128
|
+
| `alloc catalog list` | Browse 13 GPUs with specs and pricing |
|
|
129
|
+
| `alloc init` | Configure GPU fleet and budget (`.alloc.yaml`) |
|
|
130
|
+
| `alloc login` | Authenticate for dashboard + auto-upload |
|
|
131
|
+
|
|
132
|
+
Every command supports `--json` for CI/CD integration.
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Dashboard
|
|
137
|
+
|
|
138
|
+
Log in to get team visibility, budget tracking, and optimization proposals:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
alloc login --browser
|
|
142
|
+
alloc run python train.py # auto-uploads when logged in
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Dashboard at [alloclabs.com](https://www.alloclabs.com)
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Design principles
|
|
150
|
+
|
|
151
|
+
1. **Zero config** — `alloc run python train.py` works out of the box
|
|
152
|
+
2. **Never crash training** — all Alloc failures are caught silently
|
|
153
|
+
3. **No monkey-patching** — external monitoring only, deeper signals opt-in
|
|
154
|
+
4. **Local-first** — works in air-gapped environments, no internet required
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Links
|
|
159
|
+
|
|
160
|
+
- [Website](https://www.alloclabs.com)
|
|
161
|
+
- [Documentation](https://www.alloclabs.com/docs)
|
|
162
|
+
- [PyPI](https://pypi.org/project/alloc/)
|
|
@@ -4,18 +4,17 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "alloc"
|
|
7
|
-
version = "0.0.1"
|
|
7
|
+
version = "0.0.3"
|
|
8
8
|
description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "Apache-2.0"
|
|
11
|
-
requires-python = ">=3.8"
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
12
|
authors = [{name = "Alloc Labs", email = "hello@alloclabs.com"}]
|
|
13
13
|
classifiers = [
|
|
14
14
|
"Development Status :: 3 - Alpha",
|
|
15
15
|
"Intended Audience :: Developers",
|
|
16
16
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
17
17
|
"Programming Language :: Python :: 3",
|
|
18
|
-
"Programming Language :: Python :: 3.8",
|
|
19
18
|
"Programming Language :: Python :: 3.9",
|
|
20
19
|
"Programming Language :: Python :: 3.10",
|
|
21
20
|
"Programming Language :: Python :: 3.11",
|
|
@@ -27,10 +26,10 @@ dependencies = [
|
|
|
27
26
|
"httpx>=0.24.0",
|
|
28
27
|
"pydantic>=2.0.0",
|
|
29
28
|
"pyyaml>=6.0",
|
|
29
|
+
"pynvml>=11.5.0",
|
|
30
30
|
]
|
|
31
31
|
|
|
32
32
|
[project.optional-dependencies]
|
|
33
|
-
gpu = ["pynvml>=11.5.0"]
|
|
34
33
|
dev = ["pytest>=7.0.0", "pytest-cov>=4.0.0"]
|
|
35
34
|
|
|
36
35
|
[project.scripts]
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Artifact loader — parse alloc_artifact.json.gz for runtime-enhanced diagnosis.
|
|
2
|
+
|
|
3
|
+
Loads the artifact created by `alloc run`, extracting GPU metrics, timing data,
|
|
4
|
+
and per-rank distributed information for use by Phase 2 diagnosis rules.
|
|
5
|
+
|
|
6
|
+
Never crashes. Returns None on any failure.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import glob
|
|
12
|
+
import gzip
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import Dict, List, Optional
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class ArtifactData:
|
|
21
|
+
"""Parsed runtime artifact — structured for rule consumption."""
|
|
22
|
+
|
|
23
|
+
# Hardware (from probe)
|
|
24
|
+
gpu_name: Optional[str] = None
|
|
25
|
+
gpu_count: int = 1
|
|
26
|
+
per_gpu_vram_total_mb: Optional[float] = None
|
|
27
|
+
per_gpu_vram_used_mb: Optional[List[float]] = None
|
|
28
|
+
gpu_utilization_pct: Optional[List[float]] = None
|
|
29
|
+
power_draw_w: Optional[List[float]] = None
|
|
30
|
+
sm_version: Optional[str] = None
|
|
31
|
+
interconnect: Optional[str] = None
|
|
32
|
+
|
|
33
|
+
# Timing (from callbacks — may be None)
|
|
34
|
+
step_times_ms: Optional[List[float]] = None
|
|
35
|
+
step_time_p50_ms: Optional[float] = None
|
|
36
|
+
step_time_p90_ms: Optional[float] = None
|
|
37
|
+
throughput_samples_per_sec: Optional[float] = None
|
|
38
|
+
dataloader_wait_pct: Optional[float] = None
|
|
39
|
+
|
|
40
|
+
# Per-rank data (from distributed callbacks)
|
|
41
|
+
per_rank_peak_vram_mb: Optional[List[float]] = None
|
|
42
|
+
per_rank_step_times_ms: Optional[List[List[float]]] = None
|
|
43
|
+
|
|
44
|
+
# Run metadata
|
|
45
|
+
exit_code: Optional[int] = None
|
|
46
|
+
duration_s: Optional[float] = None
|
|
47
|
+
command: Optional[str] = None
|
|
48
|
+
git_sha: Optional[str] = None
|
|
49
|
+
is_oom: bool = False
|
|
50
|
+
|
|
51
|
+
# Aggregate metrics (computed from samples)
|
|
52
|
+
avg_gpu_util: Optional[float] = None
|
|
53
|
+
peak_vram_mb: Optional[float] = None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def load_artifact(path: str) -> Optional[ArtifactData]:
|
|
57
|
+
"""Load and parse alloc_artifact.json.gz. Returns None on any failure."""
|
|
58
|
+
try:
|
|
59
|
+
if path.endswith(".gz"):
|
|
60
|
+
with gzip.open(path, "rt", encoding="utf-8") as f:
|
|
61
|
+
raw = json.load(f)
|
|
62
|
+
else:
|
|
63
|
+
with open(path, "r", encoding="utf-8") as f:
|
|
64
|
+
raw = json.load(f)
|
|
65
|
+
except Exception:
|
|
66
|
+
return None
|
|
67
|
+
|
|
68
|
+
return _parse_artifact(raw)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def find_artifact(directory: str = ".") -> Optional[str]:
|
|
72
|
+
"""Find most recent alloc_artifact*.json.gz in directory. Returns path or None."""
|
|
73
|
+
patterns = [
|
|
74
|
+
os.path.join(directory, "alloc_artifact*.json.gz"),
|
|
75
|
+
os.path.join(directory, "alloc_artifact*.json"),
|
|
76
|
+
]
|
|
77
|
+
candidates = [] # type: List[str]
|
|
78
|
+
for pattern in patterns:
|
|
79
|
+
candidates.extend(glob.glob(pattern))
|
|
80
|
+
|
|
81
|
+
if not candidates:
|
|
82
|
+
return None
|
|
83
|
+
|
|
84
|
+
# Sort by modification time, return newest
|
|
85
|
+
return max(candidates, key=os.path.getmtime)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _parse_artifact(raw: dict) -> ArtifactData:
|
|
89
|
+
"""Parse raw artifact JSON into ArtifactData."""
|
|
90
|
+
data = ArtifactData()
|
|
91
|
+
|
|
92
|
+
probe = raw.get("probe") or {}
|
|
93
|
+
hardware = raw.get("hardware") or {}
|
|
94
|
+
context = raw.get("context") or {}
|
|
95
|
+
|
|
96
|
+
# Hardware
|
|
97
|
+
data.gpu_name = hardware.get("gpu_name") or probe.get("gpu_name")
|
|
98
|
+
gpu_count = hardware.get("num_gpus_detected")
|
|
99
|
+
data.gpu_count = gpu_count if gpu_count is not None and gpu_count > 0 else 1
|
|
100
|
+
data.per_gpu_vram_total_mb = _float_or_none(
|
|
101
|
+
hardware.get("gpu_total_vram_mb") or probe.get("gpu_total_vram_mb")
|
|
102
|
+
)
|
|
103
|
+
data.sm_version = hardware.get("sm_version")
|
|
104
|
+
data.interconnect = probe.get("interconnect_type")
|
|
105
|
+
|
|
106
|
+
# Peak VRAM — from probe samples or direct field
|
|
107
|
+
peak = _float_or_none(probe.get("peak_vram_mb"))
|
|
108
|
+
data.peak_vram_mb = peak
|
|
109
|
+
|
|
110
|
+
# Per-GPU VRAM: use per_rank_peak_vram_mb if available, else single peak
|
|
111
|
+
per_rank = probe.get("per_rank_peak_vram_mb")
|
|
112
|
+
if isinstance(per_rank, list) and per_rank:
|
|
113
|
+
data.per_gpu_vram_used_mb = [float(v) for v in per_rank]
|
|
114
|
+
elif peak is not None:
|
|
115
|
+
data.per_gpu_vram_used_mb = [peak]
|
|
116
|
+
|
|
117
|
+
data.per_rank_peak_vram_mb = data.per_gpu_vram_used_mb
|
|
118
|
+
|
|
119
|
+
# GPU utilization from samples
|
|
120
|
+
samples = probe.get("samples") or []
|
|
121
|
+
if samples:
|
|
122
|
+
utils = [s.get("gpu_util_pct") for s in samples if s.get("gpu_util_pct") is not None]
|
|
123
|
+
if utils:
|
|
124
|
+
data.gpu_utilization_pct = [float(u) for u in utils]
|
|
125
|
+
data.avg_gpu_util = sum(data.gpu_utilization_pct) / len(data.gpu_utilization_pct)
|
|
126
|
+
|
|
127
|
+
powers = [s.get("power_w") for s in samples if s.get("power_w") is not None]
|
|
128
|
+
if powers:
|
|
129
|
+
data.power_draw_w = [float(p) for p in powers]
|
|
130
|
+
|
|
131
|
+
# Avg GPU util from probe aggregate
|
|
132
|
+
if data.avg_gpu_util is None:
|
|
133
|
+
data.avg_gpu_util = _float_or_none(probe.get("avg_gpu_util"))
|
|
134
|
+
|
|
135
|
+
# Timing (from callback sidecar data merged into probe)
|
|
136
|
+
data.step_time_p50_ms = _float_or_none(probe.get("step_time_ms_p50"))
|
|
137
|
+
data.step_time_p90_ms = _float_or_none(probe.get("step_time_ms_p90"))
|
|
138
|
+
data.throughput_samples_per_sec = _float_or_none(probe.get("samples_per_sec"))
|
|
139
|
+
data.dataloader_wait_pct = _float_or_none(probe.get("dataloader_wait_pct"))
|
|
140
|
+
|
|
141
|
+
# Run metadata
|
|
142
|
+
data.exit_code = probe.get("exit_code")
|
|
143
|
+
data.duration_s = _float_or_none(probe.get("duration_seconds"))
|
|
144
|
+
data.command = probe.get("command")
|
|
145
|
+
|
|
146
|
+
# Git SHA from context
|
|
147
|
+
git_ctx = context.get("git") or {}
|
|
148
|
+
data.git_sha = git_ctx.get("commit_sha")
|
|
149
|
+
|
|
150
|
+
# OOM detection: exit_code != 0 AND VRAM utilization > 95%
|
|
151
|
+
data.is_oom = _detect_oom(data)
|
|
152
|
+
|
|
153
|
+
return data
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _detect_oom(data: ArtifactData) -> bool:
|
|
157
|
+
"""Detect probable OOM from exit code and VRAM utilization."""
|
|
158
|
+
if data.exit_code is None or data.exit_code == 0:
|
|
159
|
+
return False
|
|
160
|
+
|
|
161
|
+
if data.per_gpu_vram_used_mb and data.per_gpu_vram_total_mb:
|
|
162
|
+
total = data.per_gpu_vram_total_mb
|
|
163
|
+
if total > 0:
|
|
164
|
+
max_used = max(data.per_gpu_vram_used_mb)
|
|
165
|
+
utilization = max_used / total
|
|
166
|
+
if utilization > 0.95:
|
|
167
|
+
return True
|
|
168
|
+
|
|
169
|
+
return False
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _float_or_none(val) -> Optional[float]:
|
|
173
|
+
"""Convert a value to float, returning None on failure."""
|
|
174
|
+
if val is None:
|
|
175
|
+
return None
|
|
176
|
+
try:
|
|
177
|
+
return float(val)
|
|
178
|
+
except (ValueError, TypeError):
|
|
179
|
+
return None
|