alloc 0.0.1__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. alloc-0.0.3/PKG-INFO +190 -0
  2. alloc-0.0.3/README.md +162 -0
  3. {alloc-0.0.1 → alloc-0.0.3}/pyproject.toml +3 -4
  4. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/__init__.py +1 -1
  5. alloc-0.0.3/src/alloc/artifact_loader.py +179 -0
  6. alloc-0.0.3/src/alloc/browser_auth.py +189 -0
  7. alloc-0.0.3/src/alloc/callbacks.py +617 -0
  8. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/catalog/__init__.py +1 -2
  9. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/catalog/gpus.v1.json +17 -16
  10. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/cli.py +296 -16
  11. alloc-0.0.3/src/alloc/code_analyzer.py +882 -0
  12. alloc-0.0.3/src/alloc/diagnosis_display.py +677 -0
  13. alloc-0.0.3/src/alloc/diagnosis_engine.py +496 -0
  14. alloc-0.0.3/src/alloc/diagnosis_rules.py +1419 -0
  15. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/display.py +16 -222
  16. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/ghost.py +3 -3
  17. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/probe.py +3 -16
  18. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/stability.py +9 -20
  19. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/upload.py +5 -0
  20. alloc-0.0.3/src/alloc.egg-info/PKG-INFO +190 -0
  21. {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/SOURCES.txt +11 -0
  22. {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/requires.txt +1 -3
  23. {alloc-0.0.1 → alloc-0.0.3}/tests/test_artifact.py +19 -0
  24. alloc-0.0.3/tests/test_artifact_loader.py +251 -0
  25. alloc-0.0.3/tests/test_auth.py +307 -0
  26. alloc-0.0.3/tests/test_callbacks.py +583 -0
  27. alloc-0.0.3/tests/test_code_analyzer.py +612 -0
  28. alloc-0.0.3/tests/test_diagnose_cli.py +464 -0
  29. alloc-0.0.3/tests/test_diagnosis_engine.py +280 -0
  30. alloc-0.0.3/tests/test_diagnosis_rules.py +869 -0
  31. {alloc-0.0.1 → alloc-0.0.3}/tests/test_upload.py +68 -0
  32. alloc-0.0.3/tests/test_verdict.py +78 -0
  33. alloc-0.0.1/PKG-INFO +0 -256
  34. alloc-0.0.1/README.md +0 -226
  35. alloc-0.0.1/src/alloc/callbacks.py +0 -342
  36. alloc-0.0.1/src/alloc.egg-info/PKG-INFO +0 -256
  37. alloc-0.0.1/tests/test_auth.py +0 -155
  38. alloc-0.0.1/tests/test_callbacks.py +0 -330
  39. alloc-0.0.1/tests/test_verdict.py +0 -187
  40. {alloc-0.0.1 → alloc-0.0.3}/setup.cfg +0 -0
  41. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/artifact_writer.py +0 -0
  42. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/catalog/default_rate_card.json +0 -0
  43. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/config.py +0 -0
  44. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/context.py +0 -0
  45. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/extractor_runner.py +0 -0
  46. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/model_extractor.py +0 -0
  47. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/model_registry.py +0 -0
  48. {alloc-0.0.1 → alloc-0.0.3}/src/alloc/yaml_config.py +0 -0
  49. {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/dependency_links.txt +0 -0
  50. {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/entry_points.txt +0 -0
  51. {alloc-0.0.1 → alloc-0.0.3}/src/alloc.egg-info/top_level.txt +0 -0
  52. {alloc-0.0.1 → alloc-0.0.3}/tests/test_catalog.py +0 -0
  53. {alloc-0.0.1 → alloc-0.0.3}/tests/test_cli.py +0 -0
  54. {alloc-0.0.1 → alloc-0.0.3}/tests/test_context.py +0 -0
  55. {alloc-0.0.1 → alloc-0.0.3}/tests/test_ghost.py +0 -0
  56. {alloc-0.0.1 → alloc-0.0.3}/tests/test_init_from_org.py +0 -0
  57. {alloc-0.0.1 → alloc-0.0.3}/tests/test_model_extractor.py +0 -0
  58. {alloc-0.0.1 → alloc-0.0.3}/tests/test_probe_hw.py +0 -0
  59. {alloc-0.0.1 → alloc-0.0.3}/tests/test_probe_multi.py +0 -0
  60. {alloc-0.0.1 → alloc-0.0.3}/tests/test_stability.py +0 -0
  61. {alloc-0.0.1 → alloc-0.0.3}/tests/test_yaml_config.py +0 -0
alloc-0.0.3/PKG-INFO ADDED
@@ -0,0 +1,190 @@
1
+ Metadata-Version: 2.4
2
+ Name: alloc
3
+ Version: 0.0.3
4
+ Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
+ Author-email: Alloc Labs <hello@alloclabs.com>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://alloclabs.com
8
+ Project-URL: Repository, https://github.com/alloc-labs/alloc
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Requires-Python: >=3.9
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: typer>=0.9.0
20
+ Requires-Dist: rich>=13.0.0
21
+ Requires-Dist: httpx>=0.24.0
22
+ Requires-Dist: pydantic>=2.0.0
23
+ Requires-Dist: pyyaml>=6.0
24
+ Requires-Dist: pynvml>=11.5.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
27
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
28
+
29
+ # alloc
30
+
31
+ **Find and fix training bottlenecks. Zero code changes.**
32
+
33
+ [![PyPI](https://img.shields.io/pypi/v/alloc)](https://pypi.org/project/alloc/)
34
+ [![Python](https://img.shields.io/pypi/pyversions/alloc)](https://pypi.org/project/alloc/)
35
+ [![License](https://img.shields.io/badge/license-Apache--2.0-blue)](LICENSE)
36
+
37
+ ```bash
38
+ pip install alloc
39
+ alloc run python train.py
40
+ ```
41
+
42
+ ```
43
+ alloc v0.0.2 — Calibrate
44
+
45
+ Run Summary
46
+ Peak VRAM 31.2 GB / 40.0 GB (A100)
47
+ VRAM used 78.0%
48
+ Avg GPU util 72.3%
49
+ Avg power 287 W
50
+ Duration 24.1s (auto-stopped: metrics stable at 18.2s)
51
+ Step time 148.5 ms (p50) / 152.1 ms (p90)
52
+ Throughput 42.3 samples/sec
53
+
54
+ Artifact: alloc_artifact.json.gz
55
+ ```
56
+
57
+ That's it. No decorators, no config files, no code changes. Alloc wraps your command, profiles GPU usage, and tells you what's wrong.
58
+
59
+ ---
60
+
61
+ ## What you get
62
+
63
+ **`alloc diagnose`** reads your training script and tells you exactly what to change:
64
+
65
+ ```bash
66
+ alloc diagnose train.py
67
+ ```
68
+ ```
69
+ alloc diagnose — 3 findings in train.py
70
+
71
+ CRITICAL DL005 — DataLoader running in main thread
72
+ train.py:47 num_workers=0 → num_workers=8
73
+ num_workers=0 loads data in the main thread, blocking GPU computation entirely.
74
+ Expected impact: ~30-50% faster training with parallel data loading
75
+
76
+ WARNING PREC002 — Using fp16, consider bf16
77
+ train.py:56 dtype: float16 → dtype: bfloat16
78
+ H100 supports bf16 natively — eliminates loss scaling overhead.
79
+ Expected impact: ~5-10% speedup, eliminates GradScaler complexity
80
+
81
+ INFO THRU001 — cudnn.benchmark not enabled
82
+ Add: torch.backends.cudnn.benchmark = True
83
+ Expected impact: ~5-10% speedup for fixed-size inputs
84
+
85
+ Summary: 1 critical, 1 warning, 1 info
86
+ Run with --diff to generate patches | --json for CI output
87
+ ```
88
+
89
+ **`alloc ghost`** estimates VRAM before you launch:
90
+
91
+ ```bash
92
+ alloc ghost train.py --dtype bf16
93
+ ```
94
+ ```
95
+ Ghost Scan — 7.0B params (bf16)
96
+
97
+ Model weights 13.04 GB
98
+ Gradients 13.04 GB
99
+ Optimizer (Adam) 78.23 GB
100
+ Activations (est.) 0.50 GB
101
+ Buffer (10%) 10.48 GB
102
+
103
+ Total VRAM 115.28 GB
104
+ ```
105
+
106
+ **`alloc scan`** ranks GPU configs without a GPU:
107
+
108
+ ```bash
109
+ alloc scan --model llama-3-70b --gpu H100-80GB --num-gpus 8
110
+ ```
111
+
112
+ ---
113
+
114
+ ## Works with everything
115
+
116
+ Alloc wraps your launch command. No framework-specific setup required.
117
+
118
+ ```bash
119
+ alloc run python train.py
120
+ alloc run torchrun --nproc_per_node=4 train.py
121
+ alloc run accelerate launch train.py
122
+ alloc run srun python train.py # Slurm
123
+ alloc run ray job submit -- python train.py
124
+ ```
125
+
126
+ Multi-GPU detection is automatic (discovers all GPUs in the process tree).
127
+
128
+ ---
129
+
130
+ ## Deeper signals (optional)
131
+
132
+ Add a one-line callback for step-level timing:
133
+
134
+ ```python
135
+ # HuggingFace
136
+ from alloc import HuggingFaceCallback
137
+ trainer = Trainer(..., callbacks=[HuggingFaceCallback()])
138
+
139
+ # Lightning
140
+ from alloc import LightningCallback
141
+ trainer = Trainer(..., callbacks=[LightningCallback()])
142
+ ```
143
+
144
+ This unlocks step time p50/p90, throughput, and dataloader bottleneck detection.
145
+
146
+ ---
147
+
148
+ ## All commands
149
+
150
+ | Command | What it does |
151
+ |---------|-------------|
152
+ | `alloc run <cmd>` | Profile a training run (auto-stops when stable) |
153
+ | `alloc diagnose <script>` | AST analysis with specific fix suggestions |
154
+ | `alloc ghost <script>` | Estimate VRAM before launching |
155
+ | `alloc scan --model <name>` | Rank GPU configs remotely (no GPU needed) |
156
+ | `alloc catalog list` | Browse 13 GPUs with specs and pricing |
157
+ | `alloc init` | Configure GPU fleet and budget (`.alloc.yaml`) |
158
+ | `alloc login` | Authenticate for dashboard + auto-upload |
159
+
160
+ Every command supports `--json` for CI/CD integration.
161
+
162
+ ---
163
+
164
+ ## Dashboard
165
+
166
+ Log in to get team visibility, budget tracking, and optimization proposals:
167
+
168
+ ```bash
169
+ alloc login --browser
170
+ alloc run python train.py # auto-uploads when logged in
171
+ ```
172
+
173
+ Dashboard at [alloclabs.com](https://www.alloclabs.com)
174
+
175
+ ---
176
+
177
+ ## Design principles
178
+
179
+ 1. **Zero config** — `alloc run python train.py` works out of the box
180
+ 2. **Never crash training** — all Alloc failures are caught silently
181
+ 3. **No monkey-patching** — external monitoring only, deeper signals opt-in
182
+ 4. **Local-first** — works in air-gapped environments, no internet required
183
+
184
+ ---
185
+
186
+ ## Links
187
+
188
+ - [Website](https://www.alloclabs.com)
189
+ - [Documentation](https://www.alloclabs.com/docs)
190
+ - [PyPI](https://pypi.org/project/alloc/)
alloc-0.0.3/README.md ADDED
@@ -0,0 +1,162 @@
1
+ # alloc
2
+
3
+ **Find and fix training bottlenecks. Zero code changes.**
4
+
5
+ [![PyPI](https://img.shields.io/pypi/v/alloc)](https://pypi.org/project/alloc/)
6
+ [![Python](https://img.shields.io/pypi/pyversions/alloc)](https://pypi.org/project/alloc/)
7
+ [![License](https://img.shields.io/badge/license-Apache--2.0-blue)](LICENSE)
8
+
9
+ ```bash
10
+ pip install alloc
11
+ alloc run python train.py
12
+ ```
13
+
14
+ ```
15
+ alloc v0.0.2 — Calibrate
16
+
17
+ Run Summary
18
+ Peak VRAM 31.2 GB / 40.0 GB (A100)
19
+ VRAM used 78.0%
20
+ Avg GPU util 72.3%
21
+ Avg power 287 W
22
+ Duration 24.1s (auto-stopped: metrics stable at 18.2s)
23
+ Step time 148.5 ms (p50) / 152.1 ms (p90)
24
+ Throughput 42.3 samples/sec
25
+
26
+ Artifact: alloc_artifact.json.gz
27
+ ```
28
+
29
+ That's it. No decorators, no config files, no code changes. Alloc wraps your command, profiles GPU usage, and tells you what's wrong.
30
+
31
+ ---
32
+
33
+ ## What you get
34
+
35
+ **`alloc diagnose`** reads your training script and tells you exactly what to change:
36
+
37
+ ```bash
38
+ alloc diagnose train.py
39
+ ```
40
+ ```
41
+ alloc diagnose — 3 findings in train.py
42
+
43
+ CRITICAL DL005 — DataLoader running in main thread
44
+ train.py:47 num_workers=0 → num_workers=8
45
+ num_workers=0 loads data in the main thread, blocking GPU computation entirely.
46
+ Expected impact: ~30-50% faster training with parallel data loading
47
+
48
+ WARNING PREC002 — Using fp16, consider bf16
49
+ train.py:56 dtype: float16 → dtype: bfloat16
50
+ H100 supports bf16 natively — eliminates loss scaling overhead.
51
+ Expected impact: ~5-10% speedup, eliminates GradScaler complexity
52
+
53
+ INFO THRU001 — cudnn.benchmark not enabled
54
+ Add: torch.backends.cudnn.benchmark = True
55
+ Expected impact: ~5-10% speedup for fixed-size inputs
56
+
57
+ Summary: 1 critical, 1 warning, 1 info
58
+ Run with --diff to generate patches | --json for CI output
59
+ ```
60
+
61
+ **`alloc ghost`** estimates VRAM before you launch:
62
+
63
+ ```bash
64
+ alloc ghost train.py --dtype bf16
65
+ ```
66
+ ```
67
+ Ghost Scan — 7.0B params (bf16)
68
+
69
+ Model weights 13.04 GB
70
+ Gradients 13.04 GB
71
+ Optimizer (Adam) 78.23 GB
72
+ Activations (est.) 0.50 GB
73
+ Buffer (10%) 10.48 GB
74
+
75
+ Total VRAM 115.28 GB
76
+ ```
77
+
78
+ **`alloc scan`** ranks GPU configs without a GPU:
79
+
80
+ ```bash
81
+ alloc scan --model llama-3-70b --gpu H100-80GB --num-gpus 8
82
+ ```
83
+
84
+ ---
85
+
86
+ ## Works with everything
87
+
88
+ Alloc wraps your launch command. No framework-specific setup required.
89
+
90
+ ```bash
91
+ alloc run python train.py
92
+ alloc run torchrun --nproc_per_node=4 train.py
93
+ alloc run accelerate launch train.py
94
+ alloc run srun python train.py # Slurm
95
+ alloc run ray job submit -- python train.py
96
+ ```
97
+
98
+ Multi-GPU detection is automatic (discovers all GPUs in the process tree).
99
+
100
+ ---
101
+
102
+ ## Deeper signals (optional)
103
+
104
+ Add a one-line callback for step-level timing:
105
+
106
+ ```python
107
+ # HuggingFace
108
+ from alloc import HuggingFaceCallback
109
+ trainer = Trainer(..., callbacks=[HuggingFaceCallback()])
110
+
111
+ # Lightning
112
+ from alloc import LightningCallback
113
+ trainer = Trainer(..., callbacks=[LightningCallback()])
114
+ ```
115
+
116
+ This unlocks step time p50/p90, throughput, and dataloader bottleneck detection.
117
+
118
+ ---
119
+
120
+ ## All commands
121
+
122
+ | Command | What it does |
123
+ |---------|-------------|
124
+ | `alloc run <cmd>` | Profile a training run (auto-stops when stable) |
125
+ | `alloc diagnose <script>` | AST analysis with specific fix suggestions |
126
+ | `alloc ghost <script>` | Estimate VRAM before launching |
127
+ | `alloc scan --model <name>` | Rank GPU configs remotely (no GPU needed) |
128
+ | `alloc catalog list` | Browse 13 GPUs with specs and pricing |
129
+ | `alloc init` | Configure GPU fleet and budget (`.alloc.yaml`) |
130
+ | `alloc login` | Authenticate for dashboard + auto-upload |
131
+
132
+ Every command supports `--json` for CI/CD integration.
133
+
134
+ ---
135
+
136
+ ## Dashboard
137
+
138
+ Log in to get team visibility, budget tracking, and optimization proposals:
139
+
140
+ ```bash
141
+ alloc login --browser
142
+ alloc run python train.py # auto-uploads when logged in
143
+ ```
144
+
145
+ Dashboard at [alloclabs.com](https://www.alloclabs.com)
146
+
147
+ ---
148
+
149
+ ## Design principles
150
+
151
+ 1. **Zero config** — `alloc run python train.py` works out of the box
152
+ 2. **Never crash training** — all Alloc failures are caught silently
153
+ 3. **No monkey-patching** — external monitoring only, deeper signals opt-in
154
+ 4. **Local-first** — works in air-gapped environments, no internet required
155
+
156
+ ---
157
+
158
+ ## Links
159
+
160
+ - [Website](https://www.alloclabs.com)
161
+ - [Documentation](https://www.alloclabs.com/docs)
162
+ - [PyPI](https://pypi.org/project/alloc/)
@@ -4,18 +4,17 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alloc"
7
- version = "0.0.1"
7
+ version = "0.0.3"
8
8
  description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
11
- requires-python = ">=3.8"
11
+ requires-python = ">=3.9"
12
12
  authors = [{name = "Alloc Labs", email = "hello@alloclabs.com"}]
13
13
  classifiers = [
14
14
  "Development Status :: 3 - Alpha",
15
15
  "Intended Audience :: Developers",
16
16
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
17
17
  "Programming Language :: Python :: 3",
18
- "Programming Language :: Python :: 3.8",
19
18
  "Programming Language :: Python :: 3.9",
20
19
  "Programming Language :: Python :: 3.10",
21
20
  "Programming Language :: Python :: 3.11",
@@ -27,10 +26,10 @@ dependencies = [
27
26
  "httpx>=0.24.0",
28
27
  "pydantic>=2.0.0",
29
28
  "pyyaml>=6.0",
29
+ "pynvml>=11.5.0",
30
30
  ]
31
31
 
32
32
  [project.optional-dependencies]
33
- gpu = ["pynvml>=11.5.0"]
34
33
  dev = ["pytest>=7.0.0", "pytest-cov>=4.0.0"]
35
34
 
36
35
  [project.scripts]
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "0.0.1"
5
+ __version__ = "0.0.3"
6
6
 
7
7
  from alloc.ghost import ghost, GhostReport
8
8
  from alloc.callbacks import AllocCallback as HuggingFaceCallback
@@ -0,0 +1,179 @@
1
+ """Artifact loader — parse alloc_artifact.json.gz for runtime-enhanced diagnosis.
2
+
3
+ Loads the artifact created by `alloc run`, extracting GPU metrics, timing data,
4
+ and per-rank distributed information for use by Phase 2 diagnosis rules.
5
+
6
+ Never crashes. Returns None on any failure.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import glob
12
+ import gzip
13
+ import json
14
+ import os
15
+ from dataclasses import dataclass, field
16
+ from typing import Dict, List, Optional
17
+
18
+
19
@dataclass
class ArtifactData:
    """Parsed runtime artifact — structured for rule consumption.

    Every field defaults to None/empty so a partially-populated artifact
    (e.g. a run without callbacks installed) still yields a usable object;
    consumers must treat each Optional field as possibly missing.
    """

    # Hardware (from probe)
    gpu_name: Optional[str] = None
    gpu_count: int = 1  # defaults to 1 when the artifact omits a count
    per_gpu_vram_total_mb: Optional[float] = None
    per_gpu_vram_used_mb: Optional[List[float]] = None
    gpu_utilization_pct: Optional[List[float]] = None
    power_draw_w: Optional[List[float]] = None
    sm_version: Optional[str] = None
    interconnect: Optional[str] = None

    # Timing (from callbacks — may be None)
    step_times_ms: Optional[List[float]] = None
    step_time_p50_ms: Optional[float] = None
    step_time_p90_ms: Optional[float] = None
    throughput_samples_per_sec: Optional[float] = None
    dataloader_wait_pct: Optional[float] = None

    # Per-rank data (from distributed callbacks)
    per_rank_peak_vram_mb: Optional[List[float]] = None
    per_rank_step_times_ms: Optional[List[List[float]]] = None

    # Run metadata
    exit_code: Optional[int] = None
    duration_s: Optional[float] = None
    command: Optional[str] = None
    git_sha: Optional[str] = None
    is_oom: bool = False  # heuristic flag, set by _detect_oom during parse

    # Aggregate metrics (computed from samples)
    avg_gpu_util: Optional[float] = None
    peak_vram_mb: Optional[float] = None
54
+
55
+
56
def load_artifact(path: str) -> Optional[ArtifactData]:
    """Load and parse an alloc_artifact.json[.gz] file.

    Args:
        path: Path to the artifact; gzip decompression is selected by the
            ``.gz`` suffix.

    Returns:
        Parsed ``ArtifactData``, or None on any failure (missing file,
        bad gzip stream, invalid JSON, or malformed payload values).
    """
    # The module contract is "never crashes" — so parsing must sit inside
    # the guard too: _parse_artifact can raise on malformed payloads
    # (e.g. non-numeric entries in per_rank_peak_vram_mb hitting float()).
    try:
        if path.endswith(".gz"):
            with gzip.open(path, "rt", encoding="utf-8") as f:
                raw = json.load(f)
        else:
            with open(path, "r", encoding="utf-8") as f:
                raw = json.load(f)
        return _parse_artifact(raw)
    except Exception:
        return None
69
+
70
+
71
def find_artifact(directory: str = ".") -> Optional[str]:
    """Locate the newest alloc_artifact*.json[.gz] in *directory*.

    Returns the path of the most recently modified match, or None when
    no artifact is present.
    """
    matches: List[str] = []
    # Compressed artifacts are checked first, then plain JSON.
    for name in ("alloc_artifact*.json.gz", "alloc_artifact*.json"):
        matches.extend(glob.glob(os.path.join(directory, name)))

    if not matches:
        return None

    # Several runs may have left artifacts behind — newest mtime wins.
    return max(matches, key=os.path.getmtime)
86
+
87
+
88
def _parse_artifact(raw: dict) -> ArtifactData:
    """Parse raw artifact JSON into ArtifactData.

    Missing or unexpected sections never abort the parse: each field is
    read with .get() fallbacks, so absent data simply leaves the
    corresponding ArtifactData field at its default.
    """
    data = ArtifactData()

    # Top-level sections; `or {}` tolerates both missing keys and explicit nulls.
    probe = raw.get("probe") or {}
    hardware = raw.get("hardware") or {}
    context = raw.get("context") or {}

    # Hardware — hardware section takes precedence, probe is the fallback.
    data.gpu_name = hardware.get("gpu_name") or probe.get("gpu_name")
    gpu_count = gpu_count = hardware.get("num_gpus_detected")
    data.gpu_count = gpu_count if gpu_count is not None and gpu_count > 0 else 1
    data.per_gpu_vram_total_mb = _float_or_none(
        hardware.get("gpu_total_vram_mb") or probe.get("gpu_total_vram_mb")
    )
    data.sm_version = hardware.get("sm_version")
    data.interconnect = probe.get("interconnect_type")

    # Peak VRAM — from probe samples or direct field
    peak = _float_or_none(probe.get("peak_vram_mb"))
    data.peak_vram_mb = peak

    # Per-GPU VRAM: use per_rank_peak_vram_mb if available, else single peak.
    # NOTE: float(v) here can raise on non-numeric entries; load_artifact is
    # expected to guard the overall parse per the never-crash contract.
    per_rank = probe.get("per_rank_peak_vram_mb")
    if isinstance(per_rank, list) and per_rank:
        data.per_gpu_vram_used_mb = [float(v) for v in per_rank]
    elif peak is not None:
        data.per_gpu_vram_used_mb = [peak]

    # Mirror into the per-rank field so rules can use either name.
    data.per_rank_peak_vram_mb = data.per_gpu_vram_used_mb

    # GPU utilization / power from the probe's time-series samples.
    samples = probe.get("samples") or []
    if samples:
        utils = [s.get("gpu_util_pct") for s in samples if s.get("gpu_util_pct") is not None]
        if utils:
            data.gpu_utilization_pct = [float(u) for u in utils]
            data.avg_gpu_util = sum(data.gpu_utilization_pct) / len(data.gpu_utilization_pct)

        powers = [s.get("power_w") for s in samples if s.get("power_w") is not None]
        if powers:
            data.power_draw_w = [float(p) for p in powers]

    # Fall back to the probe's own aggregate when samples gave no average.
    if data.avg_gpu_util is None:
        data.avg_gpu_util = _float_or_none(probe.get("avg_gpu_util"))

    # Timing (from callback sidecar data merged into probe)
    data.step_time_p50_ms = _float_or_none(probe.get("step_time_ms_p50"))
    data.step_time_p90_ms = _float_or_none(probe.get("step_time_ms_p90"))
    data.throughput_samples_per_sec = _float_or_none(probe.get("samples_per_sec"))
    data.dataloader_wait_pct = _float_or_none(probe.get("dataloader_wait_pct"))

    # Run metadata
    data.exit_code = probe.get("exit_code")
    data.duration_s = _float_or_none(probe.get("duration_seconds"))
    data.command = probe.get("command")

    # Git SHA from context
    git_ctx = context.get("git") or {}
    data.git_sha = git_ctx.get("commit_sha")

    # OOM detection: exit_code != 0 AND VRAM utilization > 95%
    data.is_oom = _detect_oom(data)

    return data
154
+
155
+
156
def _detect_oom(data: ArtifactData) -> bool:
    """Heuristic OOM check: nonzero exit code plus >95% peak VRAM utilization."""
    # A clean (or unknown) exit is never treated as OOM.
    if not data.exit_code:
        return False

    used = data.per_gpu_vram_used_mb
    total = data.per_gpu_vram_total_mb
    if not used or not total or total <= 0:
        return False

    # Judge by the busiest GPU — one saturated rank is enough to OOM.
    return max(used) / total > 0.95
170
+
171
+
172
def _float_or_none(val) -> Optional[float]:
    """Coerce *val* to float; return None for None or unconvertible values."""
    try:
        return None if val is None else float(val)
    except (TypeError, ValueError):
        return None