alloc 0.0.3__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {alloc-0.0.3 → alloc-0.0.5}/PKG-INFO +1 -1
- {alloc-0.0.3 → alloc-0.0.5}/pyproject.toml +1 -1
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/__init__.py +1 -1
- alloc-0.0.5/src/alloc/artifact_loader.py +330 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/artifact_writer.py +4 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/browser_auth.py +28 -6
- alloc-0.0.5/src/alloc/callbacks.py +1204 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/catalog/gpus.v1.json +44 -31
- alloc-0.0.5/src/alloc/cli.py +3576 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/code_analyzer.py +161 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/config.py +3 -2
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/diagnosis_display.py +102 -30
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/diagnosis_engine.py +83 -7
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/diagnosis_rules.py +364 -31
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/display.py +18 -1
- alloc-0.0.5/src/alloc/extractor_runner.py +285 -0
- alloc-0.0.5/src/alloc/ghost.py +304 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/model_extractor.py +10 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/model_registry.py +26 -2
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/probe.py +85 -11
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/upload.py +39 -4
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/yaml_config.py +2 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/PKG-INFO +1 -1
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/SOURCES.txt +2 -0
- alloc-0.0.5/tests/test_artifact_loader.py +568 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_auth.py +14 -7
- alloc-0.0.5/tests/test_callbacks.py +1191 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_cli.py +128 -0
- alloc-0.0.5/tests/test_diagnose_cli.py +1313 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_diagnosis_engine.py +82 -1
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_diagnosis_rules.py +346 -3
- alloc-0.0.5/tests/test_extractor_activation.py +178 -0
- alloc-0.0.5/tests/test_ghost.py +304 -0
- alloc-0.0.5/tests/test_interconnect.py +182 -0
- alloc-0.0.3/src/alloc/artifact_loader.py +0 -179
- alloc-0.0.3/src/alloc/callbacks.py +0 -617
- alloc-0.0.3/src/alloc/cli.py +0 -1621
- alloc-0.0.3/src/alloc/extractor_runner.py +0 -141
- alloc-0.0.3/src/alloc/ghost.py +0 -167
- alloc-0.0.3/tests/test_artifact_loader.py +0 -251
- alloc-0.0.3/tests/test_callbacks.py +0 -583
- alloc-0.0.3/tests/test_diagnose_cli.py +0 -464
- alloc-0.0.3/tests/test_ghost.py +0 -82
- {alloc-0.0.3 → alloc-0.0.5}/README.md +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/setup.cfg +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/catalog/__init__.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/catalog/default_rate_card.json +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/context.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc/stability.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/dependency_links.txt +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/entry_points.txt +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/requires.txt +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/top_level.txt +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_artifact.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_catalog.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_code_analyzer.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_context.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_init_from_org.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_model_extractor.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_probe_hw.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_probe_multi.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_stability.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_upload.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_verdict.py +0 -0
- {alloc-0.0.3 → alloc-0.0.5}/tests/test_yaml_config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: alloc
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.0.5
|
|
4
4
|
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
5
5
|
Author-email: Alloc Labs <hello@alloclabs.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "alloc"
|
|
7
|
-
version = "0.0.3"
|
|
7
|
+
version = "0.0.5"
|
|
8
8
|
description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "Apache-2.0"
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
"""Artifact loader — parse alloc_artifact.json.gz for runtime-enhanced diagnosis.
|
|
2
|
+
|
|
3
|
+
Loads the artifact created by `alloc run`, extracting GPU metrics, timing data,
|
|
4
|
+
and per-rank distributed information for use by Phase 2 diagnosis rules.
|
|
5
|
+
|
|
6
|
+
Never crashes. Returns None on any failure.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import glob
|
|
12
|
+
import gzip
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import Dict, List, Optional
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class ArtifactData:
    """Runtime artifact contents, flattened into one record for diagnosis rules.

    Every field carries a default, so ``ArtifactData()`` is a valid "nothing
    known" instance. Optional fields stay ``None`` when the artifact did not
    provide a usable value.
    """

    # -- Hardware (reported by the probe) ----------------------------------
    gpu_name: Optional[str] = None
    gpu_count: int = 1
    per_gpu_vram_total_mb: Optional[float] = None
    per_gpu_vram_used_mb: Optional[List[float]] = None
    gpu_utilization_pct: Optional[List[float]] = None
    power_draw_w: Optional[List[float]] = None
    sm_version: Optional[str] = None
    interconnect: Optional[str] = None

    # -- Step timing (reported by callbacks; may be absent) ----------------
    step_times_ms: Optional[List[float]] = None
    step_time_p50_ms: Optional[float] = None
    step_time_p90_ms: Optional[float] = None
    throughput_samples_per_sec: Optional[float] = None
    dataloader_wait_pct: Optional[float] = None

    # -- Per-phase timing (callbacks using CUDA events) --------------------
    phase_forward_ms_p50: Optional[float] = None
    phase_forward_ms_p90: Optional[float] = None
    phase_backward_ms_p50: Optional[float] = None
    phase_backward_ms_p90: Optional[float] = None
    phase_optimizer_ms_p50: Optional[float] = None
    phase_optimizer_ms_p90: Optional[float] = None
    phase_dataloader_ms_p50: Optional[float] = None
    phase_dataloader_ms_p90: Optional[float] = None
    has_phase_timing: bool = False

    # -- Communication overhead (estimated from phase timing when the run
    #    is distributed) ----------------------------------------------------
    comm_overhead_pct: Optional[float] = None

    # -- Per-rank data (distributed callbacks) -----------------------------
    per_rank_peak_vram_mb: Optional[List[float]] = None
    per_rank_step_times_ms: Optional[List[List[float]]] = None
    straggler_ratio: Optional[float] = None

    # -- Architecture metadata (callback introspection) --------------------
    architecture_type: Optional[str] = None
    optimizer_type: Optional[str] = None
    fine_tuning_method: Optional[str] = None
    gradient_checkpointing: Optional[bool] = None
    attention_type: Optional[str] = None
    param_count: Optional[int] = None
    trainable_param_count: Optional[int] = None

    # -- Run metadata ------------------------------------------------------
    exit_code: Optional[int] = None
    duration_s: Optional[float] = None
    command: Optional[str] = None
    git_sha: Optional[str] = None
    is_oom: bool = False

    # -- Aggregates (computed from samples) --------------------------------
    avg_gpu_util: Optional[float] = None
    peak_vram_mb: Optional[float] = None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def load_artifact(path: str) -> Optional[ArtifactData]:
    """Load and parse an alloc artifact file.

    A path ending in ``.gz`` is read as gzip-compressed UTF-8 JSON; any other
    path is read as plain UTF-8 JSON.

    Returns:
        The parsed ``ArtifactData``, or ``None`` on any failure: the file
        cannot be read, it is not valid JSON, the top-level JSON value is not
        an object, or parsing the object raises.
    """
    try:
        opener = gzip.open if path.endswith(".gz") else open
        with opener(path, "rt", encoding="utf-8") as f:
            raw = json.load(f)
    except Exception:
        return None

    # Valid JSON is not necessarily an object; the parser needs a dict.
    if not isinstance(raw, dict):
        return None

    # Parsing used to run outside the try block, so malformed-but-valid JSON
    # could raise despite the "returns None on any failure" contract.
    try:
        return _parse_artifact(raw)
    except Exception:
        return None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def find_artifact(directory: str = ".") -> Optional[str]:
    """Find the most recently modified alloc artifact in ``directory``.

    Matches ``alloc_artifact*.json.gz`` and ``alloc_artifact*.json`` directly
    inside ``directory`` (no recursion).

    Returns:
        The path of the candidate with the newest modification time, or
        ``None`` when nothing matches or no candidate can be stat'ed.
    """
    # Escape the directory so glob metacharacters in its name ("[", "*", "?")
    # are matched literally; only the filename part is a pattern.
    base = glob.escape(directory)
    candidates = []  # type: List[str]
    for filename_pattern in ("alloc_artifact*.json.gz", "alloc_artifact*.json"):
        candidates.extend(glob.glob(os.path.join(base, filename_pattern)))

    newest = None  # type: Optional[str]
    newest_mtime = None  # type: Optional[float]
    for candidate in candidates:
        try:
            mtime = os.path.getmtime(candidate)
        except OSError:
            # The file vanished (or became unreadable) after the glob; skip it
            # instead of failing the whole lookup.
            continue
        # Strict ">" keeps the first candidate on ties, like max().
        if newest_mtime is None or mtime > newest_mtime:
            newest, newest_mtime = candidate, mtime
    return newest
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _parse_artifact(raw: dict) -> ArtifactData:
    """Parse raw artifact JSON into ArtifactData.

    Reads the ``probe``, ``hardware`` and ``context`` sections of ``raw``.
    A missing key leaves the matching field at its default.

    Malformed content is tolerated rather than raised on:
      * a section (or ``context["git"]``) that is not a JSON object is
        treated as empty;
      * a non-list ``samples`` value and non-dict sample entries are ignored;
      * when no sample yields a numeric value, the corresponding list and
        average stay ``None`` (previously this divided by zero);
      * a non-numeric or non-positive ``num_gpus_detected`` falls back to 1.
    """

    def section(container, key):
        # Only JSON objects are usable as sections; anything else is "absent".
        value = container.get(key)
        return value if isinstance(value, dict) else {}

    def floats(values):
        # Convert what converts, silently drop the rest.
        converted = (_float_or_none(v) for v in values)
        return [x for x in converted if x is not None]

    data = ArtifactData()
    probe = section(raw, "probe")
    hardware = section(raw, "hardware")
    context = section(raw, "context")

    # Hardware: prefer the hardware section, fall back to the probe.
    data.gpu_name = hardware.get("gpu_name") or probe.get("gpu_name")
    try:
        detected = int(hardware.get("num_gpus_detected"))
    except (TypeError, ValueError, OverflowError):
        detected = 0
    data.gpu_count = detected if detected > 0 else 1
    data.per_gpu_vram_total_mb = _float_or_none(
        hardware.get("gpu_total_vram_mb") or probe.get("gpu_total_vram_mb")
    )
    data.sm_version = hardware.get("sm_version")
    data.interconnect = probe.get("interconnect_type")

    # Peak VRAM as reported directly by the probe.
    peak = _float_or_none(probe.get("peak_vram_mb"))
    data.peak_vram_mb = peak

    # Per-GPU VRAM: per-rank peaks when present, else the single peak.
    per_rank = probe.get("per_rank_peak_vram_mb")
    if isinstance(per_rank, list) and per_rank:
        data.per_gpu_vram_used_mb = floats(per_rank)
    elif peak is not None:
        data.per_gpu_vram_used_mb = [peak]

    # Both fields intentionally refer to the same list.
    data.per_rank_peak_vram_mb = data.per_gpu_vram_used_mb

    # GPU utilization and power draw from the probe's samples.
    samples = probe.get("samples")
    if isinstance(samples, list):
        sample_dicts = [s for s in samples if isinstance(s, dict)]

        utils = floats(s.get("gpu_util_pct") for s in sample_dicts)
        if utils:
            data.gpu_utilization_pct = utils
            data.avg_gpu_util = sum(utils) / len(utils)

        powers = floats(s.get("power_w") for s in sample_dicts)
        if powers:
            data.power_draw_w = powers

    # No usable samples: fall back to the probe's own aggregate.
    if data.avg_gpu_util is None:
        data.avg_gpu_util = _float_or_none(probe.get("avg_gpu_util"))

    # Timing (callback sidecar data merged into the probe section).
    data.step_time_p50_ms = _float_or_none(probe.get("step_time_ms_p50"))
    data.step_time_p90_ms = _float_or_none(probe.get("step_time_ms_p90"))
    data.throughput_samples_per_sec = _float_or_none(probe.get("samples_per_sec"))
    data.dataloader_wait_pct = _float_or_none(probe.get("dataloader_wait_pct"))

    # Phase timing: the probe key and the attribute name are identical.
    for phase in ("forward", "backward", "optimizer", "dataloader"):
        for percentile in ("p50", "p90"):
            name = f"phase_{phase}_ms_{percentile}"
            setattr(data, name, _float_or_none(probe.get(name)))
    data.has_phase_timing = bool(probe.get("has_phase_timing"))
    data.comm_overhead_pct = _float_or_none(probe.get("comm_overhead_pct"))

    # Architecture metadata (from callback introspection).
    data.architecture_type = probe.get("architecture_type")
    data.optimizer_type = probe.get("optimizer_type")
    data.fine_tuning_method = probe.get("fine_tuning_method")
    gc = probe.get("gradient_checkpointing")
    # Keep "unknown" (None) distinct from an explicit false.
    data.gradient_checkpointing = bool(gc) if gc is not None else None
    data.attention_type = probe.get("attention_type")
    data.param_count = _int_or_none(probe.get("param_count"))
    data.trainable_param_count = _int_or_none(probe.get("trainable_param_count"))

    # Raw step times (used for per-rank merge / straggler detection).
    raw_times = probe.get("step_times_raw")
    if isinstance(raw_times, list) and raw_times:
        data.step_times_ms = floats(raw_times)

    # Run metadata.
    data.exit_code = probe.get("exit_code")
    data.duration_s = _float_or_none(probe.get("duration_seconds"))
    data.command = probe.get("command")

    # Git SHA from context.
    data.git_sha = section(context, "git").get("commit_sha")

    # OOM detection runs last: it reads exit_code and the VRAM fields above.
    data.is_oom = _detect_oom(data)

    return data
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _detect_oom(data: ArtifactData) -> bool:
    """Report whether the run looks like it died from GPU memory exhaustion.

    True only when the exit code is known and non-zero AND the highest
    per-GPU VRAM usage exceeds 95% of the per-GPU total. Missing exit code,
    missing usage data, or a missing / non-positive total all yield False.
    """
    code = data.exit_code
    if code is None or code == 0:
        return False

    used = data.per_gpu_vram_used_mb
    total = data.per_gpu_vram_total_mb
    if not used or not total:
        return False
    if not total > 0:
        return False

    return max(used) / total > 0.95
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _float_or_none(val) -> Optional[float]:
    """Convert a value to float, returning None on failure.

    ``None`` maps to ``None``. Anything ``float()`` rejects also maps to
    ``None``: unparsable strings, unsupported types, and integers too large
    to represent as a float (``OverflowError``; JSON integers are unbounded).
    """
    if val is None:
        return None
    try:
        return float(val)
    except (ValueError, TypeError, OverflowError):
        return None
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _int_or_none(val) -> Optional[int]:
    """Convert a value to int, returning None on failure.

    ``None`` maps to ``None``. Anything ``int()`` rejects also maps to
    ``None``: unparsable strings, unsupported types, NaN, and infinities
    (``OverflowError``; ``json.load`` accepts ``Infinity`` by default).
    Finite floats are truncated toward zero, as ``int()`` does.
    """
    if val is None:
        return None
    try:
        return int(val)
    except (ValueError, TypeError, OverflowError):
        return None
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def find_rank_artifacts(directory: str = ".") -> List[str]:
    """Find all ``alloc_artifact_rank*.json.gz`` files directly in ``directory``.

    Returns:
        Matching paths sorted lexicographically (so ``rank10`` sorts before
        ``rank2``); an empty list when nothing matches.
    """
    # Escape the directory so glob metacharacters in its name are literal;
    # only the filename part is meant to be a pattern.
    pattern = os.path.join(glob.escape(directory), "alloc_artifact_rank*.json.gz")
    return sorted(glob.glob(pattern))
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def merge_artifacts(paths: List[str]) -> Optional[ArtifactData]:
    """Load multiple rank artifacts and merge them into one ArtifactData.

    Paths that fail to load are skipped. The "representative" artifact is the
    first one that loaded; that is rank 0 only if ``paths`` starts with rank 0
    and that file is readable.

    Merge rules:
      * ``gpu_count`` is the number of artifacts that loaded.
      * Per-rank peak VRAM and raw step times are collected across artifacts,
        skipping artifacts that lack them (so list positions are not
        guaranteed to equal rank numbers).
      * ``straggler_ratio`` is max/min of the per-artifact step-time p50s,
        rounded to 3 places, when at least two are known and the minimum is
        positive.
      * ``avg_gpu_util`` is the mean of the per-artifact averages.
      * ``is_oom`` is true if any artifact flagged OOM.
      * Hardware info, aggregate and phase timing, architecture metadata and
        run metadata are copied from the representative artifact.

    Returns:
        The merged ``ArtifactData``, or ``None`` if no artifact loaded.
    """
    artifacts = [a for a in (load_artifact(p) for p in paths) if a is not None]
    if not artifacts:
        return None

    representative = artifacts[0]
    merged = ArtifactData()

    # Fields copied as-is from the representative artifact.
    # dataloader_wait_pct was previously dropped here even though the sibling
    # timing fields (step-time percentiles, throughput) were carried over.
    copied_fields = (
        # hardware
        "gpu_name",
        "per_gpu_vram_total_mb",
        "sm_version",
        "interconnect",
        # aggregate step timing
        "step_time_p50_ms",
        "step_time_p90_ms",
        "throughput_samples_per_sec",
        "dataloader_wait_pct",
        # phase timing
        "has_phase_timing",
        "phase_forward_ms_p50",
        "phase_forward_ms_p90",
        "phase_backward_ms_p50",
        "phase_backward_ms_p90",
        "phase_optimizer_ms_p50",
        "phase_optimizer_ms_p90",
        "phase_dataloader_ms_p50",
        "phase_dataloader_ms_p90",
        "comm_overhead_pct",
        # architecture metadata
        "architecture_type",
        "optimizer_type",
        "fine_tuning_method",
        "gradient_checkpointing",
        "attention_type",
        "param_count",
        "trainable_param_count",
        # run metadata
        "exit_code",
        "duration_s",
        "command",
        "git_sha",
    )
    for name in copied_fields:
        setattr(merged, name, getattr(representative, name))

    merged.gpu_count = len(artifacts)

    # Per-rank peak VRAM; the same list backs all three views, as before.
    peaks = [a.peak_vram_mb for a in artifacts if a.peak_vram_mb is not None]
    if peaks:
        merged.per_rank_peak_vram_mb = peaks
        merged.peak_vram_mb = max(peaks)
        merged.per_gpu_vram_used_mb = peaks

    # Per-rank raw step times (each artifact's own list, not copied).
    rank_step_times = [a.step_times_ms for a in artifacts if a.step_times_ms]
    if rank_step_times:
        merged.per_rank_step_times_ms = rank_step_times

    # Straggler detection: slowest vs fastest artifact by step-time p50.
    p50s = [a.step_time_p50_ms for a in artifacts if a.step_time_p50_ms is not None]
    if len(p50s) > 1 and min(p50s) > 0:
        merged.straggler_ratio = round(max(p50s) / min(p50s), 3)

    # Mean of the per-artifact GPU utilization averages.
    all_utils = [a.avg_gpu_util for a in artifacts if a.avg_gpu_util is not None]
    if all_utils:
        merged.avg_gpu_util = sum(all_utils) / len(all_utils)

    merged.is_oom = any(a.is_oom for a in artifacts)

    return merged
|
|
@@ -44,6 +44,10 @@ def write_report(
|
|
|
44
44
|
"context": context if context else None,
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
+
if os.path.exists(resolved_path):
|
|
48
|
+
import sys
|
|
49
|
+
print(f"alloc: warning: overwriting existing artifact at {resolved_path}", file=sys.stderr)
|
|
50
|
+
|
|
47
51
|
with gzip.open(resolved_path, "wt", encoding="utf-8") as f:
|
|
48
52
|
json.dump(report, f, indent=2)
|
|
49
53
|
|
|
@@ -11,6 +11,7 @@ from __future__ import annotations
|
|
|
11
11
|
|
|
12
12
|
import base64
|
|
13
13
|
import hashlib
|
|
14
|
+
import html
|
|
14
15
|
import secrets
|
|
15
16
|
import socket
|
|
16
17
|
import threading
|
|
@@ -40,7 +41,8 @@ def _find_open_port(start=17256, attempts=20):
|
|
|
40
41
|
for port in range(start, start + attempts):
|
|
41
42
|
try:
|
|
42
43
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
43
|
-
|
|
44
|
+
# Bind to all interfaces so both localhost and 127.0.0.1 work
|
|
45
|
+
s.bind(("0.0.0.0", port))
|
|
44
46
|
return port
|
|
45
47
|
except OSError:
|
|
46
48
|
continue
|
|
@@ -69,7 +71,7 @@ class _CallbackHandler(BaseHTTPRequestHandler):
|
|
|
69
71
|
self._respond(
|
|
70
72
|
400,
|
|
71
73
|
"<html><body style='font-family:system-ui;text-align:center;padding:60px'>"
|
|
72
|
-
f"<h2>Login failed</h2><p>{error_desc}</p>"
|
|
74
|
+
f"<h2>Login failed</h2><p>{html.escape(error_desc)}</p>"
|
|
73
75
|
"</body></html>",
|
|
74
76
|
)
|
|
75
77
|
else:
|
|
@@ -108,7 +110,8 @@ def browser_login(
|
|
|
108
110
|
verifier, challenge = _generate_pkce_pair()
|
|
109
111
|
port = _find_open_port()
|
|
110
112
|
|
|
111
|
-
|
|
113
|
+
# Use 127.0.0.1 (not localhost) — more reliable, avoids IPv6 resolution issues.
|
|
114
|
+
redirect_uri = f"http://127.0.0.1:{port}/callback"
|
|
112
115
|
|
|
113
116
|
authorize_params = urlencode({
|
|
114
117
|
"provider": provider,
|
|
@@ -118,7 +121,8 @@ def browser_login(
|
|
|
118
121
|
})
|
|
119
122
|
authorize_url = f"{supabase_url}/auth/v1/authorize?{authorize_params}"
|
|
120
123
|
|
|
121
|
-
|
|
124
|
+
# Bind to 0.0.0.0 so both localhost and 127.0.0.1 reach the server.
|
|
125
|
+
server = HTTPServer(("0.0.0.0", port), _CallbackHandler)
|
|
122
126
|
server.auth_code = None # type: ignore[attr-defined]
|
|
123
127
|
server.auth_error = None # type: ignore[attr-defined]
|
|
124
128
|
server.timeout = 1 # poll interval for handle_request()
|
|
@@ -128,16 +132,21 @@ def browser_login(
|
|
|
128
132
|
server_thread.daemon = True
|
|
129
133
|
server_thread.start()
|
|
130
134
|
|
|
135
|
+
import sys
|
|
136
|
+
|
|
131
137
|
# Open the browser (or print URL as fallback).
|
|
132
138
|
try:
|
|
133
139
|
opened = webbrowser.open(authorize_url)
|
|
134
140
|
except Exception:
|
|
135
141
|
opened = False
|
|
136
142
|
|
|
143
|
+
print(f"\nCallback server listening on http://127.0.0.1:{port}/callback", file=sys.stderr)
|
|
137
144
|
if not opened:
|
|
138
145
|
print(f"\nOpen this URL in your browser to log in:\n\n {authorize_url}\n")
|
|
139
146
|
else:
|
|
140
|
-
print("Opened browser for login. Waiting for callback...")
|
|
147
|
+
print("Opened browser for login. Waiting for callback...", file=sys.stderr)
|
|
148
|
+
print(f"If login completes but the terminal stays stuck, your Supabase", file=sys.stderr)
|
|
149
|
+
print(f"redirect allowlist may not include http://127.0.0.1:{port}/callback", file=sys.stderr)
|
|
141
150
|
|
|
142
151
|
server_thread.join(timeout=timeout_seconds + 5)
|
|
143
152
|
server.server_close()
|
|
@@ -146,7 +155,20 @@ def browser_login(
|
|
|
146
155
|
raise RuntimeError(f"OAuth error: {server.auth_error}")
|
|
147
156
|
|
|
148
157
|
if not server.auth_code:
|
|
149
|
-
raise RuntimeError(
|
|
158
|
+
raise RuntimeError(
|
|
159
|
+
f"Login timed out — no callback received within {timeout_seconds} seconds.\n"
|
|
160
|
+
f"\n"
|
|
161
|
+
f" The browser never reached http://127.0.0.1:{port}/callback.\n"
|
|
162
|
+
f"\n"
|
|
163
|
+
f" Common causes:\n"
|
|
164
|
+
f" 1. Supabase redirect allowlist does not include http://127.0.0.1:{port}/**\n"
|
|
165
|
+
f" (Check: Supabase Dashboard → Authentication → URL Configuration → Redirect URLs)\n"
|
|
166
|
+
f" 2. Browser redirected to your site URL instead of localhost\n"
|
|
167
|
+
f" 3. Firewall or antivirus blocked the local callback server\n"
|
|
168
|
+
f"\n"
|
|
169
|
+
f" Workaround: alloc login --method token --token <paste-access-token>\n"
|
|
170
|
+
f" (Copy token from browser DevTools → Application → Local Storage → sb-*-auth-token)"
|
|
171
|
+
)
|
|
150
172
|
|
|
151
173
|
# Exchange auth code + verifier for tokens.
|
|
152
174
|
with httpx.Client(timeout=15) as client:
|