alloc 0.0.3__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. {alloc-0.0.3 → alloc-0.0.5}/PKG-INFO +1 -1
  2. {alloc-0.0.3 → alloc-0.0.5}/pyproject.toml +1 -1
  3. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/__init__.py +1 -1
  4. alloc-0.0.5/src/alloc/artifact_loader.py +330 -0
  5. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/artifact_writer.py +4 -0
  6. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/browser_auth.py +28 -6
  7. alloc-0.0.5/src/alloc/callbacks.py +1204 -0
  8. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/catalog/gpus.v1.json +44 -31
  9. alloc-0.0.5/src/alloc/cli.py +3576 -0
  10. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/code_analyzer.py +161 -0
  11. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/config.py +3 -2
  12. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/diagnosis_display.py +102 -30
  13. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/diagnosis_engine.py +83 -7
  14. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/diagnosis_rules.py +364 -31
  15. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/display.py +18 -1
  16. alloc-0.0.5/src/alloc/extractor_runner.py +285 -0
  17. alloc-0.0.5/src/alloc/ghost.py +304 -0
  18. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/model_extractor.py +10 -0
  19. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/model_registry.py +26 -2
  20. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/probe.py +85 -11
  21. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/upload.py +39 -4
  22. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/yaml_config.py +2 -0
  23. {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/PKG-INFO +1 -1
  24. {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/SOURCES.txt +2 -0
  25. alloc-0.0.5/tests/test_artifact_loader.py +568 -0
  26. {alloc-0.0.3 → alloc-0.0.5}/tests/test_auth.py +14 -7
  27. alloc-0.0.5/tests/test_callbacks.py +1191 -0
  28. {alloc-0.0.3 → alloc-0.0.5}/tests/test_cli.py +128 -0
  29. alloc-0.0.5/tests/test_diagnose_cli.py +1313 -0
  30. {alloc-0.0.3 → alloc-0.0.5}/tests/test_diagnosis_engine.py +82 -1
  31. {alloc-0.0.3 → alloc-0.0.5}/tests/test_diagnosis_rules.py +346 -3
  32. alloc-0.0.5/tests/test_extractor_activation.py +178 -0
  33. alloc-0.0.5/tests/test_ghost.py +304 -0
  34. alloc-0.0.5/tests/test_interconnect.py +182 -0
  35. alloc-0.0.3/src/alloc/artifact_loader.py +0 -179
  36. alloc-0.0.3/src/alloc/callbacks.py +0 -617
  37. alloc-0.0.3/src/alloc/cli.py +0 -1621
  38. alloc-0.0.3/src/alloc/extractor_runner.py +0 -141
  39. alloc-0.0.3/src/alloc/ghost.py +0 -167
  40. alloc-0.0.3/tests/test_artifact_loader.py +0 -251
  41. alloc-0.0.3/tests/test_callbacks.py +0 -583
  42. alloc-0.0.3/tests/test_diagnose_cli.py +0 -464
  43. alloc-0.0.3/tests/test_ghost.py +0 -82
  44. {alloc-0.0.3 → alloc-0.0.5}/README.md +0 -0
  45. {alloc-0.0.3 → alloc-0.0.5}/setup.cfg +0 -0
  46. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/catalog/__init__.py +0 -0
  47. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/catalog/default_rate_card.json +0 -0
  48. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/context.py +0 -0
  49. {alloc-0.0.3 → alloc-0.0.5}/src/alloc/stability.py +0 -0
  50. {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/dependency_links.txt +0 -0
  51. {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/entry_points.txt +0 -0
  52. {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/requires.txt +0 -0
  53. {alloc-0.0.3 → alloc-0.0.5}/src/alloc.egg-info/top_level.txt +0 -0
  54. {alloc-0.0.3 → alloc-0.0.5}/tests/test_artifact.py +0 -0
  55. {alloc-0.0.3 → alloc-0.0.5}/tests/test_catalog.py +0 -0
  56. {alloc-0.0.3 → alloc-0.0.5}/tests/test_code_analyzer.py +0 -0
  57. {alloc-0.0.3 → alloc-0.0.5}/tests/test_context.py +0 -0
  58. {alloc-0.0.3 → alloc-0.0.5}/tests/test_init_from_org.py +0 -0
  59. {alloc-0.0.3 → alloc-0.0.5}/tests/test_model_extractor.py +0 -0
  60. {alloc-0.0.3 → alloc-0.0.5}/tests/test_probe_hw.py +0 -0
  61. {alloc-0.0.3 → alloc-0.0.5}/tests/test_probe_multi.py +0 -0
  62. {alloc-0.0.3 → alloc-0.0.5}/tests/test_stability.py +0 -0
  63. {alloc-0.0.3 → alloc-0.0.5}/tests/test_upload.py +0 -0
  64. {alloc-0.0.3 → alloc-0.0.5}/tests/test_verdict.py +0 -0
  65. {alloc-0.0.3 → alloc-0.0.5}/tests/test_yaml_config.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alloc
3
- Version: 0.0.3
3
+ Version: 0.0.5
4
4
  Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
5
5
  Author-email: Alloc Labs <hello@alloclabs.com>
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alloc"
7
- version = "0.0.3"
7
+ version = "0.0.5"
8
8
  description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "0.0.3"
5
+ __version__ = "0.0.5"
6
6
 
7
7
  from alloc.ghost import ghost, GhostReport
8
8
  from alloc.callbacks import AllocCallback as HuggingFaceCallback
@@ -0,0 +1,330 @@
1
+ """Artifact loader — parse alloc_artifact.json.gz for runtime-enhanced diagnosis.
2
+
3
+ Loads the artifact created by `alloc run`, extracting GPU metrics, timing data,
4
+ and per-rank distributed information for use by Phase 2 diagnosis rules.
5
+
6
+ Never crashes. Returns None on any failure.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import glob
12
+ import gzip
13
+ import json
14
+ import os
15
+ from dataclasses import dataclass, field
16
+ from typing import Dict, List, Optional
17
+
18
+
19
@dataclass
class ArtifactData:
    """Structured view of one runtime artifact, consumed by diagnosis rules.

    Every field has a default, so ``ArtifactData()`` is a valid "nothing
    known" value. Optional fields stay ``None`` when the artifact did not
    carry the corresponding data.
    """

    # --- Hardware -----------------------------------------------------------
    gpu_name: Optional[str] = None
    gpu_count: int = 1
    per_gpu_vram_total_mb: Optional[float] = None
    per_gpu_vram_used_mb: Optional[List[float]] = None
    gpu_utilization_pct: Optional[List[float]] = None
    power_draw_w: Optional[List[float]] = None
    sm_version: Optional[str] = None
    interconnect: Optional[str] = None

    # --- Step timing (absent when no callback data was recorded) -------------
    step_times_ms: Optional[List[float]] = None
    step_time_p50_ms: Optional[float] = None
    step_time_p90_ms: Optional[float] = None
    throughput_samples_per_sec: Optional[float] = None
    dataloader_wait_pct: Optional[float] = None

    # --- Per-phase timing percentiles ----------------------------------------
    phase_forward_ms_p50: Optional[float] = None
    phase_forward_ms_p90: Optional[float] = None
    phase_backward_ms_p50: Optional[float] = None
    phase_backward_ms_p90: Optional[float] = None
    phase_optimizer_ms_p50: Optional[float] = None
    phase_optimizer_ms_p90: Optional[float] = None
    phase_dataloader_ms_p50: Optional[float] = None
    phase_dataloader_ms_p90: Optional[float] = None
    has_phase_timing: bool = False

    # --- Communication overhead (distributed training) -----------------------
    comm_overhead_pct: Optional[float] = None

    # --- Per-rank data --------------------------------------------------------
    per_rank_peak_vram_mb: Optional[List[float]] = None
    per_rank_step_times_ms: Optional[List[List[float]]] = None
    straggler_ratio: Optional[float] = None

    # --- Architecture metadata ------------------------------------------------
    architecture_type: Optional[str] = None
    optimizer_type: Optional[str] = None
    fine_tuning_method: Optional[str] = None
    gradient_checkpointing: Optional[bool] = None
    attention_type: Optional[str] = None
    param_count: Optional[int] = None
    trainable_param_count: Optional[int] = None

    # --- Run metadata ---------------------------------------------------------
    exit_code: Optional[int] = None
    duration_s: Optional[float] = None
    command: Optional[str] = None
    git_sha: Optional[str] = None
    is_oom: bool = False

    # --- Aggregates -----------------------------------------------------------
    avg_gpu_util: Optional[float] = None
    peak_vram_mb: Optional[float] = None
78
+
79
+
80
def load_artifact(path: str) -> Optional[ArtifactData]:
    """Load and parse an artifact file, returning None on any failure.

    Paths ending in ``.gz`` are read through ``gzip``; anything else is read
    as plain-text JSON. The top-level JSON value must be an object.

    Args:
        path: Location of the artifact file.

    Returns:
        The parsed ``ArtifactData``, or ``None`` if the file cannot be read,
        is not valid JSON, is not a JSON object, or cannot be parsed.
    """
    opener = gzip.open if path.endswith(".gz") else open
    try:
        with opener(path, "rt", encoding="utf-8") as f:
            raw = json.load(f)
        # Valid JSON is not necessarily an object; a list or scalar carries
        # no artifact sections and must not reach the parser.
        if not isinstance(raw, dict):
            return None
        # Parsing stays inside the guard so that malformed sections cannot
        # break the "never crashes" contract of this loader.
        return _parse_artifact(raw)
    except Exception:
        # Deliberate best-effort boundary: callers treat None as "no artifact".
        return None
93
+
94
+
95
def find_artifact(directory: str = ".") -> Optional[str]:
    """Find the most recently modified artifact file in *directory*.

    Matches both ``alloc_artifact*.json.gz`` and ``alloc_artifact*.json``.

    Args:
        directory: Directory to search (not recursive).

    Returns:
        Path of the candidate with the newest modification time, or ``None``
        when nothing matches or no candidate can be stat'ed.
    """
    # Escape the directory so characters such as "[" in its name are matched
    # literally instead of being interpreted as glob syntax.
    base = glob.escape(directory)

    newest_path = None  # type: Optional[str]
    newest_mtime = None  # type: Optional[float]
    for pattern in ("alloc_artifact*.json.gz", "alloc_artifact*.json"):
        for candidate in glob.glob(os.path.join(base, pattern)):
            try:
                mtime = os.path.getmtime(candidate)
            except OSError:
                # Dangling symlink, or the file vanished after globbing.
                continue
            # Strict ">" keeps the first candidate on ties, like max().
            if newest_mtime is None or mtime > newest_mtime:
                newest_path, newest_mtime = candidate, mtime

    return newest_path
110
+
111
+
112
def _as_dict(val) -> dict:
    """Return *val* if it is a dict, otherwise an empty dict."""
    return val if isinstance(val, dict) else {}


def _float_list(values) -> List[float]:
    """Convert each item with ``_float_or_none`` and drop the failures."""
    converted = (_float_or_none(v) for v in values)
    return [x for x in converted if x is not None]


def _parse_artifact(raw: dict) -> ArtifactData:
    """Parse raw artifact JSON into ArtifactData.

    Reads the ``probe``, ``hardware`` and ``context`` sections of *raw*.
    Sections or values of an unexpected type are treated as missing, so
    the corresponding fields keep their defaults.

    Args:
        raw: Decoded artifact JSON (expected to be a dict).

    Returns:
        A populated ``ArtifactData``; fields without usable data keep their
        defaults.
    """
    data = ArtifactData()

    raw = _as_dict(raw)
    probe = _as_dict(raw.get("probe"))
    hardware = _as_dict(raw.get("hardware"))
    context = _as_dict(raw.get("context"))

    # Hardware: prefer the "hardware" section, fall back to "probe".
    data.gpu_name = hardware.get("gpu_name") or probe.get("gpu_name")
    # Coerce before comparing: a string or float count must not raise.
    gpu_count = _int_or_none(hardware.get("num_gpus_detected"))
    data.gpu_count = gpu_count if gpu_count is not None and gpu_count > 0 else 1
    data.per_gpu_vram_total_mb = _float_or_none(
        hardware.get("gpu_total_vram_mb") or probe.get("gpu_total_vram_mb")
    )
    data.sm_version = hardware.get("sm_version")
    data.interconnect = probe.get("interconnect_type")

    # Peak VRAM as reported by the probe.
    peak = _float_or_none(probe.get("peak_vram_mb"))
    data.peak_vram_mb = peak

    # Per-GPU VRAM: use the per-rank list when it holds usable numbers,
    # otherwise fall back to the single peak value.
    per_rank = probe.get("per_rank_peak_vram_mb")
    per_rank_peaks = _float_list(per_rank) if isinstance(per_rank, list) else []
    if per_rank_peaks:
        data.per_gpu_vram_used_mb = per_rank_peaks
    elif peak is not None:
        data.per_gpu_vram_used_mb = [peak]

    data.per_rank_peak_vram_mb = data.per_gpu_vram_used_mb

    # GPU utilization and power from samples; non-dict samples are skipped.
    samples = probe.get("samples")
    if isinstance(samples, list):
        rows = [s for s in samples if isinstance(s, dict)]

        utils = _float_list(s.get("gpu_util_pct") for s in rows)
        # Only average when at least one sample converted, so an all-invalid
        # sample set cannot divide by zero.
        if utils:
            data.gpu_utilization_pct = utils
            data.avg_gpu_util = sum(utils) / len(utils)

        powers = _float_list(s.get("power_w") for s in rows)
        if powers:
            data.power_draw_w = powers

    # Fall back to the probe's own aggregate when samples gave no average.
    if data.avg_gpu_util is None:
        data.avg_gpu_util = _float_or_none(probe.get("avg_gpu_util"))

    # Step timing
    data.step_time_p50_ms = _float_or_none(probe.get("step_time_ms_p50"))
    data.step_time_p90_ms = _float_or_none(probe.get("step_time_ms_p90"))
    data.throughput_samples_per_sec = _float_or_none(probe.get("samples_per_sec"))
    data.dataloader_wait_pct = _float_or_none(probe.get("dataloader_wait_pct"))

    # Phase timing: probe keys and attribute names are identical.
    phase_fields = (
        "phase_forward_ms_p50",
        "phase_forward_ms_p90",
        "phase_backward_ms_p50",
        "phase_backward_ms_p90",
        "phase_optimizer_ms_p50",
        "phase_optimizer_ms_p90",
        "phase_dataloader_ms_p50",
        "phase_dataloader_ms_p90",
    )
    for name in phase_fields:
        setattr(data, name, _float_or_none(probe.get(name)))
    data.has_phase_timing = bool(probe.get("has_phase_timing"))
    data.comm_overhead_pct = _float_or_none(probe.get("comm_overhead_pct"))

    # Architecture metadata
    data.architecture_type = probe.get("architecture_type")
    data.optimizer_type = probe.get("optimizer_type")
    data.fine_tuning_method = probe.get("fine_tuning_method")
    gc = probe.get("gradient_checkpointing")
    # Keep None distinct from False: None means "not reported".
    data.gradient_checkpointing = bool(gc) if gc is not None else None
    data.attention_type = probe.get("attention_type")
    data.param_count = _int_or_none(probe.get("param_count"))
    data.trainable_param_count = _int_or_none(probe.get("trainable_param_count"))

    # Raw step times (used for per-rank merge / straggler detection)
    raw_times = probe.get("step_times_raw")
    if isinstance(raw_times, list) and raw_times:
        data.step_times_ms = _float_list(raw_times)

    # Run metadata; exit_code is coerced so the OOM check compares integers.
    data.exit_code = _int_or_none(probe.get("exit_code"))
    data.duration_s = _float_or_none(probe.get("duration_seconds"))
    data.command = probe.get("command")

    # Git SHA from context
    git_ctx = _as_dict(context.get("git"))
    data.git_sha = git_ctx.get("commit_sha")

    # OOM detection runs last because it reads the fields populated above.
    data.is_oom = _detect_oom(data)

    return data
205
+
206
+
207
+ def _detect_oom(data: ArtifactData) -> bool:
208
+ """Detect probable OOM from exit code and VRAM utilization."""
209
+ if data.exit_code is None or data.exit_code == 0:
210
+ return False
211
+
212
+ if data.per_gpu_vram_used_mb and data.per_gpu_vram_total_mb:
213
+ total = data.per_gpu_vram_total_mb
214
+ if total > 0:
215
+ max_used = max(data.per_gpu_vram_used_mb)
216
+ utilization = max_used / total
217
+ if utilization > 0.95:
218
+ return True
219
+
220
+ return False
221
+
222
+
223
+ def _float_or_none(val) -> Optional[float]:
224
+ """Convert a value to float, returning None on failure."""
225
+ if val is None:
226
+ return None
227
+ try:
228
+ return float(val)
229
+ except (ValueError, TypeError):
230
+ return None
231
+
232
+
233
+ def _int_or_none(val) -> Optional[int]:
234
+ """Convert a value to int, returning None on failure."""
235
+ if val is None:
236
+ return None
237
+ try:
238
+ return int(val)
239
+ except (ValueError, TypeError):
240
+ return None
241
+
242
+
243
def find_rank_artifacts(directory: str = ".") -> List[str]:
    """Find all alloc_artifact_rank*.json.gz files in directory.

    Args:
        directory: Directory to search (not recursive).

    Returns:
        Matching paths in lexicographic order; an empty list when none match.
        Lexicographic order puts ``rank10`` before ``rank2``.
    """
    # Escape the directory so characters such as "[" in its name are matched
    # literally instead of being interpreted as glob syntax.
    pattern = os.path.join(glob.escape(directory), "alloc_artifact_rank*.json.gz")
    return sorted(glob.glob(pattern))
247
+
248
+
249
def merge_artifacts(paths: List[str]) -> Optional[ArtifactData]:
    """Load multiple rank artifacts and merge them into one ArtifactData.

    Per-rank peak VRAM and raw step times are collected across all loadable
    artifacts. Average GPU utilization is the mean of the per-artifact
    averages. All other fields are copied from the first artifact that
    loads; callers should pass rank 0 first if it is meant to be the
    representative.

    Args:
        paths: Artifact file paths, one per rank.

    Returns:
        The merged ``ArtifactData``, or ``None`` if no artifact could be
        loaded.
    """
    artifacts = [a for a in (load_artifact(p) for p in paths) if a is not None]
    if not artifacts:
        return None

    first = artifacts[0]
    merged = ArtifactData()

    # Fields taken verbatim from the first loadable artifact.
    representative_fields = (
        # GPU info
        "gpu_name",
        "per_gpu_vram_total_mb",
        "sm_version",
        "interconnect",
        # Aggregate step timing. dataloader_wait_pct is carried over with
        # the other timing aggregates so that it is not lost on merge.
        "step_time_p50_ms",
        "step_time_p90_ms",
        "throughput_samples_per_sec",
        "dataloader_wait_pct",
        # Phase timing
        "has_phase_timing",
        "phase_forward_ms_p50",
        "phase_forward_ms_p90",
        "phase_backward_ms_p50",
        "phase_backward_ms_p90",
        "phase_optimizer_ms_p50",
        "phase_optimizer_ms_p90",
        "phase_dataloader_ms_p50",
        "phase_dataloader_ms_p90",
        "comm_overhead_pct",
        # Architecture metadata
        "architecture_type",
        "optimizer_type",
        "fine_tuning_method",
        "gradient_checkpointing",
        "attention_type",
        "param_count",
        "trainable_param_count",
        # Run metadata
        "exit_code",
        "duration_s",
        "command",
        "git_sha",
    )
    for name in representative_fields:
        setattr(merged, name, getattr(first, name))

    # One loadable artifact is counted as one GPU.
    merged.gpu_count = len(artifacts)

    # Per-rank peak VRAM; artifacts without a peak are left out.
    peaks = [a.peak_vram_mb for a in artifacts if a.peak_vram_mb is not None]
    if peaks:
        merged.per_rank_peak_vram_mb = peaks
        merged.peak_vram_mb = max(peaks)
        # Separate copy so mutating one field does not change the other.
        merged.per_gpu_vram_used_mb = list(peaks)

    # Per-rank raw step times; artifacts without step times are left out.
    rank_step_times = [a.step_times_ms for a in artifacts if a.step_times_ms]
    if rank_step_times:
        merged.per_rank_step_times_ms = rank_step_times

    # Straggler detection: slowest / fastest p50 step time. Needs at least
    # two values and a positive minimum to be meaningful.
    p50s = [a.step_time_p50_ms for a in artifacts if a.step_time_p50_ms is not None]
    if len(p50s) > 1 and min(p50s) > 0:
        merged.straggler_ratio = round(max(p50s) / min(p50s), 3)

    # Mean of the per-artifact average GPU utilization.
    all_utils = [a.avg_gpu_util for a in artifacts if a.avg_gpu_util is not None]
    if all_utils:
        merged.avg_gpu_util = sum(all_utils) / len(all_utils)

    # Any artifact flagged as OOM marks the merged run as OOM.
    merged.is_oom = any(a.is_oom for a in artifacts)

    return merged
@@ -44,6 +44,10 @@ def write_report(
44
44
  "context": context if context else None,
45
45
  }
46
46
 
47
+ if os.path.exists(resolved_path):
48
+ import sys
49
+ print(f"alloc: warning: overwriting existing artifact at {resolved_path}", file=sys.stderr)
50
+
47
51
  with gzip.open(resolved_path, "wt", encoding="utf-8") as f:
48
52
  json.dump(report, f, indent=2)
49
53
 
@@ -11,6 +11,7 @@ from __future__ import annotations
11
11
 
12
12
  import base64
13
13
  import hashlib
14
+ import html
14
15
  import secrets
15
16
  import socket
16
17
  import threading
@@ -40,7 +41,8 @@ def _find_open_port(start=17256, attempts=20):
40
41
  for port in range(start, start + attempts):
41
42
  try:
42
43
  with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
43
- s.bind(("127.0.0.1", port))
44
+ # Bind to all interfaces so both localhost and 127.0.0.1 work
45
+ s.bind(("0.0.0.0", port))
44
46
  return port
45
47
  except OSError:
46
48
  continue
@@ -69,7 +71,7 @@ class _CallbackHandler(BaseHTTPRequestHandler):
69
71
  self._respond(
70
72
  400,
71
73
  "<html><body style='font-family:system-ui;text-align:center;padding:60px'>"
72
- f"<h2>Login failed</h2><p>{error_desc}</p>"
74
+ f"<h2>Login failed</h2><p>{html.escape(error_desc)}</p>"
73
75
  "</body></html>",
74
76
  )
75
77
  else:
@@ -108,7 +110,8 @@ def browser_login(
108
110
  verifier, challenge = _generate_pkce_pair()
109
111
  port = _find_open_port()
110
112
 
111
- redirect_uri = f"http://localhost:{port}/callback"
113
+ # Use 127.0.0.1 (not localhost) — more reliable, avoids IPv6 resolution issues.
114
+ redirect_uri = f"http://127.0.0.1:{port}/callback"
112
115
 
113
116
  authorize_params = urlencode({
114
117
  "provider": provider,
@@ -118,7 +121,8 @@ def browser_login(
118
121
  })
119
122
  authorize_url = f"{supabase_url}/auth/v1/authorize?{authorize_params}"
120
123
 
121
- server = HTTPServer(("127.0.0.1", port), _CallbackHandler)
124
+ # Bind to 0.0.0.0 so both localhost and 127.0.0.1 reach the server.
125
+ server = HTTPServer(("0.0.0.0", port), _CallbackHandler)
122
126
  server.auth_code = None # type: ignore[attr-defined]
123
127
  server.auth_error = None # type: ignore[attr-defined]
124
128
  server.timeout = 1 # poll interval for handle_request()
@@ -128,16 +132,21 @@ def browser_login(
128
132
  server_thread.daemon = True
129
133
  server_thread.start()
130
134
 
135
+ import sys
136
+
131
137
  # Open the browser (or print URL as fallback).
132
138
  try:
133
139
  opened = webbrowser.open(authorize_url)
134
140
  except Exception:
135
141
  opened = False
136
142
 
143
+ print(f"\nCallback server listening on http://127.0.0.1:{port}/callback", file=sys.stderr)
137
144
  if not opened:
138
145
  print(f"\nOpen this URL in your browser to log in:\n\n {authorize_url}\n")
139
146
  else:
140
- print("Opened browser for login. Waiting for callback...")
147
+ print("Opened browser for login. Waiting for callback...", file=sys.stderr)
148
+ print(f"If login completes but the terminal stays stuck, your Supabase", file=sys.stderr)
149
+ print(f"redirect allowlist may not include http://127.0.0.1:{port}/callback", file=sys.stderr)
141
150
 
142
151
  server_thread.join(timeout=timeout_seconds + 5)
143
152
  server.server_close()
@@ -146,7 +155,20 @@ def browser_login(
146
155
  raise RuntimeError(f"OAuth error: {server.auth_error}")
147
156
 
148
157
  if not server.auth_code:
149
- raise RuntimeError("Login timed out — no callback received within 120 seconds.")
158
+ raise RuntimeError(
159
+ f"Login timed out — no callback received within {timeout_seconds} seconds.\n"
160
+ f"\n"
161
+ f" The browser never reached http://127.0.0.1:{port}/callback.\n"
162
+ f"\n"
163
+ f" Common causes:\n"
164
+ f" 1. Supabase redirect allowlist does not include http://127.0.0.1:{port}/**\n"
165
+ f" (Check: Supabase Dashboard → Authentication → URL Configuration → Redirect URLs)\n"
166
+ f" 2. Browser redirected to your site URL instead of localhost\n"
167
+ f" 3. Firewall or antivirus blocked the local callback server\n"
168
+ f"\n"
169
+ f" Workaround: alloc login --method token --token <paste-access-token>\n"
170
+ f" (Copy token from browser DevTools → Application → Local Storage → sb-*-auth-token)"
171
+ )
150
172
 
151
173
  # Exchange auth code + verifier for tokens.
152
174
  with httpx.Client(timeout=15) as client: