alloc 0.0.4.tar.gz → 0.0.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {alloc-0.0.4 → alloc-0.0.5}/PKG-INFO +1 -1
  2. {alloc-0.0.4 → alloc-0.0.5}/pyproject.toml +1 -1
  3. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/__init__.py +1 -1
  4. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/browser_auth.py +28 -6
  5. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/callbacks.py +22 -5
  6. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/cli.py +37 -0
  7. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/code_analyzer.py +54 -0
  8. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/config.py +3 -2
  9. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/diagnosis_display.py +75 -28
  10. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/diagnosis_engine.py +2 -2
  11. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/diagnosis_rules.py +5 -3
  12. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/probe.py +60 -6
  13. {alloc-0.0.4 → alloc-0.0.5}/src/alloc.egg-info/PKG-INFO +1 -1
  14. {alloc-0.0.4 → alloc-0.0.5}/tests/test_cli.py +2 -0
  15. {alloc-0.0.4 → alloc-0.0.5}/README.md +0 -0
  16. {alloc-0.0.4 → alloc-0.0.5}/setup.cfg +0 -0
  17. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/artifact_loader.py +0 -0
  18. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/artifact_writer.py +0 -0
  19. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/catalog/__init__.py +0 -0
  20. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/catalog/default_rate_card.json +0 -0
  21. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/catalog/gpus.v1.json +0 -0
  22. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/context.py +0 -0
  23. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/display.py +0 -0
  24. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/extractor_runner.py +0 -0
  25. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/ghost.py +0 -0
  26. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/model_extractor.py +0 -0
  27. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/model_registry.py +0 -0
  28. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/stability.py +0 -0
  29. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/upload.py +0 -0
  30. {alloc-0.0.4 → alloc-0.0.5}/src/alloc/yaml_config.py +0 -0
  31. {alloc-0.0.4 → alloc-0.0.5}/src/alloc.egg-info/SOURCES.txt +0 -0
  32. {alloc-0.0.4 → alloc-0.0.5}/src/alloc.egg-info/dependency_links.txt +0 -0
  33. {alloc-0.0.4 → alloc-0.0.5}/src/alloc.egg-info/entry_points.txt +0 -0
  34. {alloc-0.0.4 → alloc-0.0.5}/src/alloc.egg-info/requires.txt +0 -0
  35. {alloc-0.0.4 → alloc-0.0.5}/src/alloc.egg-info/top_level.txt +0 -0
  36. {alloc-0.0.4 → alloc-0.0.5}/tests/test_artifact.py +0 -0
  37. {alloc-0.0.4 → alloc-0.0.5}/tests/test_artifact_loader.py +0 -0
  38. {alloc-0.0.4 → alloc-0.0.5}/tests/test_auth.py +0 -0
  39. {alloc-0.0.4 → alloc-0.0.5}/tests/test_callbacks.py +0 -0
  40. {alloc-0.0.4 → alloc-0.0.5}/tests/test_catalog.py +0 -0
  41. {alloc-0.0.4 → alloc-0.0.5}/tests/test_code_analyzer.py +0 -0
  42. {alloc-0.0.4 → alloc-0.0.5}/tests/test_context.py +0 -0
  43. {alloc-0.0.4 → alloc-0.0.5}/tests/test_diagnose_cli.py +0 -0
  44. {alloc-0.0.4 → alloc-0.0.5}/tests/test_diagnosis_engine.py +0 -0
  45. {alloc-0.0.4 → alloc-0.0.5}/tests/test_diagnosis_rules.py +0 -0
  46. {alloc-0.0.4 → alloc-0.0.5}/tests/test_extractor_activation.py +0 -0
  47. {alloc-0.0.4 → alloc-0.0.5}/tests/test_ghost.py +0 -0
  48. {alloc-0.0.4 → alloc-0.0.5}/tests/test_init_from_org.py +0 -0
  49. {alloc-0.0.4 → alloc-0.0.5}/tests/test_interconnect.py +0 -0
  50. {alloc-0.0.4 → alloc-0.0.5}/tests/test_model_extractor.py +0 -0
  51. {alloc-0.0.4 → alloc-0.0.5}/tests/test_probe_hw.py +0 -0
  52. {alloc-0.0.4 → alloc-0.0.5}/tests/test_probe_multi.py +0 -0
  53. {alloc-0.0.4 → alloc-0.0.5}/tests/test_stability.py +0 -0
  54. {alloc-0.0.4 → alloc-0.0.5}/tests/test_upload.py +0 -0
  55. {alloc-0.0.4 → alloc-0.0.5}/tests/test_verdict.py +0 -0
  56. {alloc-0.0.4 → alloc-0.0.5}/tests/test_yaml_config.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.4
+Version: 0.0.5
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "alloc"
-version = "0.0.4"
+version = "0.0.5"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
 license = "Apache-2.0"
src/alloc/__init__.py
@@ -2,7 +2,7 @@

 from __future__ import annotations

-__version__ = "0.0.4"
+__version__ = "0.0.5"

 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback
src/alloc/browser_auth.py
@@ -11,6 +11,7 @@ from __future__ import annotations

 import base64
 import hashlib
+import html
 import secrets
 import socket
 import threading

@@ -40,7 +41,8 @@ def _find_open_port(start=17256, attempts=20):
     for port in range(start, start + attempts):
         try:
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-                s.bind(("127.0.0.1", port))
+                # Bind to all interfaces so both localhost and 127.0.0.1 work
+                s.bind(("0.0.0.0", port))
                 return port
         except OSError:
             continue

@@ -69,7 +71,7 @@ class _CallbackHandler(BaseHTTPRequestHandler):
             self._respond(
                 400,
                 "<html><body style='font-family:system-ui;text-align:center;padding:60px'>"
-                f"<h2>Login failed</h2><p>{error_desc}</p>"
+                f"<h2>Login failed</h2><p>{html.escape(error_desc)}</p>"
                 "</body></html>",
             )
         else:

@@ -108,7 +110,8 @@ def browser_login(
     verifier, challenge = _generate_pkce_pair()
     port = _find_open_port()

-    redirect_uri = f"http://localhost:{port}/callback"
+    # Use 127.0.0.1 (not localhost) — more reliable, avoids IPv6 resolution issues.
+    redirect_uri = f"http://127.0.0.1:{port}/callback"

     authorize_params = urlencode({
         "provider": provider,

@@ -118,7 +121,8 @@ def browser_login(
     })
     authorize_url = f"{supabase_url}/auth/v1/authorize?{authorize_params}"

-    server = HTTPServer(("127.0.0.1", port), _CallbackHandler)
+    # Bind to 0.0.0.0 so both localhost and 127.0.0.1 reach the server.
+    server = HTTPServer(("0.0.0.0", port), _CallbackHandler)
     server.auth_code = None  # type: ignore[attr-defined]
     server.auth_error = None  # type: ignore[attr-defined]
     server.timeout = 1  # poll interval for handle_request()

@@ -128,16 +132,21 @@ def browser_login(
     server_thread.daemon = True
     server_thread.start()

+    import sys
+
     # Open the browser (or print URL as fallback).
     try:
         opened = webbrowser.open(authorize_url)
     except Exception:
         opened = False

+    print(f"\nCallback server listening on http://127.0.0.1:{port}/callback", file=sys.stderr)
     if not opened:
         print(f"\nOpen this URL in your browser to log in:\n\n  {authorize_url}\n")
     else:
-        print("Opened browser for login. Waiting for callback...")
+        print("Opened browser for login. Waiting for callback...", file=sys.stderr)
+        print("If login completes but the terminal stays stuck, your Supabase", file=sys.stderr)
+        print(f"redirect allowlist may not include http://127.0.0.1:{port}/callback", file=sys.stderr)

     server_thread.join(timeout=timeout_seconds + 5)
     server.server_close()

@@ -146,7 +155,20 @@ def browser_login(
         raise RuntimeError(f"OAuth error: {server.auth_error}")

     if not server.auth_code:
-        raise RuntimeError("Login timed out — no callback received within 120 seconds.")
+        raise RuntimeError(
+            f"Login timed out — no callback received within {timeout_seconds} seconds.\n"
+            f"\n"
+            f"  The browser never reached http://127.0.0.1:{port}/callback.\n"
+            f"\n"
+            f"  Common causes:\n"
+            f"    1. Supabase redirect allowlist does not include http://127.0.0.1:{port}/**\n"
+            f"       (Check: Supabase Dashboard → Authentication → URL Configuration → Redirect URLs)\n"
+            f"    2. Browser redirected to your site URL instead of localhost\n"
+            f"    3. Firewall or antivirus blocked the local callback server\n"
+            f"\n"
+            f"  Workaround: alloc login --method token --token <paste-access-token>\n"
+            f"  (Copy token from browser DevTools → Application → Local Storage → sb-*-auth-token)"
+        )

     # Exchange auth code + verifier for tokens.
     with httpx.Client(timeout=15) as client:
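Note: `_generate_pkce_pair()` itself is outside this diff. For readers unfamiliar with the flow, a minimal stdlib sketch of a standard RFC 7636 S256 pair (hypothetical helper name; the package's actual implementation may differ):

    import base64
    import hashlib
    import secrets

    def generate_pkce_pair() -> tuple[str, str]:
        # Verifier: 43+ chars of URL-safe randomness (RFC 7636 section 4.1)
        verifier = base64.urlsafe_b64encode(secrets.token_bytes(32)).rstrip(b"=").decode("ascii")
        # Challenge: BASE64URL(SHA256(verifier)) with padding stripped (section 4.2)
        digest = hashlib.sha256(verifier.encode("ascii")).digest()
        challenge = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
        return verifier, challenge

The challenge travels in the authorize URL; the verifier stays local and is sent only in the final token exchange, so intercepting the callback code alone is not enough to mint tokens.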
src/alloc/callbacks.py
@@ -138,7 +138,8 @@ def _detect_architecture(model, optimizer=None, training_args=None):
         "mistral", "qwen2", "phi", "gemma", "falcon",
         "bert", "roberta", "t5", "bart", "mbart",
         "whisper", "wav2vec2", "vit", "deit", "beit",
-        "swin", "clip", "dinov2"},
+        "swin", "clip", "dinov2", "deepseek",
+        "starcoder2", "cohere", "mamba"},
     "moe": {"mixtral", "switch_transformers"},
     "diffusion": {"unet_2d_condition"},
 }

@@ -389,9 +390,19 @@ class _NvmlMonitor:
                 if 0 <= idx < physical_count:
                     visible_indices.append(idx)
             except ValueError:
-                # UUID-style device identifiers — fall back to physical count
-                visible_indices = list(range(physical_count))
-                break
+                # UUID-style device identifiers — try NVML UUID matching
+                try:
+                    for phys_idx in range(physical_count):
+                        handle = self._pynvml.nvmlDeviceGetHandleByIndex(phys_idx)
+                        uuid = self._pynvml.nvmlDeviceGetUUID(handle)
+                        if isinstance(uuid, bytes):
+                            uuid = uuid.decode("utf-8", errors="replace")
+                        if d in uuid:
+                            visible_indices.append(phys_idx)
+                            break
+                except Exception:
+                    visible_indices = list(range(physical_count))
+                    break
         gpu_indices = visible_indices if visible_indices else list(range(physical_count))
     else:
         gpu_indices = list(range(physical_count))

@@ -687,8 +698,14 @@ def _write_full_artifact(monitor, sidecar_data, step_times_raw=None):
     if sidecar_data.get("is_distributed"):
         probe_dict["is_distributed"] = True
         rank = sidecar_data.get("rank", 0)
+        world_size = sidecar_data.get("world_size", 1)
         probe_dict["rank"] = rank
-        probe_dict["world_size"] = sidecar_data.get("world_size", 1)
+        probe_dict["world_size"] = world_size
+        # Set num_gpus_detected to world_size so the artifact reflects
+        # the full distributed topology, not just the local GPU count.
+        probe_dict["num_gpus_detected"] = max(
+            probe_dict.get("num_gpus_detected", 1), world_size
+        )
         if rank > 0:
            output_path = "alloc_artifact_rank{}.json.gz".format(rank)
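The UUID branch above handles `CUDA_VISIBLE_DEVICES=GPU-...` entries, which `int()` cannot parse. As a standalone illustration of the same matching, assuming pynvml is installed (older pynvml releases return `bytes` from `nvmlDeviceGetUUID`, hence the decode):

    import os
    import pynvml

    def visible_indices_from_uuids() -> list[int]:
        """Map UUID-style CUDA_VISIBLE_DEVICES entries to physical NVML indices."""
        pynvml.nvmlInit()
        try:
            count = pynvml.nvmlDeviceGetCount()
            entries = [e.strip() for e in os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",") if e.strip()]
            matched = []
            for entry in entries:
                for idx in range(count):
                    handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
                    uuid = pynvml.nvmlDeviceGetUUID(handle)
                    if isinstance(uuid, bytes):
                        uuid = uuid.decode("utf-8", errors="replace")
                    if entry in uuid:
                        matched.append(idx)
                        break
            return matched or list(range(count))
        finally:
            pynvml.nvmlShutdown()

The substring test (`entry in uuid`) tolerates both the `GPU-` prefix and truncated UUIDs, mirroring the diff's `if d in uuid` check.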
src/alloc/cli.py
@@ -370,6 +370,36 @@ def run(
     callback_data = _read_callback_data()
     step_count = callback_data.get("step_count") if callback_data else None

+    # Auto-merge per-rank callback artifacts for distributed runs.
+    # When DDP callbacks write alloc_artifact_rank{N}.json.gz alongside the
+    # main artifact, merge them to get per-rank peaks and straggler data.
+    try:
+        from alloc.artifact_loader import find_rank_artifacts, merge_artifacts, load_artifact
+        rank_files = find_rank_artifacts(".")
+        if rank_files:
+            # Include rank 0 artifact if it exists
+            main_artifact_path = os.path.join(".", "alloc_artifact.json.gz")
+            all_paths = ([main_artifact_path] if os.path.exists(main_artifact_path) else []) + rank_files
+            if len(all_paths) > 1:
+                merged = merge_artifacts(all_paths)
+                if merged is not None:
+                    # Enrich probe result with merged multi-GPU data
+                    result.num_gpus_detected = max(result.num_gpus_detected, merged.gpu_count or len(all_paths))
+                    if merged.per_rank_peak_vram_mb:
+                        result.per_gpu_peak_vram_mb = merged.per_rank_peak_vram_mb
+                    # Use merged step timing if probe didn't capture it
+                    if callback_data is None:
+                        callback_data = {}
+                    if merged.step_time_p50_ms and not callback_data.get("step_time_ms_p50"):
+                        callback_data["step_time_ms_p50"] = merged.step_time_p50_ms
+                    if merged.step_time_p90_ms and not callback_data.get("step_time_ms_p90"):
+                        callback_data["step_time_ms_p90"] = merged.step_time_p90_ms
+                    if merged.throughput_samples_per_sec and not callback_data.get("samples_per_sec"):
+                        callback_data["samples_per_sec"] = merged.throughput_samples_per_sec
+                    step_count = step_count or callback_data.get("step_count")
+    except Exception:
+        pass  # Never crash on merge failure
+
     # Discover environment context (git, container, Ray)
     from alloc.context import discover_context
     env_context = discover_context()

@@ -2117,6 +2147,13 @@ def login(
     ),
 ):
     """Authenticate with Alloc dashboard."""
+    # Suppress noisy third-party warnings (urllib3 LibreSSL, pynvml deprecation)
+    # that clutter the auth flow output.
+    import warnings
+    warnings.filterwarnings("ignore", category=DeprecationWarning, module="pynvml")
+    warnings.filterwarnings("ignore", message=".*LibreSSL.*", module="urllib3")
+    warnings.filterwarnings("ignore", message=".*pynvml.*", category=FutureWarning)
+
     import httpx
     from alloc.config import get_supabase_url, get_supabase_anon_key, load_config, save_config

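`find_rank_artifacts` and `merge_artifacts` live in `alloc/artifact_loader.py`, which is unchanged in this release and not shown here. A rough sketch of the merge idea under an assumed schema (gzipped JSON with a `peak_vram_mb` field per artifact; the real format may differ):

    import glob
    import gzip
    import json
    import os

    def merge_rank_artifacts(directory: str = ".") -> dict | None:
        """Combine alloc_artifact_rank{N}.json.gz files into one summary (sketch)."""
        paths = sorted(glob.glob(os.path.join(directory, "alloc_artifact_rank*.json.gz")))
        main = os.path.join(directory, "alloc_artifact.json.gz")
        if os.path.exists(main):
            paths.insert(0, main)  # rank 0 writes the unsuffixed artifact
        if len(paths) < 2:
            return None
        per_rank_peaks = []
        for path in paths:
            with gzip.open(path, "rt") as f:
                data = json.load(f)
            per_rank_peaks.append(data.get("peak_vram_mb", 0))
        return {
            "gpu_count": len(paths),
            "per_rank_peak_vram_mb": per_rank_peaks,
            "peak_vram_mb": max(per_rank_peaks),  # the worst rank bounds the fit
        }

Taking the max across ranks matters because rank 0 often carries extra state (logging, checkpoint staging), so the unsuffixed artifact alone can understate the fleet-wide peak.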
src/alloc/code_analyzer.py
@@ -576,6 +576,56 @@ def _find_distributed(
             backend=None,
         ))

+    # Lightning: pl.Trainer(...), Trainer from pytorch_lightning/lightning.pytorch
+    _lightning_prefixes = ("pytorch_lightning", "lightning.pytorch", "lightning")
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.Call):
+            continue
+        fqn = _resolve_call_name(node, imports)
+        if fqn is None:
+            continue
+        # Direct match: pytorch_lightning.Trainer(...) or lightning.pytorch.Trainer(...)
+        if any(fqn.startswith(pfx) for pfx in _lightning_prefixes) and "Trainer" in fqn:
+            if not any(d.kind == "lightning" for d in results):
+                results.append(DistributedFinding(
+                    location=_loc(script_path, node, lines),
+                    kind="lightning",
+                    backend=None,
+                ))
+            break
+        # Import-resolved: Trainer imported from lightning
+        if fqn == "Trainer" or fqn.endswith(".Trainer"):
+            src = imports.get("Trainer", "")
+            if any(src.startswith(pfx) for pfx in _lightning_prefixes):
+                if not any(d.kind == "lightning" for d in results):
+                    results.append(DistributedFinding(
+                        location=_loc(script_path, node, lines),
+                        kind="lightning",
+                        backend=None,
+                    ))
+                break
+
+    # LightningModule subclass detection
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.ClassDef):
+            continue
+        for base in node.bases:
+            base_name = None
+            if isinstance(base, ast.Name):
+                base_name = base.id
+            elif isinstance(base, ast.Attribute):
+                base_name = base.attr
+            if base_name == "LightningModule":
+                src = imports.get("LightningModule", "")
+                if not src or any(src.startswith(pfx) for pfx in _lightning_prefixes):
+                    if not any(d.kind == "lightning" for d in results):
+                        results.append(DistributedFinding(
+                            location=_loc(script_path, node, lines),
+                            kind="lightning",
+                            backend=None,
+                        ))
+                break
+
     return results

@@ -973,6 +1023,10 @@ def _merge_imported_findings(
     for opt in _find_optimizers(tree, imports, lines, imported_path):
         main_findings.optimizers.append(opt)

+    # Merge fine-tuning findings
+    for ft in _find_fine_tuning(tree, imports, lines, imported_path):
+        main_findings.fine_tuning.append(ft)
+
     # Extract TrainingArguments from imported file
     sub_findings = CodeFindings(script_path=imported_path)
     sub_findings.imports = imports
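The Lightning detection above leans on the analyzer's own `_resolve_call_name` and `DistributedFinding` types. A self-contained approximation of the core idea, handling only `from X import Trainer` and `import X as pl` forms:

    import ast

    LIGHTNING_PREFIXES = ("pytorch_lightning", "lightning.pytorch", "lightning")

    def uses_lightning_trainer(source: str) -> bool:
        """Detect Trainer(...) calls bound to a Lightning import (simplified sketch)."""
        tree = ast.parse(source)
        imports: dict[str, str] = {}
        for node in ast.walk(tree):
            if isinstance(node, ast.ImportFrom) and node.module:
                for alias in node.names:
                    imports[alias.asname or alias.name] = f"{node.module}.{alias.name}"
            elif isinstance(node, ast.Import):
                for alias in node.names:
                    imports[alias.asname or alias.name] = alias.name
        for node in ast.walk(tree):
            if not isinstance(node, ast.Call):
                continue
            fn = node.func
            if isinstance(fn, ast.Name):
                fqn = imports.get(fn.id, fn.id)
            elif isinstance(fn, ast.Attribute) and isinstance(fn.value, ast.Name):
                fqn = imports.get(fn.value.id, fn.value.id) + "." + fn.attr
            else:
                continue
            if fqn.endswith(".Trainer") and fqn.startswith(LIGHTNING_PREFIXES):
                return True
        return False

    # uses_lightning_trainer("import lightning.pytorch as pl\npl.Trainer()")  -> True
    # uses_lightning_trainer("from pytorch_lightning import Trainer\nTrainer()")  -> True

The real analyzer also deduplicates findings and records source locations, which this sketch omits.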
src/alloc/config.py
@@ -40,8 +40,9 @@ def save_config(data: dict) -> None:
         cfg_file = _config_file()
         cfg_file.write_text(json.dumps(data, indent=2) + "\n")
         os.chmod(cfg_file, 0o600)
-    except Exception:
-        pass
+    except Exception as e:
+        import sys
+        print(f"Warning: could not secure config file permissions: {e}", file=sys.stderr)


 def get_token() -> str:
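A side note on the write-then-chmod pattern above: creating the file with restrictive permissions up front closes the brief window where a freshly created config is readable under the default umask. A sketch of that variant (not what the package currently does):

    import json
    import os

    def write_config_securely(path: str, data: dict) -> None:
        # 0o600 is applied at creation time; if the file already exists,
        # O_TRUNC keeps its existing mode, so there is no chmod race window.
        fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            f.write(json.dumps(data, indent=2) + "\n")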
src/alloc/diagnosis_display.py
@@ -443,23 +443,46 @@ def print_diagnose_efficiency(result: DiagnoseResult) -> None:
     console.print(f" Step time (p50): {p50:.1f} ms")
     console.print()

-    # Visual bar
-    compute_w = int(eff["compute_pct"] / 100 * 48)
-    data_w = int(eff["data_loading_pct"] / 100 * 48)
-    other_w = max(0, 48 - compute_w - data_w)
-
-    bar = (
-        "[green]" + "█" * compute_w + "[/green]"
-        + "[yellow]" + "█" * data_w + "[/yellow]"
-        + "[dim]" + "░" * other_w + "[/dim]"
-    )
-    console.print(f" {bar}")
-    label = f" [green]Compute: {eff['compute_pct']:.0f}%[/green]"
-    if eff["data_loading_pct"] > 0:
-        label += f" [yellow]Data: {eff['data_loading_pct']:.0f}%[/yellow]"
-    if eff["other_pct"] > 0:
-        label += f" [dim]Other: {eff['other_pct']:.0f}%[/dim]"
-    console.print(label)
+    # Visual bar — layout depends on source (cuda_events vs wall_clock)
+    is_cuda = eff.get("source") == "cuda_events"
+
+    if is_cuda:
+        fwd_w = int(eff["forward_pct"] / 100 * 48)
+        bwd_w = int(eff["backward_pct"] / 100 * 48)
+        opt_w = int(eff["optimizer_pct"] / 100 * 48)
+        dl_w = max(0, 48 - fwd_w - bwd_w - opt_w)
+
+        bar = (
+            "[green]" + "█" * fwd_w + "[/green]"
+            + "[cyan]" + "█" * bwd_w + "[/cyan]"
+            + "[magenta]" + "█" * opt_w + "[/magenta]"
+            + "[yellow]" + "█" * dl_w + "[/yellow]"
+        )
+        console.print(f" {bar}")
+        label = f" [green]Forward: {eff['forward_pct']:.0f}%[/green]"
+        label += f" [cyan]Backward: {eff['backward_pct']:.0f}%[/cyan]"
+        if eff["optimizer_pct"] > 0:
+            label += f" [magenta]Optimizer: {eff['optimizer_pct']:.0f}%[/magenta]"
+        if eff["data_loading_pct"] > 0:
+            label += f" [yellow]Data: {eff['data_loading_pct']:.0f}%[/yellow]"
+        console.print(label)
+    else:
+        compute_w = int(eff["compute_pct"] / 100 * 48)
+        data_w = int(eff["data_loading_pct"] / 100 * 48)
+        other_w = max(0, 48 - compute_w - data_w)
+
+        bar = (
+            "[green]" + "█" * compute_w + "[/green]"
+            + "[yellow]" + "█" * data_w + "[/yellow]"
+            + "[dim]" + "░" * other_w + "[/dim]"
+        )
+        console.print(f" {bar}")
+        label = f" [green]Compute: {eff['compute_pct']:.0f}%[/green]"
+        if eff["data_loading_pct"] > 0:
+            label += f" [yellow]Data: {eff['data_loading_pct']:.0f}%[/yellow]"
+        if eff["other_pct"] > 0:
+            label += f" [dim]Other: {eff['other_pct']:.0f}%[/dim]"
+        console.print(label)
     console.print()

     # Component table

@@ -468,14 +491,28 @@ def print_diagnose_efficiency(result: DiagnoseResult) -> None:
     table.add_column("Time (est.)", justify="right", style="bold")
     table.add_column("Notes", style="dim")

-    table.add_row("GPU compute", f"{eff['compute_ms']:.1f} ms", f"{eff['compute_pct']:.0f}% of step")
-    if eff["data_loading_pct"] > 0:
-        dl_note = f"{eff['data_loading_pct']:.0f}%"
-        if eff["data_loading_pct"] > 20:
-            dl_note += " — bottleneck candidate"
-        table.add_row("Data loading", f"{eff['data_loading_ms']:.1f} ms", dl_note)
-    if eff["other_pct"] > 0:
-        table.add_row("Other/overhead", f"{eff['other_ms']:.1f} ms", f"{eff['other_pct']:.0f}%")
+    if is_cuda:
+        table.add_row("Forward", f"{eff['forward_ms']:.1f} ms", f"{eff['forward_pct']:.0f}% of step")
+        table.add_row("Backward", f"{eff['backward_ms']:.1f} ms", f"{eff['backward_pct']:.0f}% of step")
+        if eff["optimizer_pct"] > 0:
+            opt_note = f"{eff['optimizer_pct']:.0f}%"
+            if eff["optimizer_pct"] > 30:
+                opt_note += " — bottleneck candidate"
+            table.add_row("Optimizer", f"{eff['optimizer_ms']:.1f} ms", opt_note)
+        if eff["data_loading_pct"] > 0:
+            dl_note = f"{eff['data_loading_pct']:.0f}%"
+            if eff["data_loading_pct"] > 30:
+                dl_note += " — bottleneck candidate"
+            table.add_row("Data loading", f"{eff['data_loading_ms']:.1f} ms", dl_note)
+    else:
+        table.add_row("GPU compute", f"{eff['compute_ms']:.1f} ms", f"{eff['compute_pct']:.0f}% of step")
+        if eff["data_loading_pct"] > 0:
+            dl_note = f"{eff['data_loading_pct']:.0f}%"
+            if eff["data_loading_pct"] > 20:
+                dl_note += " — bottleneck candidate"
+            table.add_row("Data loading", f"{eff['data_loading_ms']:.1f} ms", dl_note)
+        if eff["other_pct"] > 0:
+            table.add_row("Other/overhead", f"{eff['other_ms']:.1f} ms", f"{eff['other_pct']:.0f}%")

     console.print(table)
     console.print()

@@ -500,9 +537,19 @@ def _print_efficiency_plain(result: DiagnoseResult) -> None:
     print(f"\n Efficiency breakdown (estimated)")
     print(f" Step time (p50): {eff['step_time_p50_ms']:.1f} ms")
     print(f" {'─' * 40}")
-    print(f" GPU compute:  {eff['compute_ms']:>8.1f} ms ({eff['compute_pct']:.0f}%)")
-    if eff["data_loading_pct"] > 0:
-        print(f" Data loading: {eff['data_loading_ms']:>8.1f} ms ({eff['data_loading_pct']:.0f}%)")
+
+    if eff.get("source") == "cuda_events":
+        print(f" Forward:      {eff['forward_ms']:>8.1f} ms ({eff['forward_pct']:.0f}%)")
+        print(f" Backward:     {eff['backward_ms']:>8.1f} ms ({eff['backward_pct']:.0f}%)")
+        if eff["optimizer_pct"] > 0:
+            print(f" Optimizer:    {eff['optimizer_ms']:>8.1f} ms ({eff['optimizer_pct']:.0f}%)")
+        if eff["data_loading_pct"] > 0:
+            print(f" Data loading: {eff['data_loading_ms']:>8.1f} ms ({eff['data_loading_pct']:.0f}%)")
+    else:
+        print(f" GPU compute:  {eff['compute_ms']:>8.1f} ms ({eff['compute_pct']:.0f}%)")
+        if eff["data_loading_pct"] > 0:
+            print(f" Data loading: {eff['data_loading_ms']:>8.1f} ms ({eff['data_loading_pct']:.0f}%)")
+
     bn = eff.get("bottleneck")
     if bn:
         print(f"\n Bottleneck: {bn}")
src/alloc/diagnosis_engine.py
@@ -185,8 +185,8 @@ def _build_comparison(current: ArtifactData, previous: ArtifactData) -> Dict:
     })

     # Peak VRAM
-    cur_peak = max(current.per_gpu_vram_used_mb) if current.per_gpu_vram_used_mb else current.peak_vram_mb
-    prev_peak = max(previous.per_gpu_vram_used_mb) if previous.per_gpu_vram_used_mb else previous.peak_vram_mb
+    cur_peak = max(current.per_gpu_vram_used_mb) if current.per_gpu_vram_used_mb and len(current.per_gpu_vram_used_mb) > 0 else current.peak_vram_mb
+    prev_peak = max(previous.per_gpu_vram_used_mb) if previous.per_gpu_vram_used_mb and len(previous.per_gpu_vram_used_mb) > 0 else previous.peak_vram_mb
     _add("Peak VRAM", cur_peak, prev_peak, "MB", higher_is_worse=True)

     # Step time
src/alloc/diagnosis_rules.py
@@ -289,7 +289,9 @@ def rule_dl005_main_thread(
     """
     results = []
     gpu_count = (hw or {}).get("gpu_count", 1) or 1
-    recommended = max(4, gpu_count * 2)
+    cpu_cores = os.cpu_count() or 4
+    per_gpu_cores = max(1, cpu_cores // max(gpu_count, 1))
+    recommended = max(4, min(gpu_count * 2, per_gpu_cores))

     for dl in findings.dataloaders:
         if dl.num_workers != 0:

@@ -428,7 +430,7 @@ def rule_mem005_no_torch_compile(
     return [Diagnosis(
         rule_id="MEM005",
         severity="info",
-        category="throughput",
+        category="memory",
         title="torch.compile not used",
         file_path=findings.script_path,
         line_number=0,

@@ -446,7 +448,7 @@ def rule_mem005_no_torch_compile(
     return [Diagnosis(
         rule_id="MEM005",
         severity="info",
-        category="throughput",
+        category="memory",
         title="torch.compile not used",
         file_path=findings.script_path,
         line_number=0,
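The new DL005 heuristic keeps the classic "2 workers per GPU" rule but caps it by the cores each GPU can realistically claim. Restated from the diff as a standalone function (hypothetical name):

    import os

    def recommended_num_workers(gpu_count: int) -> int:
        """2 workers per GPU, capped by cores available per GPU, with a floor of 4."""
        cpu_cores = os.cpu_count() or 4
        per_gpu_cores = max(1, cpu_cores // max(gpu_count, 1))
        return max(4, min(gpu_count * 2, per_gpu_cores))

    # 64 cores, 8 GPUs: min(16, 8) = 8  -> 8 workers per DataLoader
    # 4 cores, 1 GPU:   min(2, 4)  = 2  -> floor of 4 wins

Without the cap, an 8-GPU node with 32 cores would be told to spawn 16 workers per process (128 total), oversubscribing the CPUs it was meant to feed.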
src/alloc/probe.py
@@ -8,6 +8,7 @@ Graceful no-op if pynvml is not installed or no GPU is available.

 from __future__ import annotations

+import os
 import signal
 import subprocess
 import sys

@@ -112,16 +113,38 @@ def _discover_gpu_indices(proc_pid, pynvml, fallback_index=0):
     except Exception:
         return [fallback_index]

-    # Collect target PIDs: the main process + its children
+    # Respect CUDA_VISIBLE_DEVICES — only search visible GPUs
+    cvd = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
+    if cvd:
+        visible_physical = []
+        for d in cvd.split(","):
+            d = d.strip()
+            if d:
+                try:
+                    idx = int(d)
+                    if 0 <= idx < device_count:
+                        visible_physical.append(idx)
+                except ValueError:
+                    visible_physical = list(range(device_count))
+                    break
+        search_indices = visible_physical if visible_physical else list(range(device_count))
+    else:
+        search_indices = list(range(device_count))
+
+    # Collect target PIDs: the main process + descendants (3 levels deep).
+    # torchrun uses: torchrun → elastic_agent → worker processes,
+    # so we need at least 3 levels to find DDP worker GPUs.
     target_pids = {proc_pid}
     for child in _get_child_pids(proc_pid):
         target_pids.add(child)
-        # Also check grandchildren (common with torchrun/accelerate)
         for grandchild in _get_child_pids(child):
             target_pids.add(grandchild)
+            # Great-grandchildren: covers torchrun elastic launch wrapper
+            for ggchild in _get_child_pids(grandchild):
+                target_pids.add(ggchild)

     found_indices = []
-    for idx in range(device_count):
+    for idx in search_indices:
         try:
             handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
             procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)

@@ -284,10 +307,27 @@ def probe_command(

     handles = [handle]
     discovery_done = False
+    discovery_attempts = 0
+    max_discovery_attempts = 3  # Retry at samples 5, 15, 30
+
+    # Determine expected GPU count from environment for retry logic
+    expected_gpus = 1
+    ws = os.environ.get("WORLD_SIZE", "").strip()
+    if ws:
+        try:
+            expected_gpus = max(1, int(ws))
+        except ValueError:
+            pass

     while not stop_event.is_set():
-        # After 5 samples, try to discover all GPUs used by the process
-        if not discovery_done and len(samples) >= 5 and proc.pid:
+        # Retry GPU discovery at samples 5, 15, 30.
+        # Keep retrying if we haven't found all expected GPUs yet
+        discovery_thresholds = [5, 15, 30]
+        if (not discovery_done
+                and discovery_attempts < max_discovery_attempts
+                and len(samples) >= discovery_thresholds[discovery_attempts]
+                and proc.pid):
+            discovery_attempts += 1
             try:
                 discovered = _discover_gpu_indices(proc.pid, pynvml, fallback_index=gpu_index)
                 if len(discovered) > 1:

@@ -303,7 +343,9 @@ def probe_command(
                 pass
             # Detect interconnect type between discovered GPUs
             detected_ic_ref[0] = _detect_interconnect(handles, pynvml)
-            discovery_done = True
+            # Stop retrying if we found expected count or exhausted attempts
+            if num_gpus_ref[0] >= expected_gpus or discovery_attempts >= max_discovery_attempts:
+                discovery_done = True

         # Sample from all monitored GPUs — aggregate: peak vram = max, util/power = mean
         try:

@@ -414,6 +456,18 @@ def probe_command(
     if calibration_time_ref[0] is not None:
         cal_duration = round(calibration_time_ref[0] - start_time, 2)

+    # Environment-based fallback: if NVML discovery found fewer GPUs than
+    # WORLD_SIZE indicates, trust the environment. The probe may miss GPUs
+    # due to DDP per-rank CVD isolation or timing races.
+    env_world = os.environ.get("WORLD_SIZE", "").strip()
+    if env_world:
+        try:
+            ws = int(env_world)
+            if ws > num_gpus_ref[0]:
+                num_gpus_ref[0] = ws
+        except ValueError:
+            pass
+
     if not samples:
         return ProbeResult(
             duration_seconds=round(duration, 2),
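The fixed-depth child/grandchild/great-grandchild walk matches the torchrun → elastic_agent → worker chain. If a psutil dependency were acceptable, the whole subtree comes in one call (a sketch; `_get_child_pids` itself is not shown in this diff and presumably walks /proc by hand):

    import psutil  # third-party; pip install psutil

    def descendant_pids(root_pid: int) -> set[int]:
        """All PIDs under root_pid, any depth — covers launcher/agent/worker chains."""
        try:
            root = psutil.Process(root_pid)
            return {root_pid} | {p.pid for p in root.children(recursive=True)}
        except psutil.NoSuchProcess:
            return {root_pid}

The fixed-depth approach trades completeness for zero dependencies; three levels is enough for torchrun and accelerate, but a launcher that adds another wrapper layer would slip past it.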
src/alloc.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.4
+Version: 0.0.5
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
tests/test_cli.py
@@ -234,6 +234,7 @@ def test_status_json_no_artifact(tmp_path, monkeypatch):
     import json
     monkeypatch.chdir(tmp_path)
     monkeypatch.delenv("ALLOC_TOKEN", raising=False)
+    monkeypatch.setenv("HOME", str(tmp_path))  # isolate from real ~/.alloc/config.json
     result = runner.invoke(app, ["status", "--json"])
     assert result.exit_code == 0
     data = json.loads(result.output.strip())

@@ -325,6 +326,7 @@ def test_status_not_logged_in(tmp_path, monkeypatch):
     """alloc status without token shows not-logged-in state."""
     monkeypatch.chdir(tmp_path)
     monkeypatch.delenv("ALLOC_TOKEN", raising=False)
+    monkeypatch.setenv("HOME", str(tmp_path))  # isolate from real ~/.alloc/config.json
     result = runner.invoke(app, ["status"])
     assert result.exit_code == 0
     out = _plain(result.output)
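Overriding HOME isolates any lookup that resolves through `os.path.expanduser` or `Path.home()` (on Windows, USERPROFILE takes precedence, so CI there may need both). If more tests need the same isolation, an autouse fixture would keep it in one place (a sketch, not part of this diff):

    import pytest

    @pytest.fixture(autouse=True)
    def isolated_home(tmp_path, monkeypatch):
        """Point HOME at a temp dir so the real ~/.alloc/config.json is never read."""
        monkeypatch.setenv("HOME", str(tmp_path))
        monkeypatch.delenv("ALLOC_TOKEN", raising=False)
        yield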