PyPI - alloc - Versions diffs - 0.0.13__tar.gz → 0.0.15__tar.gz - Mend

alloc-0.0.13.tar.gz → alloc-0.0.15.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

{alloc-0.0.13 → alloc-0.0.15}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.13
+Version: 0.0.15
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0

{alloc-0.0.13 → alloc-0.0.15}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "alloc"
-version = "0.0.13"
+version = "0.0.15"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
 license = "Apache-2.0"

{alloc-0.0.13 → alloc-0.0.15}/src/alloc/__init__.py RENAMED Viewed

@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
 _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
 del _warnings
-__version__ = "0.0.13"
+__version__ = "0.0.15"
 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback

{alloc-0.0.13 → alloc-0.0.15}/src/alloc/cli.py RENAMED Viewed

@@ -564,8 +564,11 @@ def run(
             else:
                 console.print("[dim]Tip: alloc login --browser to connect your dashboard[/dim]")
+    # Propagate non-zero exit code — but NOT when calibrate mode
+    # intentionally killed the process (torchrun exits non-zero on SIGTERM)
     if result.exit_code and result.exit_code != 0:
-        raise typer.Exit(result.exit_code)
+        if result.stop_reason not in ("stable", "timeout"):
+            raise typer.Exit(result.exit_code)
 @app.command()
@@ -2115,6 +2118,13 @@ def scan(
     """Remote ghost scan via Alloc API — no GPU needed."""
     import httpx
+    # When --json, redirect console to stderr so nothing contaminates stdout.
+    from rich.console import Console as _Console
+    if json_output:
+        console = _Console(stderr=True)
+    else:
+        console = _Console()
     # Resolve param count from model name or explicit flag
     resolved_param_count = param_count_b or _model_to_params(model)
     if resolved_param_count is None:
@@ -2174,11 +2184,13 @@ def scan(
                     resp = client.post(f"{api_url}/scans", json=payload, headers=headers)
                 else:
                     # Token refresh failed — fall back to unauthenticated scan
-                    if not json_output:
-                        console.print(
-                            "[yellow]Session expired — falling back to public scan "
-                            "(org fleet context unavailable). Run `alloc login` to restore.[/yellow]",
-                        )
+                    # Always print to stderr (not stdout) so JSON output is clean
+                    import sys as _sys
+                    print(
+                        "Session expired — falling back to public scan "
+                        "(org fleet context unavailable). Run `alloc login` to restore.",
+                        file=_sys.stderr,
+                    )
                     del headers["Authorization"]
                     resp = client.post(f"{api_url}/scans/cli", json=payload, headers=headers)

{alloc-0.0.13 → alloc-0.0.15}/src/alloc/probe.py RENAMED Viewed

@@ -469,6 +469,8 @@ def probe_command(
                 pass
             handles = [handle]
+            # Map from handle index → physical GPU index (for per_gpu_peaks keying)
+            handle_gpu_indices = [gpu_index]
             discovery_done = False
             discovery_attempts = 0
             max_discovery_attempts = 3  # Retry at samples 5, 15, 30
@@ -487,6 +489,32 @@ def probe_command(
                 except ValueError:
                     pass
+            # Early-initialize handles for all expected GPUs so per_gpu_peaks
+            # is populated from sample 0 — don't depend on process-tree
+            # discovery timing. Discovery still runs for process_map and to
+            # confirm which specific GPUs are in use.
+            if expected_gpus > 1:
+                try:
+                    device_count = pynvml.nvmlDeviceGetCount()
+                    if device_count >= expected_gpus:
+                        early_handles = []
+                        early_indices = []
+                        for idx in range(device_count):
+                            if len(early_handles) >= expected_gpus:
+                                break
+                            try:
+                                h = pynvml.nvmlDeviceGetHandleByIndex(idx)
+                                early_handles.append(h)
+                                early_indices.append(idx)
+                            except Exception:
+                                pass
+                        if len(early_handles) >= expected_gpus:
+                            handles = early_handles
+                            handle_gpu_indices = early_indices
+                            num_gpus_ref[0] = len(handles)
+                except Exception:
+                    pass
             while not stop_event.is_set():
                 # Retry GPU discovery: at samples 5, 15, 30
                 # Keep retrying if we haven't found all expected GPUs yet
@@ -503,10 +531,12 @@ def probe_command(
                         )
                         if len(discovered) > 1:
                             handles = []
+                            handle_gpu_indices = []
                             pmap = []
                             for idx in discovered:
                                 h = pynvml.nvmlDeviceGetHandleByIndex(idx)
                                 handles.append(h)
+                                handle_gpu_indices.append(idx)
                                 pmap.append({"gpu_index": idx})
                             num_gpus_ref[0] = len(handles)
                             process_map_ref[0] = pmap
@@ -529,12 +559,13 @@ def probe_command(
                         discovery_done = True
                 # Sample from all monitored GPUs — aggregate: peak vram = max, util/power = mean
-                try:
-                    vram_vals = []
-                    util_vals = []
-                    power_vals = []
-                    total_mb = 0.0
-                    for h in handles:
+                # Per-GPU try/except: one bad handle must not prevent tracking others
+                vram_vals = []
+                util_vals = []
+                power_vals = []
+                total_mb = 0.0
+                for h in handles:
+                    try:
                         mi = pynvml.nvmlDeviceGetMemoryInfo(h)
                         ut = pynvml.nvmlDeviceGetUtilizationRates(h)
                         pw = pynvml.nvmlDeviceGetPowerUsage(h) / 1000.0
@@ -542,25 +573,28 @@ def probe_command(
                         util_vals.append(ut.gpu)
                         power_vals.append(pw)
                         total_mb = mi.total / (1024 * 1024)
+                    except Exception:
+                        pass
-                    # Track per-GPU peak VRAM (always, even single GPU —
-                    # discovery may expand handles later, and we need history from sample 0)
-                    pgp = per_gpu_peaks_ref[0]
-                    for gi, vm in enumerate(vram_vals):
-                        pgp[gi] = max(pgp.get(gi, 0.0), vm)
+                # Track per-GPU peak VRAM (always, even single GPU —
+                # discovery may expand handles later, and we need history from sample 0)
+                pgp = per_gpu_peaks_ref[0]
+                for gi, vm in enumerate(vram_vals):
+                    pgp[gi] = max(pgp.get(gi, 0.0), vm)
+                if vram_vals:
                     samples.append(ProbeSample(
                         timestamp=time.time(),
                         memory_used_mb=max(vram_vals),
                         memory_total_mb=total_mb,
-                        gpu_util_pct=sum(util_vals) / len(util_vals),
-                        power_watts=sum(power_vals) / len(power_vals),
+                        gpu_util_pct=sum(util_vals) / len(util_vals) if util_vals else 0.0,
+                        power_watts=sum(power_vals) / len(power_vals) if power_vals else 0.0,
                     ))
-                except Exception:
-                    pass
                 # Calibrate mode: auto-stop when stable
-                if calibrate and len(samples) > ramp_up_samples:
+                # Delay stability check until GPU discovery is complete —
+                # prevents calibrate-and-exit before finding all expected GPUs.
+                if calibrate and discovery_done and len(samples) > ramp_up_samples:
                     from alloc.stability import check_stability, RAMP_UP_SAMPLES
                     sr = check_stability(samples, poll_interval_ms=poll_interval_ms)
                     if sr.is_stable:
@@ -699,8 +733,8 @@ def probe_command(
         num_gpus_detected=num_gpus_ref[0],
         process_map=process_map_ref[0],
         per_gpu_peak_vram_mb=(
-            [round(per_gpu_peaks_ref[0].get(i, 0), 1) for i in range(num_gpus_ref[0])]
-            if num_gpus_ref[0] >= 1 and per_gpu_peaks_ref[0] else None
+            [round(per_gpu_peaks_ref[0][i], 1) for i in sorted(per_gpu_peaks_ref[0])]
+            if per_gpu_peaks_ref[0] else None
         ),
         detected_interconnect=detected_ic_ref[0],
     )

{alloc-0.0.13 → alloc-0.0.15}/src/alloc.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.13
+Version: 0.0.15
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0

{alloc-0.0.13 → alloc-0.0.15}/tests/test_probe_multi.py RENAMED Viewed

@@ -245,3 +245,118 @@ def test_active_gpu_fallback_not_used_without_expected():
         with patch("alloc.probe._read_child_env", return_value=None):
             result = _discover_gpu_indices(1000, mock, fallback_index=0)
     assert result == [0]  # Falls back to default
+# ── Early handle initialization for expected GPUs ──
+def test_early_init_opens_handles_for_expected_gpus():
+    """When expected_gpus > 1 and device_count >= expected, early-init should
+    open handles for all expected GPUs."""
+    mock_pynvml = MagicMock()
+    mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=2)
+    handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
+    mock_pynvml.nvmlDeviceGetHandleByIndex = MagicMock(side_effect=lambda i: handles_map[i])
+    # Simulate early-init logic from probe_command._monitor()
+    expected_gpus = 2
+    handles = [handles_map[0]]
+    if expected_gpus > 1:
+        device_count = mock_pynvml.nvmlDeviceGetCount()
+        if device_count >= expected_gpus:
+            early_handles = []
+            early_indices = []
+            for idx in range(device_count):
+                if len(early_handles) >= expected_gpus:
+                    break
+                h = mock_pynvml.nvmlDeviceGetHandleByIndex(idx)
+                early_handles.append(h)
+                early_indices.append(idx)
+            if len(early_handles) >= expected_gpus:
+                handles = early_handles
+    assert len(handles) == 2
+def test_early_init_skipped_when_fewer_devices():
+    """When device_count < expected_gpus, early-init should not change handles."""
+    mock_pynvml = MagicMock()
+    mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=1)
+    expected_gpus = 2
+    handles = [MagicMock(name="gpu0")]
+    original_handles = list(handles)
+    if expected_gpus > 1:
+        device_count = mock_pynvml.nvmlDeviceGetCount()
+        if device_count >= expected_gpus:
+            assert False, "Should not reach here"
+    assert len(handles) == 1
+def test_per_gpu_sampling_resilient_to_partial_failure():
+    """Per-GPU try/except: one GPU failure should not prevent others from
+    being sampled into per_gpu_peaks."""
+    mock_pynvml = MagicMock()
+    handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
+    mem_ok = MagicMock()
+    mem_ok.total = 24 * 1024 * 1024 * 1024
+    mem_ok.used = 8000 * 1024 * 1024
+    def mem_info_side_effect(h):
+        if h == handles_map[1]:
+            raise RuntimeError("GPU 1 memory read failed")
+        return mem_ok
+    mock_pynvml.nvmlDeviceGetMemoryInfo = MagicMock(side_effect=mem_info_side_effect)
+    util = MagicMock()
+    util.gpu = 80
+    mock_pynvml.nvmlDeviceGetUtilizationRates = MagicMock(return_value=util)
+    mock_pynvml.nvmlDeviceGetPowerUsage = MagicMock(return_value=100_000)
+    # Simulate the per-GPU sampling loop
+    handles = [handles_map[0], handles_map[1]]
+    per_gpu_peaks = {}
+    vram_vals = []
+    for h in handles:
+        try:
+            mi = mock_pynvml.nvmlDeviceGetMemoryInfo(h)
+            vram_vals.append(mi.used / (1024 * 1024))
+        except Exception:
+            pass
+    for gi, vm in enumerate(vram_vals):
+        per_gpu_peaks[gi] = max(per_gpu_peaks.get(gi, 0.0), vm)
+    # GPU 0 tracked, GPU 1 skipped
+    assert 0 in per_gpu_peaks
+    assert per_gpu_peaks[0] > 0
+    assert len(vram_vals) == 1
+def test_stability_delayed_until_discovery_done():
+    """Stability check requires discovery_done=True."""
+    # Single GPU: expected=1, num_gpus=1 → done immediately
+    assert 1 >= 1  # num_gpus >= expected
+    # Multi GPU with early-init: expected=2, num_gpus=2 → done at sample 5
+    assert 2 >= 2
+    # Multi GPU, discovery incomplete: expected=4, found=2 → NOT done
+    assert not (2 >= 4)
+def test_per_gpu_peaks_to_result_list():
+    """per_gpu_peaks dict should convert to sorted list for ProbeResult."""
+    peaks = {0: 8000.5, 1: 12000.3}
+    result = [round(peaks[i], 1) for i in sorted(peaks)] if peaks else None
+    assert result == [8000.5, 12000.3]
+    empty = {}
+    result_empty = [round(empty[i], 1) for i in sorted(empty)] if empty else None
+    assert result_empty is None