PyPI - alloc - Versions diffs - 0.0.14__tar.gz → 0.0.16__tar.gz - Mend

alloc 0.0.14.tar.gz → 0.0.16.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

{alloc-0.0.14 → alloc-0.0.16}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.14
+Version: 0.0.16
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0

{alloc-0.0.14 → alloc-0.0.16}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "alloc"
-version = "0.0.14"
+version = "0.0.16"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
 license = "Apache-2.0"

{alloc-0.0.14 → alloc-0.0.16}/src/alloc/__init__.py RENAMED Viewed

@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
 _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
 del _warnings
-__version__ = "0.0.14"
+__version__ = "0.0.16"
 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback

{alloc-0.0.14 → alloc-0.0.16}/src/alloc/cli.py RENAMED Viewed

@@ -2184,11 +2184,13 @@ def scan(
                     resp = client.post(f"{api_url}/scans", json=payload, headers=headers)
                 else:
                     # Token refresh failed — fall back to unauthenticated scan
-                    if not json_output:
-                        console.print(
-                            "[yellow]Session expired — falling back to public scan "
-                            "(org fleet context unavailable). Run `alloc login` to restore.[/yellow]",
-                        )
+                    # Always print to stderr (not stdout) so JSON output is clean
+                    import sys as _sys
+                    print(
+                        "Session expired — falling back to public scan "
+                        "(org fleet context unavailable). Run `alloc login` to restore.",
+                        file=_sys.stderr,
+                    )
                     del headers["Authorization"]
                     resp = client.post(f"{api_url}/scans/cli", json=payload, headers=headers)

{alloc-0.0.14 → alloc-0.0.16}/src/alloc/probe.py RENAMED Viewed

@@ -469,6 +469,8 @@ def probe_command(
                 pass
             handles = [handle]
+            # Map from handle index → physical GPU index (for per_gpu_peaks keying)
+            handle_gpu_indices = [gpu_index]
             discovery_done = False
             discovery_attempts = 0
             max_discovery_attempts = 3  # Retry at samples 5, 15, 30
@@ -487,6 +489,35 @@ def probe_command(
                 except ValueError:
                     pass
+            # Early-initialize handles for all expected GPUs so per_gpu_peaks
+            # is populated from sample 0 — don't depend on process-tree
+            # discovery timing. Discovery still runs for process_map and to
+            # confirm which specific GPUs are in use via PID matching.
+            # NOTE: Do NOT set num_gpus_ref here — that would satisfy
+            # discovery_done prematurely and prevent retries at samples 15/30.
+            early_init_indices = None  # type: Optional[list]
+            if expected_gpus > 1:
+                try:
+                    device_count = pynvml.nvmlDeviceGetCount()
+                    if device_count >= expected_gpus:
+                        early_handles = []
+                        early_indices = []
+                        for idx in range(device_count):
+                            if len(early_handles) >= expected_gpus:
+                                break
+                            try:
+                                h = pynvml.nvmlDeviceGetHandleByIndex(idx)
+                                early_handles.append(h)
+                                early_indices.append(idx)
+                            except Exception:
+                                pass
+                        if len(early_handles) >= expected_gpus:
+                            handles = early_handles
+                            handle_gpu_indices = early_indices
+                            early_init_indices = early_indices
+                except Exception:
+                    pass
             while not stop_event.is_set():
                 # Retry GPU discovery: at samples 5, 15, 30
                 # Keep retrying if we haven't found all expected GPUs yet
@@ -503,10 +534,12 @@ def probe_command(
                         )
                         if len(discovered) > 1:
                             handles = []
+                            handle_gpu_indices = []
                             pmap = []
                             for idx in discovered:
                                 h = pynvml.nvmlDeviceGetHandleByIndex(idx)
                                 handles.append(h)
+                                handle_gpu_indices.append(idx)
                                 pmap.append({"gpu_index": idx})
                             num_gpus_ref[0] = len(handles)
                             process_map_ref[0] = pmap
@@ -527,14 +560,21 @@ def probe_command(
                     # Stop retrying if we found expected count or exhausted attempts
                     if num_gpus_ref[0] >= expected_gpus or discovery_attempts >= max_discovery_attempts:
                         discovery_done = True
+                        # If discovery never confirmed multi-GPU via PID matching
+                        # but early-init opened handles for expected GPUs, generate
+                        # a fallback process_map from the early-init indices.
+                        if process_map_ref[0] is None and early_init_indices is not None:
+                            process_map_ref[0] = [{"gpu_index": idx} for idx in early_init_indices]
+                            num_gpus_ref[0] = len(early_init_indices)
                 # Sample from all monitored GPUs — aggregate: peak vram = max, util/power = mean
-                try:
-                    vram_vals = []
-                    util_vals = []
-                    power_vals = []
-                    total_mb = 0.0
-                    for h in handles:
+                # Per-GPU try/except: one bad handle must not prevent tracking others
+                vram_vals = []
+                util_vals = []
+                power_vals = []
+                total_mb = 0.0
+                for h in handles:
+                    try:
                         mi = pynvml.nvmlDeviceGetMemoryInfo(h)
                         ut = pynvml.nvmlDeviceGetUtilizationRates(h)
                         pw = pynvml.nvmlDeviceGetPowerUsage(h) / 1000.0
@@ -542,25 +582,28 @@ def probe_command(
                         util_vals.append(ut.gpu)
                         power_vals.append(pw)
                         total_mb = mi.total / (1024 * 1024)
+                    except Exception:
+                        pass
-                    # Track per-GPU peak VRAM (always, even single GPU —
-                    # discovery may expand handles later, and we need history from sample 0)
-                    pgp = per_gpu_peaks_ref[0]
-                    for gi, vm in enumerate(vram_vals):
-                        pgp[gi] = max(pgp.get(gi, 0.0), vm)
+                # Track per-GPU peak VRAM (always, even single GPU —
+                # discovery may expand handles later, and we need history from sample 0)
+                pgp = per_gpu_peaks_ref[0]
+                for gi, vm in enumerate(vram_vals):
+                    pgp[gi] = max(pgp.get(gi, 0.0), vm)
+                if vram_vals:
                     samples.append(ProbeSample(
                         timestamp=time.time(),
                         memory_used_mb=max(vram_vals),
                         memory_total_mb=total_mb,
-                        gpu_util_pct=sum(util_vals) / len(util_vals),
-                        power_watts=sum(power_vals) / len(power_vals),
+                        gpu_util_pct=sum(util_vals) / len(util_vals) if util_vals else 0.0,
+                        power_watts=sum(power_vals) / len(power_vals) if power_vals else 0.0,
                     ))
-                except Exception:
-                    pass
                 # Calibrate mode: auto-stop when stable
-                if calibrate and len(samples) > ramp_up_samples:
+                # Delay stability check until GPU discovery is complete —
+                # prevents calibrate-and-exit before finding all expected GPUs.
+                if calibrate and discovery_done and len(samples) > ramp_up_samples:
                     from alloc.stability import check_stability, RAMP_UP_SAMPLES
                     sr = check_stability(samples, poll_interval_ms=poll_interval_ms)
                     if sr.is_stable:

{alloc-0.0.14 → alloc-0.0.16}/src/alloc.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.14
+Version: 0.0.16
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0

{alloc-0.0.14 → alloc-0.0.16}/tests/test_probe_multi.py RENAMED Viewed

@@ -245,3 +245,170 @@ def test_active_gpu_fallback_not_used_without_expected():
         with patch("alloc.probe._read_child_env", return_value=None):
             result = _discover_gpu_indices(1000, mock, fallback_index=0)
     assert result == [0]  # Falls back to default
+# ── Early handle initialization for expected GPUs ──
+def test_early_init_opens_handles_for_expected_gpus():
+    """When expected_gpus > 1 and device_count >= expected, early-init should
+    open handles for all expected GPUs."""
+    mock_pynvml = MagicMock()
+    mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=2)
+    handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
+    mock_pynvml.nvmlDeviceGetHandleByIndex = MagicMock(side_effect=lambda i: handles_map[i])
+    # Simulate early-init logic from probe_command._monitor()
+    expected_gpus = 2
+    handles = [handles_map[0]]
+    if expected_gpus > 1:
+        device_count = mock_pynvml.nvmlDeviceGetCount()
+        if device_count >= expected_gpus:
+            early_handles = []
+            early_indices = []
+            for idx in range(device_count):
+                if len(early_handles) >= expected_gpus:
+                    break
+                h = mock_pynvml.nvmlDeviceGetHandleByIndex(idx)
+                early_handles.append(h)
+                early_indices.append(idx)
+            if len(early_handles) >= expected_gpus:
+                handles = early_handles
+    assert len(handles) == 2
+def test_early_init_skipped_when_fewer_devices():
+    """When device_count < expected_gpus, early-init should not change handles."""
+    mock_pynvml = MagicMock()
+    mock_pynvml.nvmlDeviceGetCount = MagicMock(return_value=1)
+    expected_gpus = 2
+    handles = [MagicMock(name="gpu0")]
+    original_handles = list(handles)
+    if expected_gpus > 1:
+        device_count = mock_pynvml.nvmlDeviceGetCount()
+        if device_count >= expected_gpus:
+            assert False, "Should not reach here"
+    assert len(handles) == 1
+def test_per_gpu_sampling_resilient_to_partial_failure():
+    """Per-GPU try/except: one GPU failure should not prevent others from
+    being sampled into per_gpu_peaks."""
+    mock_pynvml = MagicMock()
+    handles_map = {0: MagicMock(name="gpu0"), 1: MagicMock(name="gpu1")}
+    mem_ok = MagicMock()
+    mem_ok.total = 24 * 1024 * 1024 * 1024
+    mem_ok.used = 8000 * 1024 * 1024
+    def mem_info_side_effect(h):
+        if h == handles_map[1]:
+            raise RuntimeError("GPU 1 memory read failed")
+        return mem_ok
+    mock_pynvml.nvmlDeviceGetMemoryInfo = MagicMock(side_effect=mem_info_side_effect)
+    util = MagicMock()
+    util.gpu = 80
+    mock_pynvml.nvmlDeviceGetUtilizationRates = MagicMock(return_value=util)
+    mock_pynvml.nvmlDeviceGetPowerUsage = MagicMock(return_value=100_000)
+    # Simulate the per-GPU sampling loop
+    handles = [handles_map[0], handles_map[1]]
+    per_gpu_peaks = {}
+    vram_vals = []
+    for h in handles:
+        try:
+            mi = mock_pynvml.nvmlDeviceGetMemoryInfo(h)
+            vram_vals.append(mi.used / (1024 * 1024))
+        except Exception:
+            pass
+    for gi, vm in enumerate(vram_vals):
+        per_gpu_peaks[gi] = max(per_gpu_peaks.get(gi, 0.0), vm)
+    # GPU 0 tracked, GPU 1 skipped
+    assert 0 in per_gpu_peaks
+    assert per_gpu_peaks[0] > 0
+    assert len(vram_vals) == 1
+def test_stability_delayed_until_discovery_done():
+    """Stability check requires discovery_done=True."""
+    # Single GPU: expected=1, num_gpus=1 → done immediately
+    assert 1 >= 1  # num_gpus >= expected
+    # Multi GPU with early-init: expected=2, num_gpus=2 → done at sample 5
+    assert 2 >= 2
+    # Multi GPU, discovery incomplete: expected=4, found=2 → NOT done
+    assert not (2 >= 4)
+def test_per_gpu_peaks_to_result_list():
+    """per_gpu_peaks dict should convert to sorted list for ProbeResult."""
+    peaks = {0: 8000.5, 1: 12000.3}
+    result = [round(peaks[i], 1) for i in sorted(peaks)] if peaks else None
+    assert result == [8000.5, 12000.3]
+    empty = {}
+    result_empty = [round(empty[i], 1) for i in sorted(empty)] if empty else None
+    assert result_empty is None
+def test_fallback_process_map_from_early_init():
+    """When discovery exhausts all attempts without confirming GPUs,
+    process_map should be generated from early-init indices."""
+    early_init_indices = [0, 1]
+    process_map_ref = [None]
+    num_gpus_ref = [1]
+    discovery_attempts = 3
+    max_discovery_attempts = 3
+    expected_gpus = 2
+    # Simulate: discovery exhausted, never found >1 via PID matching
+    if num_gpus_ref[0] >= expected_gpus or discovery_attempts >= max_discovery_attempts:
+        discovery_done = True
+        if process_map_ref[0] is None and early_init_indices is not None:
+            process_map_ref[0] = [{"gpu_index": idx} for idx in early_init_indices]
+            num_gpus_ref[0] = len(early_init_indices)
+    assert process_map_ref[0] == [{"gpu_index": 0}, {"gpu_index": 1}]
+    assert num_gpus_ref[0] == 2
+def test_no_fallback_process_map_when_discovery_succeeded():
+    """When discovery already set process_map, fallback should not overwrite."""
+    early_init_indices = [0, 1]
+    process_map_ref = [[{"gpu_index": 2}, {"gpu_index": 3}]]  # discovery found GPUs 2,3
+    num_gpus_ref = [2]
+    # Simulate fallback condition
+    if process_map_ref[0] is None and early_init_indices is not None:
+        process_map_ref[0] = [{"gpu_index": idx} for idx in early_init_indices]
+    # Should keep discovery's result, not overwrite
+    assert process_map_ref[0] == [{"gpu_index": 2}, {"gpu_index": 3}]
+def test_early_init_does_not_set_num_gpus_ref():
+    """Early-init must NOT set num_gpus_ref — that would satisfy discovery_done
+    prematurely and prevent retries at samples 15/30."""
+    # Simulate the early-init code path
+    num_gpus_ref = [1]
+    expected_gpus = 2
+    device_count = 2  # enough devices
+    # Early-init opens handles but does NOT touch num_gpus_ref
+    early_init_indices = list(range(expected_gpus))
+    # num_gpus_ref should still be 1
+    assert num_gpus_ref[0] == 1
+    # So discovery_done check fails: 1 < 2
+    assert not (num_gpus_ref[0] >= expected_gpus)