alloc 0.0.8__tar.gz → 0.0.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {alloc-0.0.8 → alloc-0.0.10}/PKG-INFO +2 -2
- {alloc-0.0.8 → alloc-0.0.10}/README.md +1 -1
- {alloc-0.0.8 → alloc-0.0.10}/pyproject.toml +1 -1
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/__init__.py +1 -1
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/browser_auth.py +3 -2
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/callbacks.py +4 -1
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/cli.py +23 -6
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/diagnosis_engine.py +1 -1
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/extractor_runner.py +24 -1
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/probe.py +13 -2
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc.egg-info/PKG-INFO +2 -2
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_auth.py +29 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_callbacks.py +68 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_diagnosis_engine.py +12 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_probe_multi.py +55 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_topology_strategy.py +9 -3
- {alloc-0.0.8 → alloc-0.0.10}/setup.cfg +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/artifact_loader.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/artifact_writer.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/catalog/__init__.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/catalog/default_rate_card.json +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/catalog/gpus.v1.json +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/code_analyzer.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/config.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/context.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/diagnosis_display.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/diagnosis_rules.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/display.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/ghost.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/model_extractor.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/model_registry.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/stability.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/upload.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc/yaml_config.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc.egg-info/SOURCES.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc.egg-info/dependency_links.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc.egg-info/entry_points.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc.egg-info/requires.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/src/alloc.egg-info/top_level.txt +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_artifact.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_artifact_loader.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_catalog.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_cli.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_code_analyzer.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_context.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_diagnose_cli.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_diagnosis_rules.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_extractor_activation.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_ghost.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_ghost_degradation.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_init_from_org.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_interconnect.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_model_extractor.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_probe_hw.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_scan_auth.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_stability.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_upload.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_verdict.py +0 -0
- {alloc-0.0.8 → alloc-0.0.10}/tests/test_yaml_config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: alloc
|
|
3
|
-
Version: 0.0.8
|
|
3
|
+
Version: 0.0.10
|
|
4
4
|
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
5
5
|
Author-email: Alloc Labs <hello@alloclabs.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -40,7 +40,7 @@ alloc run python train.py
|
|
|
40
40
|
```
|
|
41
41
|
|
|
42
42
|
```
|
|
43
|
-
alloc v0.0.8 — Calibrate
|
|
43
|
+
alloc v0.0.9 — Calibrate
|
|
44
44
|
|
|
45
45
|
Run Summary
|
|
46
46
|
Peak VRAM 31.2 GB / 40.0 GB (A100)
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "alloc"
|
|
7
|
-
version = "0.0.8"
|
|
7
|
+
version = "0.0.10"
|
|
8
8
|
description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "Apache-2.0"
|
|
@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
|
|
|
9
9
|
_warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
|
|
10
10
|
del _warnings
|
|
11
11
|
|
|
12
|
-
__version__ = "0.0.8"
|
|
12
|
+
__version__ = "0.0.10"
|
|
13
13
|
|
|
14
14
|
from alloc.ghost import ghost, GhostReport
|
|
15
15
|
from alloc.callbacks import AllocCallback as HuggingFaceCallback
|
|
@@ -121,8 +121,9 @@ def browser_login(
|
|
|
121
121
|
})
|
|
122
122
|
authorize_url = f"{supabase_url}/auth/v1/authorize?{authorize_params}"
|
|
123
123
|
|
|
124
|
-
# Bind to
|
|
125
|
-
|
|
124
|
+
# Bind to 127.0.0.1 only — the auth callback server should never be
|
|
125
|
+
# reachable from the network.
|
|
126
|
+
server = HTTPServer(("127.0.0.1", port), _CallbackHandler)
|
|
126
127
|
server.auth_code = None # type: ignore[attr-defined]
|
|
127
128
|
server.auth_error = None # type: ignore[attr-defined]
|
|
128
129
|
server.timeout = 1 # poll interval for handle_request()
|
|
@@ -501,7 +501,10 @@ class _NvmlMonitor:
|
|
|
501
501
|
|
|
502
502
|
self._hw_context["nvlink_active_links"] = active_links
|
|
503
503
|
except Exception:
|
|
504
|
-
|
|
504
|
+
# NVLink detection code failed after entering the try block.
|
|
505
|
+
# We know NVML is functional (handles exist), so fall back to
|
|
506
|
+
# generic "nvlink" rather than leaving interconnect_type unset.
|
|
507
|
+
self._hw_context["interconnect_type"] = "nvlink"
|
|
505
508
|
|
|
506
509
|
self._thread = threading.Thread(target=self._sample_loop, daemon=True)
|
|
507
510
|
self._thread.start()
|
|
@@ -2400,23 +2400,33 @@ def whoami(
|
|
|
2400
2400
|
profile = _get("/profile")
|
|
2401
2401
|
fleet = _get("/gpu-fleet")
|
|
2402
2402
|
else:
|
|
2403
|
-
|
|
2403
|
+
# whoami is a status command — report structured result, exit 0
|
|
2404
|
+
if e.response.status_code == 401:
|
|
2405
|
+
out["token_status"] = "expired"
|
|
2406
|
+
else:
|
|
2407
|
+
out["token_status"] = "error"
|
|
2404
2408
|
out["error"] = f"API error {e.response.status_code}"
|
|
2409
|
+
if json_output:
|
|
2405
2410
|
_print_json(out)
|
|
2406
2411
|
else:
|
|
2407
|
-
|
|
2412
|
+
if e.response.status_code == 401:
|
|
2413
|
+
console.print("[yellow]Token expired.[/yellow]")
|
|
2414
|
+
else:
|
|
2415
|
+
console.print(f"[red]API error {e.response.status_code}[/red]")
|
|
2408
2416
|
console.print("[dim]Run: alloc login[/dim]")
|
|
2409
|
-
|
|
2417
|
+
return
|
|
2410
2418
|
except httpx.ConnectError:
|
|
2419
|
+
out["token_status"] = "unreachable"
|
|
2420
|
+
out["error"] = f"Cannot connect to {api_url}"
|
|
2411
2421
|
if json_output:
|
|
2412
|
-
out["error"] = f"Cannot connect to {api_url}"
|
|
2413
2422
|
_print_json(out)
|
|
2414
2423
|
else:
|
|
2415
2424
|
console.print(f"[red]Cannot connect to {api_url}[/red]")
|
|
2416
|
-
|
|
2425
|
+
return
|
|
2417
2426
|
|
|
2418
2427
|
# API validated the token — now we know login is real
|
|
2419
2428
|
out["logged_in"] = True
|
|
2429
|
+
out["token_status"] = "valid"
|
|
2420
2430
|
|
|
2421
2431
|
gpus = fleet.get("gpus") or []
|
|
2422
2432
|
fleet_count = len([g for g in gpus if g.get("fleet_status") == "in_fleet"])
|
|
@@ -3565,7 +3575,14 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
|
|
|
3565
3575
|
strategy = "pp+dp" if (dp is not None and dp > 1) else "pp"
|
|
3566
3576
|
elif dp is not None and dp > 1:
|
|
3567
3577
|
strategy = "ddp"
|
|
3568
|
-
|
|
3578
|
+
elif strategy is None and num_gpus_detected > 1 and not has_tp and not has_pp:
|
|
3579
|
+
# Multiple GPUs detected via NVML with no TP/PP env vars →
|
|
3580
|
+
# DDP is PyTorch's default and the only realistic inference.
|
|
3581
|
+
# This is NOT the old `or "ddp"` — it only fires when probe
|
|
3582
|
+
# actually observed multiple GPU processes.
|
|
3583
|
+
strategy = "ddp"
|
|
3584
|
+
if dp is None:
|
|
3585
|
+
dp = num_gpus_detected
|
|
3569
3586
|
|
|
3570
3587
|
return {
|
|
3571
3588
|
"num_nodes": nnodes or 1,
|
|
@@ -403,7 +403,7 @@ def _estimate_model_params(model_name: str) -> Optional[float]:
|
|
|
403
403
|
"whisper-large": 1.55,
|
|
404
404
|
}
|
|
405
405
|
|
|
406
|
-
for key, params in estimates.items():
|
|
406
|
+
for key, params in sorted(estimates.items(), key=lambda x: len(x[0]), reverse=True):
|
|
407
407
|
if key in name:
|
|
408
408
|
return params
|
|
409
409
|
|
|
@@ -281,7 +281,30 @@ def main():
|
|
|
281
281
|
"activation_method": activation_result.get("activation_method"),
|
|
282
282
|
}
|
|
283
283
|
else:
|
|
284
|
-
|
|
284
|
+
# No model found — check if this is a distributed training script
|
|
285
|
+
# that hides the model inside __main__ guard or main()
|
|
286
|
+
_is_dist = False
|
|
287
|
+
try:
|
|
288
|
+
import torch.distributed as _dist_mod
|
|
289
|
+
if _dist_mod.is_initialized():
|
|
290
|
+
_is_dist = True
|
|
291
|
+
except Exception:
|
|
292
|
+
pass
|
|
293
|
+
if not _is_dist:
|
|
294
|
+
# Check if module imported distributed primitives
|
|
295
|
+
for attr_name in dir(module):
|
|
296
|
+
try:
|
|
297
|
+
obj = getattr(module, attr_name)
|
|
298
|
+
mod_name = getattr(obj, "__module__", "") or ""
|
|
299
|
+
if "torch.distributed" in mod_name or "torch.nn.parallel" in mod_name:
|
|
300
|
+
_is_dist = True
|
|
301
|
+
break
|
|
302
|
+
except Exception:
|
|
303
|
+
continue
|
|
304
|
+
if _is_dist:
|
|
305
|
+
result = {"status": "error_distributed", "error": "no model found — script uses distributed training"}
|
|
306
|
+
else:
|
|
307
|
+
result = {"status": "no_model"}
|
|
285
308
|
|
|
286
309
|
with open(sidecar_path, "w") as f:
|
|
287
310
|
json.dump(result, f)
|
|
@@ -215,8 +215,19 @@ def _discover_gpu_indices(proc_pid, pynvml, fallback_index=0, expected_gpus=None
|
|
|
215
215
|
if 0 <= idx < device_count:
|
|
216
216
|
visible_physical.append(idx)
|
|
217
217
|
except ValueError:
|
|
218
|
-
|
|
219
|
-
|
|
218
|
+
# UUID-style device identifiers — try NVML UUID matching
|
|
219
|
+
try:
|
|
220
|
+
for phys_idx in range(device_count):
|
|
221
|
+
handle = pynvml.nvmlDeviceGetHandleByIndex(phys_idx)
|
|
222
|
+
uuid = pynvml.nvmlDeviceGetUUID(handle)
|
|
223
|
+
if isinstance(uuid, bytes):
|
|
224
|
+
uuid = uuid.decode("utf-8", errors="replace")
|
|
225
|
+
if d in uuid:
|
|
226
|
+
visible_physical.append(phys_idx)
|
|
227
|
+
break
|
|
228
|
+
except Exception:
|
|
229
|
+
visible_physical = list(range(device_count))
|
|
230
|
+
break
|
|
220
231
|
search_indices = visible_physical if visible_physical else list(range(device_count))
|
|
221
232
|
else:
|
|
222
233
|
search_indices = list(range(device_count))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: alloc
|
|
3
|
-
Version: 0.0.8
|
|
3
|
+
Version: 0.0.10
|
|
4
4
|
Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
|
|
5
5
|
Author-email: Alloc Labs <hello@alloclabs.com>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -40,7 +40,7 @@ alloc run python train.py
|
|
|
40
40
|
```
|
|
41
41
|
|
|
42
42
|
```
|
|
43
|
-
alloc v0.0.8 — Calibrate
|
|
43
|
+
alloc v0.0.9 — Calibrate
|
|
44
44
|
|
|
45
45
|
Run Summary
|
|
46
46
|
Peak VRAM 31.2 GB / 40.0 GB (A100)
|
|
@@ -68,6 +68,34 @@ def test_whoami_not_logged_in_json(tmp_path: Path):
|
|
|
68
68
|
assert data["api_url"] == "https://api.example.com"
|
|
69
69
|
|
|
70
70
|
|
|
71
|
+
def test_whoami_stale_token_json(tmp_path: Path):
|
|
72
|
+
"""Stale token should exit 0 with token_status: expired."""
|
|
73
|
+
mock_resp = MagicMock()
|
|
74
|
+
mock_resp.status_code = 401
|
|
75
|
+
mock_resp.raise_for_status.side_effect = httpx.HTTPStatusError(
|
|
76
|
+
"Unauthorized", request=MagicMock(), response=mock_resp,
|
|
77
|
+
)
|
|
78
|
+
mock_client = MagicMock()
|
|
79
|
+
mock_client.__enter__.return_value = mock_client
|
|
80
|
+
mock_client.__exit__.return_value = False
|
|
81
|
+
mock_client.get.return_value = mock_resp
|
|
82
|
+
|
|
83
|
+
env = {
|
|
84
|
+
"HOME": str(tmp_path),
|
|
85
|
+
"ALLOC_API_URL": "https://api.example.com",
|
|
86
|
+
"ALLOC_TOKEN": "stale-token",
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
with patch("httpx.Client", return_value=mock_client), \
|
|
90
|
+
patch("alloc.cli.try_refresh_access_token", return_value=None):
|
|
91
|
+
result = runner.invoke(app, ["whoami", "--json"], env=env)
|
|
92
|
+
|
|
93
|
+
assert result.exit_code == 0
|
|
94
|
+
data = json.loads(result.output)
|
|
95
|
+
assert data["logged_in"] is False
|
|
96
|
+
assert data["token_status"] == "expired"
|
|
97
|
+
|
|
98
|
+
|
|
71
99
|
def test_whoami_logged_in_json(tmp_path: Path):
|
|
72
100
|
profile_resp = MagicMock()
|
|
73
101
|
profile_resp.raise_for_status.return_value = None
|
|
@@ -110,6 +138,7 @@ def test_whoami_logged_in_json(tmp_path: Path):
|
|
|
110
138
|
assert result.exit_code == 0
|
|
111
139
|
data = json.loads(result.output)
|
|
112
140
|
assert data["logged_in"] is True
|
|
141
|
+
assert data["token_status"] == "valid"
|
|
113
142
|
assert data["token_source"] == "env"
|
|
114
143
|
assert data["email"] == "user@example.com"
|
|
115
144
|
assert data["fleet_count"] == 1
|
|
@@ -1189,3 +1189,71 @@ class TestNvmlMonitorThreadSafety:
|
|
|
1189
1189
|
assert len(probe["per_rank_peak_vram_mb"]) == 2
|
|
1190
1190
|
for peak in probe["per_rank_peak_vram_mb"]:
|
|
1191
1191
|
assert peak > 0
|
|
1192
|
+
|
|
1193
|
+
|
|
1194
|
+
class TestNvmlMonitorNvlinkFallback:
|
|
1195
|
+
def test_nvlink_detection_failure_sets_nvlink_fallback(self):
|
|
1196
|
+
"""When the outer NVLink detection block raises, fall back to 'nvlink'.
|
|
1197
|
+
|
|
1198
|
+
We trigger this by making nvmlDeviceGetNvLinkState raise on the first
|
|
1199
|
+
call (inner except breaks the loop → active_links=0 → 'pcie'), and
|
|
1200
|
+
then making the active_links comparison itself blow up. The simplest
|
|
1201
|
+
trigger is having _gpu_handles[0] raise IndexError (empty list after
|
|
1202
|
+
the early-return guard).
|
|
1203
|
+
"""
|
|
1204
|
+
mock_pynvml = MagicMock()
|
|
1205
|
+
mock_pynvml.nvmlInit.return_value = None
|
|
1206
|
+
mock_pynvml.nvmlShutdown.return_value = None
|
|
1207
|
+
mock_pynvml.nvmlDeviceGetCount.return_value = 2
|
|
1208
|
+
mock_pynvml.nvmlDeviceGetName.return_value = "NVIDIA A100-SXM4-80GB"
|
|
1209
|
+
mem = SimpleNamespace(total=80 * 1024**3, used=1 * 1024**3)
|
|
1210
|
+
mock_pynvml.nvmlDeviceGetMemoryInfo.return_value = mem
|
|
1211
|
+
mock_pynvml.nvmlSystemGetDriverVersion.return_value = "535"
|
|
1212
|
+
mock_pynvml.nvmlSystemGetCudaDriverVersion.return_value = 12000
|
|
1213
|
+
mock_pynvml.nvmlDeviceGetCudaComputeCapability.return_value = (8, 0)
|
|
1214
|
+
util = SimpleNamespace(gpu=75, memory=60)
|
|
1215
|
+
mock_pynvml.nvmlDeviceGetUtilizationRates.return_value = util
|
|
1216
|
+
mock_pynvml.nvmlDeviceGetPowerUsage.return_value = 300000
|
|
1217
|
+
|
|
1218
|
+
# Use a handle list that passes the `if not self._gpu_handles` guard
|
|
1219
|
+
# (it's truthy) but raises IndexError on `self._gpu_handles[0]`.
|
|
1220
|
+
class BadHandleList:
|
|
1221
|
+
"""Truthy but raises on index access."""
|
|
1222
|
+
def __bool__(self):
|
|
1223
|
+
return True
|
|
1224
|
+
def __len__(self):
|
|
1225
|
+
return 2
|
|
1226
|
+
def __iter__(self):
|
|
1227
|
+
return iter([])
|
|
1228
|
+
def __getitem__(self, idx):
|
|
1229
|
+
raise IndexError("corrupted handle list")
|
|
1230
|
+
|
|
1231
|
+
with patch("alloc.callbacks._try_import_pynvml", return_value=mock_pynvml):
|
|
1232
|
+
monitor = _NvmlMonitor()
|
|
1233
|
+
|
|
1234
|
+
# Replace handles after __init__ but before start().
|
|
1235
|
+
# start() will re-populate from nvmlDeviceGetCount, so we also need to
|
|
1236
|
+
# make the handle-building loop produce our bad list. We do this by
|
|
1237
|
+
# patching nvmlDeviceGetHandleByIndex to raise, so _gpu_handles stays
|
|
1238
|
+
# empty after the try/except in handle building. But that triggers
|
|
1239
|
+
# the early return. Instead, we patch _gpu_handles AFTER start()
|
|
1240
|
+
# builds them but BEFORE NVLink detection runs. We achieve this by
|
|
1241
|
+
# having nvmlDeviceGetCudaComputeCapability (the last hw-context call
|
|
1242
|
+
# before NVLink detection) swap in the bad handles as a side effect.
|
|
1243
|
+
original_sm = mock_pynvml.nvmlDeviceGetCudaComputeCapability
|
|
1244
|
+
|
|
1245
|
+
def swap_handles_then_return_sm(handle):
|
|
1246
|
+
monitor._gpu_handles = BadHandleList()
|
|
1247
|
+
return (8, 0)
|
|
1248
|
+
|
|
1249
|
+
mock_pynvml.nvmlDeviceGetCudaComputeCapability = MagicMock(
|
|
1250
|
+
side_effect=swap_handles_then_return_sm
|
|
1251
|
+
)
|
|
1252
|
+
|
|
1253
|
+
monitor.start()
|
|
1254
|
+
import time
|
|
1255
|
+
time.sleep(0.02)
|
|
1256
|
+
monitor.stop()
|
|
1257
|
+
|
|
1258
|
+
hw, _ = monitor.get_results()
|
|
1259
|
+
assert hw.get("interconnect_type") == "nvlink"
|
|
@@ -359,3 +359,15 @@ def test_estimate_model_params_known_vision_model():
|
|
|
359
359
|
|
|
360
360
|
result = _estimate_model_params("stable-diffusion")
|
|
361
361
|
assert result == 0.865
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def test_estimate_model_params_gpt2_medium_prefix_match():
|
|
365
|
+
"""gpt2-medium-finetuned should match gpt2-medium (0.355), not gpt2 (0.124)."""
|
|
366
|
+
result = _estimate_model_params("gpt2-medium-finetuned")
|
|
367
|
+
assert result == 0.355
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def test_estimate_model_params_gpt2_alone():
|
|
371
|
+
"""Plain gpt2 should still match 0.124."""
|
|
372
|
+
result = _estimate_model_params("gpt2")
|
|
373
|
+
assert result == 0.124
|
|
@@ -158,6 +158,61 @@ def test_parse_plain_python():
|
|
|
158
158
|
assert _parse_launcher_gpu_count(["python", "train.py"]) is None
|
|
159
159
|
|
|
160
160
|
|
|
161
|
+
# ── CVD UUID resolution ──
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def test_cvd_uuid_resolves_to_correct_index():
|
|
165
|
+
"""UUID-style CUDA_VISIBLE_DEVICES should resolve to the matching physical GPU index."""
|
|
166
|
+
mock = _mock_pynvml_multi_gpu(
|
|
167
|
+
proc_pid=1000,
|
|
168
|
+
gpu_process_map={0: [1000], 1: [], 2: []},
|
|
169
|
+
)
|
|
170
|
+
mock.nvmlDeviceGetCount.return_value = 3
|
|
171
|
+
|
|
172
|
+
# Set up UUID resolution: GPU 0 → UUID-A, GPU 1 → UUID-B, GPU 2 → UUID-C
|
|
173
|
+
uuid_map = {0: "GPU-aaaa-1111", 1: "GPU-bbbb-2222", 2: "GPU-cccc-3333"}
|
|
174
|
+
handles = {}
|
|
175
|
+
for idx in range(3):
|
|
176
|
+
handles[idx] = MagicMock(name=f"handle_{idx}")
|
|
177
|
+
|
|
178
|
+
def get_handle(idx):
|
|
179
|
+
return handles[idx]
|
|
180
|
+
|
|
181
|
+
def get_uuid(handle):
|
|
182
|
+
for idx, h in handles.items():
|
|
183
|
+
if handle == h:
|
|
184
|
+
return uuid_map[idx]
|
|
185
|
+
return "GPU-unknown"
|
|
186
|
+
|
|
187
|
+
mock.nvmlDeviceGetHandleByIndex = MagicMock(side_effect=get_handle)
|
|
188
|
+
mock.nvmlDeviceGetUUID = MagicMock(side_effect=get_uuid)
|
|
189
|
+
|
|
190
|
+
# CVD set to GPU 2's UUID
|
|
191
|
+
with patch("alloc.probe._get_child_pids", return_value=[]):
|
|
192
|
+
with patch.dict("os.environ", {"CUDA_VISIBLE_DEVICES": "GPU-cccc-3333"}):
|
|
193
|
+
result = _discover_gpu_indices(1000, mock, fallback_index=0)
|
|
194
|
+
# Should only search GPU index 2
|
|
195
|
+
assert 2 in result or result == [0] # either found on idx 2, or fallback if no PID match
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def test_cvd_invalid_uuid_falls_back_to_all_gpus():
|
|
199
|
+
"""Invalid UUID that doesn't match any device should fall back to all GPUs."""
|
|
200
|
+
mock = _mock_pynvml_multi_gpu(
|
|
201
|
+
proc_pid=1000,
|
|
202
|
+
gpu_process_map={0: [1000], 1: []},
|
|
203
|
+
)
|
|
204
|
+
mock.nvmlDeviceGetCount.return_value = 2
|
|
205
|
+
|
|
206
|
+
# UUID lookup raises for all devices
|
|
207
|
+
mock.nvmlDeviceGetUUID = MagicMock(side_effect=RuntimeError("no UUID support"))
|
|
208
|
+
|
|
209
|
+
with patch("alloc.probe._get_child_pids", return_value=[]):
|
|
210
|
+
with patch.dict("os.environ", {"CUDA_VISIBLE_DEVICES": "GPU-nonexistent"}):
|
|
211
|
+
result = _discover_gpu_indices(1000, mock, fallback_index=0)
|
|
212
|
+
# Should fall back to searching all GPUs and find PID 1000 on GPU 0
|
|
213
|
+
assert 0 in result
|
|
214
|
+
|
|
215
|
+
|
|
161
216
|
def test_parse_torch_distributed_launch():
|
|
162
217
|
assert _parse_launcher_gpu_count([
|
|
163
218
|
"python", "-m", "torch.distributed.launch", "--nproc_per_node=2", "train.py"
|
|
@@ -18,9 +18,15 @@ class TestStrategyInference:
|
|
|
18
18
|
num_gpus_detected=num_gpus,
|
|
19
19
|
)
|
|
20
20
|
|
|
21
|
-
def
|
|
22
|
-
"""When no degree env vars
|
|
23
|
-
result = self._topo({})
|
|
21
|
+
def test_no_degrees_multi_gpu_infers_ddp(self):
|
|
22
|
+
"""When no degree env vars but multiple GPUs detected, infer DDP."""
|
|
23
|
+
result = self._topo({}, num_gpus=4)
|
|
24
|
+
assert result["strategy"] == "ddp"
|
|
25
|
+
assert result["dp_degree"] == 4
|
|
26
|
+
|
|
27
|
+
def test_single_gpu_no_degrees_strategy_none(self):
|
|
28
|
+
"""Single GPU with no degrees → strategy stays None."""
|
|
29
|
+
result = self._topo({}, num_gpus=1)
|
|
24
30
|
assert result["strategy"] is None
|
|
25
31
|
|
|
26
32
|
def test_dp_only_is_ddp(self):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|