PyPI - alloc - Versions diffs - 0.0.6__tar.gz → 0.0.7__tar.gz - Mend

alloc 0.0.6 (tar.gz) → 0.0.7 (tar.gz)

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (58) hide show

{alloc-0.0.6 → alloc-0.0.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.6
+Version: 0.0.7
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0

{alloc-0.0.6 → alloc-0.0.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "alloc"
-version = "0.0.6"
+version = "0.0.7"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
 license = "Apache-2.0"

{alloc-0.0.6 → alloc-0.0.7}/src/alloc/__init__.py RENAMED Viewed

@@ -5,9 +5,11 @@ from __future__ import annotations
 import warnings as _warnings
 _warnings.filterwarnings("ignore", category=FutureWarning, module="pynvml")
 _warnings.filterwarnings("ignore", category=DeprecationWarning, module="pynvml")
+_warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda")
+_warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
 del _warnings
-__version__ = "0.0.6"
+__version__ = "0.0.7"
 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback

{alloc-0.0.6 → alloc-0.0.7}/src/alloc/cli.py RENAMED Viewed

@@ -19,10 +19,12 @@ import sys
 import warnings
 from typing import List, Optional
-# Suppress noisy third-party warnings globally — pynvml deprecation and
-# urllib3 LibreSSL warnings clutter every CLI command on affected systems.
+# Suppress noisy third-party warnings globally — pynvml deprecation (emitted
+# from torch.cuda.__init__) and urllib3 LibreSSL warnings clutter CLI output.
 warnings.filterwarnings("ignore", category=FutureWarning, module="pynvml")
 warnings.filterwarnings("ignore", category=DeprecationWarning, module="pynvml")
+warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda")
+warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
 warnings.filterwarnings("ignore", message=".*LibreSSL.*", module="urllib3")
 import typer
@@ -75,6 +77,19 @@ def ghost(
             console.print(f"[dim]Tip: alloc ghost {script} --param-count-b 7.0[/dim]")
         raise typer.Exit(1)
+    if info.extraction_error:
+        if json_output:
+            _print_json({
+                "error": info.extraction_error,
+                "detail": info.extraction_detail,
+                "supported": False,
+            })
+        else:
+            console.print(f"[yellow]{info.extraction_detail}[/yellow]")
+            if info.extraction_error == "distributed_entrypoint":
+                console.print("[dim]Tip: alloc ghost model.py  (point to the file that defines your model)[/dim]")
+        raise typer.Exit(1)
     # Use dtype from execution if available, otherwise CLI flag
     resolved_dtype = info.dtype if info.method == "execution" else dtype
@@ -2099,12 +2114,32 @@ def scan(
     try:
         headers = {"Content-Type": "application/json"}
+        used_auth = bool(token)
         if token:
             headers["Authorization"] = f"Bearer {token}"
+            endpoint = "/scans"
+        else:
+            endpoint = "/scans/cli"
-        endpoint = "/scans" if token else "/scans/cli"
         with httpx.Client(timeout=30) as client:
             resp = client.post(f"{api_url}{endpoint}", json=payload, headers=headers)
+            # On 401 with a saved token: try refresh, then fall back to public endpoint
+            if resp.status_code == 401 and used_auth:
+                new_token = try_refresh_access_token()
+                if new_token:
+                    headers["Authorization"] = f"Bearer {new_token}"
+                    resp = client.post(f"{api_url}/scans", json=payload, headers=headers)
+                else:
+                    # Token refresh failed — fall back to unauthenticated scan
+                    console.print(
+                        "[yellow]Session expired — falling back to public scan "
+                        "(org fleet context unavailable). Run `alloc login` to restore.[/yellow]",
+                    )
+                    del headers["Authorization"]
+                    resp = client.post(f"{api_url}/scans/cli", json=payload, headers=headers)
             resp.raise_for_status()
             result = resp.json()

{alloc-0.0.6 → alloc-0.0.7}/src/alloc/model_extractor.py RENAMED Viewed

@@ -33,6 +33,8 @@ class ModelInfo:
     seq_length: Optional[int] = None
     activation_memory_bytes: Optional[int] = None
     activation_method: Optional[str] = None  # "traced" | None
+    extraction_error: Optional[str] = None   # "distributed_entrypoint" | None
+    extraction_detail: Optional[str] = None  # human-readable explanation
 def extract_model_info(
@@ -134,6 +136,24 @@ def _extract_via_subprocess(
                 activation_method=data.get("activation_method"),
             )
+        # Structured degradation for distributed scripts
+        if data.get("status") == "error":
+            error_msg = data.get("error", "")
+            _dist_keywords = ("init_process_group", "NCCL", "gloo", "distributed",
+                              "MASTER_ADDR", "MASTER_PORT", "RendezvousError")
+            if any(kw.lower() in error_msg.lower() for kw in _dist_keywords):
+                return ModelInfo(
+                    param_count=0,
+                    dtype="float16",
+                    model_name=None,
+                    method="execution",
+                    extraction_error="distributed_entrypoint",
+                    extraction_detail=(
+                        "Script requires a distributed runtime (e.g. torchrun). "
+                        "Run ghost on the model definition file instead of the launcher script."
+                    ),
+                )
         return None
     except subprocess.TimeoutExpired:

{alloc-0.0.6 → alloc-0.0.7}/src/alloc/probe.py RENAMED Viewed

@@ -18,6 +18,13 @@ from dataclasses import dataclass, field
 from enum import Enum
 from typing import List, Optional
+import warnings as _warnings
+_warnings.filterwarnings("ignore", category=FutureWarning, module="pynvml")
+_warnings.filterwarnings("ignore", category=DeprecationWarning, module="pynvml")
+_warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda")
+_warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
+del _warnings
 class StopReason(str, Enum):
     STABLE = "stable"

{alloc-0.0.6 → alloc-0.0.7}/src/alloc.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.6
+Version: 0.0.7
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0

{alloc-0.0.6 → alloc-0.0.7}/src/alloc.egg-info/SOURCES.txt RENAMED Viewed

@@ -43,11 +43,13 @@ tests/test_diagnosis_engine.py
 tests/test_diagnosis_rules.py
 tests/test_extractor_activation.py
 tests/test_ghost.py
+tests/test_ghost_degradation.py
 tests/test_init_from_org.py
 tests/test_interconnect.py
 tests/test_model_extractor.py
 tests/test_probe_hw.py
 tests/test_probe_multi.py
+tests/test_scan_auth.py
 tests/test_stability.py
 tests/test_upload.py
 tests/test_verdict.py

alloc-0.0.7/tests/test_ghost_degradation.py ADDED Viewed

@@ -0,0 +1,145 @@
+"""Tests for ghost structured degradation on distributed scripts."""
+from __future__ import annotations
+import json
+import os
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+from typer.testing import CliRunner
+from alloc.cli import app
+from alloc.model_extractor import ModelInfo, extract_model_info
+runner = CliRunner()
+def test_distributed_error_returns_structured_modelinfo():
+    """When extractor subprocess fails with a distributed keyword, return structured ModelInfo."""
+    sidecar_data = json.dumps({
+        "status": "error",
+        "error": "RuntimeError: torch.distributed.init_process_group requires MASTER_ADDR",
+    })
+    def _fake_subprocess_run(*args, **kwargs):
+        # Write the sidecar file
+        sidecar_path = args[0][3]  # [python, -m, alloc.extractor_runner, sidecar_path, script_path]
+        with open(sidecar_path, "w") as f:
+            f.write(sidecar_data)
+    # Create a dummy script
+    fd, script_path = tempfile.mkstemp(suffix=".py", prefix="alloc_test_dist_")
+    os.write(fd, b"import torch\ntorch.distributed.init_process_group('nccl')\n")
+    os.close(fd)
+    try:
+        with patch("subprocess.run", side_effect=_fake_subprocess_run):
+            info = extract_model_info(script_path)
+        assert info is not None
+        assert info.extraction_error == "distributed_entrypoint"
+        assert "distributed runtime" in info.extraction_detail
+        assert info.param_count == 0
+    finally:
+        os.unlink(script_path)
+def test_distributed_error_nccl_keyword():
+    """NCCL errors should be caught as distributed failures."""
+    sidecar_data = json.dumps({
+        "status": "error",
+        "error": "NCCL error: unhandled system error",
+    })
+    fd, script_path = tempfile.mkstemp(suffix=".py", prefix="alloc_test_nccl_")
+    os.write(fd, b"pass\n")
+    os.close(fd)
+    try:
+        def _fake_run(*args, **kwargs):
+            sidecar_path = args[0][3]
+            with open(sidecar_path, "w") as f:
+                f.write(sidecar_data)
+        with patch("subprocess.run", side_effect=_fake_run):
+            info = extract_model_info(script_path)
+        assert info is not None
+        assert info.extraction_error == "distributed_entrypoint"
+    finally:
+        os.unlink(script_path)
+def test_non_distributed_error_returns_none():
+    """Non-distributed errors should still return None (fall through to AST)."""
+    sidecar_data = json.dumps({
+        "status": "error",
+        "error": "ImportError: No module named 'custom_lib'",
+    })
+    fd, script_path = tempfile.mkstemp(suffix=".py", prefix="alloc_test_other_")
+    # Script with no from_pretrained so AST also returns None
+    os.write(fd, b"import custom_lib\n")
+    os.close(fd)
+    try:
+        def _fake_run(*args, **kwargs):
+            sidecar_path = args[0][3]
+            with open(sidecar_path, "w") as f:
+                f.write(sidecar_data)
+        with patch("subprocess.run", side_effect=_fake_run):
+            info = extract_model_info(script_path)
+        # Should be None because error is not distributed and AST won't find a model either
+        assert info is None
+    finally:
+        os.unlink(script_path)
+def test_ghost_cli_distributed_error_json(tmp_path: Path):
+    """ghost --json shows structured error for distributed scripts."""
+    script_path = tmp_path / "train_ddp.py"
+    script_path.write_text("import torch\ntorch.distributed.init_process_group('nccl')\n")
+    dist_info = ModelInfo(
+        param_count=0,
+        dtype="float16",
+        model_name=None,
+        method="execution",
+        extraction_error="distributed_entrypoint",
+        extraction_detail="Script requires a distributed runtime (e.g. torchrun). Run ghost on the model definition file instead of the launcher script.",
+    )
+    with patch("alloc.model_extractor.extract_model_info", return_value=dist_info):
+        result = runner.invoke(app, ["ghost", str(script_path), "--json"])
+    assert result.exit_code != 0
+    data = json.loads(result.output)
+    assert data["error"] == "distributed_entrypoint"
+    assert data["supported"] is False
+    assert "distributed runtime" in data["detail"]
+def test_ghost_cli_distributed_error_human(tmp_path: Path):
+    """ghost shows human-readable message with tip for distributed scripts."""
+    script_path = tmp_path / "train_ddp.py"
+    script_path.write_text("import torch\n")
+    dist_info = ModelInfo(
+        param_count=0,
+        dtype="float16",
+        model_name=None,
+        method="execution",
+        extraction_error="distributed_entrypoint",
+        extraction_detail="Script requires a distributed runtime (e.g. torchrun). Run ghost on the model definition file instead of the launcher script.",
+    )
+    with patch("alloc.model_extractor.extract_model_info", return_value=dist_info):
+        result = runner.invoke(app, ["ghost", str(script_path)])
+    assert result.exit_code != 0
+    assert "distributed runtime" in result.output
+    assert "model.py" in result.output  # tip about pointing to model file

alloc-0.0.7/tests/test_scan_auth.py ADDED Viewed

@@ -0,0 +1,142 @@
+"""Tests for scan command 401 retry + /scans/cli fallback."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+import httpx
+from typer.testing import CliRunner
+from alloc.cli import app
+runner = CliRunner()
+def _make_resp(status_code: int, body: dict, url: str = "https://api.example.com/scans"):
+    req = httpx.Request("POST", url)
+    return httpx.Response(
+        status_code,
+        request=req,
+        content=json.dumps(body).encode(),
+        headers={"content-type": "application/json"},
+    )
+def test_scan_401_refresh_retry(tmp_path: Path):
+    """On 401, refresh token and retry on /scans."""
+    resp_401 = _make_resp(401, {"detail": "unauthorized"})
+    resp_ok = _make_resp(200, {"vram_gb": 16.0, "configs": []})
+    mock_client = MagicMock()
+    mock_client.__enter__.return_value = mock_client
+    mock_client.__exit__.return_value = False
+    mock_client.post.side_effect = [resp_401, resp_ok]
+    cfg_file = tmp_path / ".alloc" / "config.json"
+    cfg_file.parent.mkdir(parents=True)
+    cfg_file.write_text(json.dumps({"token": "old-tok", "refresh_token": "rt"}))
+    env = {
+        "HOME": str(tmp_path),
+        "ALLOC_API_URL": "https://api.example.com",
+    }
+    with (
+        patch("httpx.Client", return_value=mock_client),
+        patch("alloc.cli.try_refresh_access_token", return_value="new-tok"),
+    ):
+        result = runner.invoke(app, ["scan", "--model", "llama-3-8b", "--json"], env=env)
+    assert result.exit_code == 0
+    assert mock_client.post.call_count == 2
+    # Second call should use refreshed token
+    second_call = mock_client.post.call_args_list[1]
+    assert "Bearer new-tok" in str(second_call)
+def test_scan_401_refresh_fails_fallback_public(tmp_path: Path):
+    """On 401 + refresh failure, fall back to /scans/cli with warning."""
+    resp_401 = _make_resp(401, {"detail": "unauthorized"})
+    resp_ok = _make_resp(200, {"vram_gb": 16.0, "configs": []},
+                          url="https://api.example.com/scans/cli")
+    mock_client = MagicMock()
+    mock_client.__enter__.return_value = mock_client
+    mock_client.__exit__.return_value = False
+    mock_client.post.side_effect = [resp_401, resp_ok]
+    cfg_file = tmp_path / ".alloc" / "config.json"
+    cfg_file.parent.mkdir(parents=True)
+    cfg_file.write_text(json.dumps({"token": "old-tok", "refresh_token": "rt"}))
+    env = {
+        "HOME": str(tmp_path),
+        "ALLOC_API_URL": "https://api.example.com",
+    }
+    with (
+        patch("httpx.Client", return_value=mock_client),
+        patch("alloc.cli.try_refresh_access_token", return_value=None),
+    ):
+        result = runner.invoke(app, ["scan", "--model", "llama-3-8b", "--json"], env=env)
+    assert result.exit_code == 0
+    assert mock_client.post.call_count == 2
+    # Second call should hit /scans/cli
+    second_url = str(mock_client.post.call_args_list[1])
+    assert "/scans/cli" in second_url
+def test_scan_401_fallback_warns_about_dropped_features(tmp_path: Path):
+    """Fallback to public scan warns user about lost org context."""
+    resp_401 = _make_resp(401, {"detail": "unauthorized"})
+    resp_ok = _make_resp(200, {"vram_gb": 16.0, "configs": []})
+    mock_client = MagicMock()
+    mock_client.__enter__.return_value = mock_client
+    mock_client.__exit__.return_value = False
+    mock_client.post.side_effect = [resp_401, resp_ok]
+    cfg_file = tmp_path / ".alloc" / "config.json"
+    cfg_file.parent.mkdir(parents=True)
+    cfg_file.write_text(json.dumps({"token": "old-tok", "refresh_token": "rt"}))
+    env = {
+        "HOME": str(tmp_path),
+        "ALLOC_API_URL": "https://api.example.com",
+    }
+    with (
+        patch("httpx.Client", return_value=mock_client),
+        patch("alloc.cli.try_refresh_access_token", return_value=None),
+    ):
+        # Non-JSON mode to see the warning message
+        result = runner.invoke(app, ["scan", "--model", "llama-3-8b"], env=env)
+    assert result.exit_code == 0
+    assert "expired" in result.output.lower() or "falling back" in result.output.lower()
+def test_scan_no_token_uses_public_directly(tmp_path: Path):
+    """Without a token, scan goes directly to /scans/cli."""
+    resp_ok = _make_resp(200, {"vram_gb": 16.0, "configs": []})
+    mock_client = MagicMock()
+    mock_client.__enter__.return_value = mock_client
+    mock_client.__exit__.return_value = False
+    mock_client.post.return_value = resp_ok
+    env = {
+        "HOME": str(tmp_path),
+        "ALLOC_API_URL": "https://api.example.com",
+    }
+    with patch("httpx.Client", return_value=mock_client):
+        result = runner.invoke(app, ["scan", "--model", "llama-3-8b", "--json"], env=env)
+    assert result.exit_code == 0
+    assert mock_client.post.call_count == 1
+    call_url = str(mock_client.post.call_args_list[0])
+    assert "/scans/cli" in call_url