alloc 0.0.7.tar.gz → 0.0.9.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {alloc-0.0.7 → alloc-0.0.9}/PKG-INFO +2 -2
- {alloc-0.0.7 → alloc-0.0.9}/README.md +1 -1
- {alloc-0.0.7 → alloc-0.0.9}/pyproject.toml +1 -1
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/__init__.py +1 -1
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/cli.py +44 -7
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/extractor_runner.py +29 -2
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/model_extractor.py +14 -5
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/probe.py +1 -1
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/upload.py +1 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc.egg-info/PKG-INFO +2 -2
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc.egg-info/SOURCES.txt +1 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_auth.py +29 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_cli.py +1 -1
- alloc-0.0.9/tests/test_topology_strategy.py +93 -0
- {alloc-0.0.7 → alloc-0.0.9}/setup.cfg +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/artifact_loader.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/artifact_writer.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/browser_auth.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/callbacks.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/catalog/__init__.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/catalog/default_rate_card.json +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/catalog/gpus.v1.json +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/code_analyzer.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/config.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/context.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/diagnosis_display.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/diagnosis_engine.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/diagnosis_rules.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/display.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/ghost.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/model_registry.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/stability.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc/yaml_config.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc.egg-info/dependency_links.txt +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc.egg-info/entry_points.txt +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc.egg-info/requires.txt +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/src/alloc.egg-info/top_level.txt +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_artifact.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_artifact_loader.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_callbacks.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_catalog.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_code_analyzer.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_context.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_diagnose_cli.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_diagnosis_engine.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_diagnosis_rules.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_extractor_activation.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_ghost.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_ghost_degradation.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_init_from_org.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_interconnect.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_model_extractor.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_probe_hw.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_probe_multi.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_scan_auth.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_stability.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_upload.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_verdict.py +0 -0
- {alloc-0.0.7 → alloc-0.0.9}/tests/test_yaml_config.py +0 -0
{alloc-0.0.7 → alloc-0.0.9}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.7
+Version: 0.0.9
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
@@ -40,7 +40,7 @@ alloc run python train.py
 ```
 
 ```
-alloc v0.0.
+alloc v0.0.8 — Calibrate
 
 Run Summary
 Peak VRAM 31.2 GB / 40.0 GB (A100)
````
{alloc-0.0.7 → alloc-0.0.9}/pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "alloc"
-version = "0.0.7"
+version = "0.0.9"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
 license = "Apache-2.0"
```
{alloc-0.0.7 → alloc-0.0.9}/src/alloc/__init__.py

```diff
@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
 _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
 del _warnings
 
-__version__ = "0.0.7"
+__version__ = "0.0.9"
 
 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback
```
{alloc-0.0.7 → alloc-0.0.9}/src/alloc/cli.py

```diff
@@ -455,7 +455,9 @@ def run(
         "tp_degree": topology.get("tp_degree"),
         "pp_degree": topology.get("pp_degree"),
         "dp_degree": topology.get("dp_degree"),
+        "strategy": topology.get("strategy"),
         "interconnect_type": topology.get("interconnect_type"),
+        "process_map": result.process_map,
         "objective": objective,
         "max_budget_hourly": max_budget_hourly,
         "command": " ".join(command),
```
```diff
@@ -2368,7 +2370,7 @@ def whoami(
 
     out = {
         "api_url": api_url,
-        "logged_in":
+        "logged_in": False,
         "token_source": token_source if token else None,
     }
 
```
```diff
@@ -2398,20 +2400,33 @@ def whoami(
             profile = _get("/profile")
             fleet = _get("/gpu-fleet")
         else:
-
+            # whoami is a status command — report structured result, exit 0
+            if e.response.status_code == 401:
+                out["token_status"] = "expired"
+            else:
+                out["token_status"] = "error"
             out["error"] = f"API error {e.response.status_code}"
+            if json_output:
                 _print_json(out)
             else:
-
+                if e.response.status_code == 401:
+                    console.print("[yellow]Token expired.[/yellow]")
+                else:
+                    console.print(f"[red]API error {e.response.status_code}[/red]")
                 console.print("[dim]Run: alloc login[/dim]")
-
+            return
     except httpx.ConnectError:
+        out["token_status"] = "unreachable"
+        out["error"] = f"Cannot connect to {api_url}"
         if json_output:
-            out["error"] = f"Cannot connect to {api_url}"
             _print_json(out)
         else:
             console.print(f"[red]Cannot connect to {api_url}[/red]")
-
+        return
+
+    # API validated the token — now we know login is real
+    out["logged_in"] = True
+    out["token_status"] = "valid"
 
     gpus = fleet.get("gpus") or []
     fleet_count = len([g for g in gpus if g.get("fleet_status") == "in_fleet"])
```
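After this hunk, `whoami` treats a rejected or unreachable API as a reportable status rather than a hard failure: it fills in `token_status` and `error`, prints, and returns with exit code 0. A sketch of the resulting `alloc whoami --json` payload for an expired token; values are placeholders, and the keys are the ones set in the hunk and asserted in the test_auth.py changes further down:

```python
# Illustrative payload only; not emitted verbatim by the package.
stale_token_output = {
    "api_url": "https://api.example.com",
    "logged_in": False,
    "token_source": "env",
    "token_status": "expired",  # "valid" | "expired" | "error" | "unreachable"
    "error": "API error 401",
}
```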
```diff
@@ -3087,7 +3102,7 @@ def status(
 
     out = {
         "version": __version__,
-        "
+        "has_token": bool(token),
         "api_url": api_url,
         "artifact": None,
         "dashboard_url": None,
```
```diff
@@ -3548,6 +3563,27 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
     if interconnect not in ("pcie", "nvlink", "nvlink_switch", "nvlink_p2p", "infiniband", "unknown"):
         interconnect = "unknown"
 
+    # Infer strategy from degrees — only when evidence exists
+    strategy = None
+    has_tp = tp is not None and tp > 1
+    has_pp = pp is not None and pp > 1
+    if has_tp and has_pp:
+        strategy = "tp+pp+dp"
+    elif has_tp:
+        strategy = "tp+dp" if (dp is not None and dp > 1) else "tp"
+    elif has_pp:
+        strategy = "pp+dp" if (dp is not None and dp > 1) else "pp"
+    elif dp is not None and dp > 1:
+        strategy = "ddp"
+    elif strategy is None and num_gpus_detected > 1 and not has_tp and not has_pp:
+        # Multiple GPUs detected via NVML with no TP/PP env vars →
+        # DDP is PyTorch's default and the only realistic inference.
+        # This is NOT the old `or "ddp"` — it only fires when probe
+        # actually observed multiple GPU processes.
+        strategy = "ddp"
+        if dp is None:
+            dp = num_gpus_detected
+
     return {
         "num_nodes": nnodes or 1,
         "gpus_per_node": gpn,
@@ -3555,6 +3591,7 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
         "pp_degree": pp,
         "dp_degree": dp,
         "interconnect_type": interconnect,
+        "strategy": strategy,
     }
 
 
```
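Read on its own, the added block is a small decision table over the parsed degrees. The following standalone sketch restates that mapping outside the CLI; `infer_strategy` is an illustrative name, not a function the package exports:

```python
def infer_strategy(tp, pp, dp, num_gpus_detected):
    """Restate the degree-to-strategy mapping from the hunk above (sketch only)."""
    has_tp = tp is not None and tp > 1
    has_pp = pp is not None and pp > 1
    if has_tp and has_pp:
        return "tp+pp+dp"
    if has_tp:
        return "tp+dp" if (dp is not None and dp > 1) else "tp"
    if has_pp:
        return "pp+dp" if (dp is not None and dp > 1) else "pp"
    if dp is not None and dp > 1:
        return "ddp"
    if num_gpus_detected > 1:
        # Multiple GPUs observed with no TP/PP evidence: DDP is the default.
        return "ddp"
    return None

assert infer_strategy(2, 2, None, 8) == "tp+pp+dp"
assert infer_strategy(None, None, 4, 4) == "ddp"
assert infer_strategy(None, None, None, 1) is None
```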
{alloc-0.0.7 → alloc-0.0.9}/src/alloc/extractor_runner.py

```diff
@@ -206,7 +206,11 @@ def main():
     except SystemExit:
         pass # catch real SystemExit too
     except Exception as e:
-
+        error_msg = str(e)[:500]
+        _dist_keywords = ("init_process_group", "nccl", "gloo", "distributed",
+                          "master_addr", "master_port", "rendezvouserror")
+        status = "error_distributed" if any(kw in error_msg.lower() for kw in _dist_keywords) else "error"
+        result = {"status": status, "error": error_msg}
         with open(sidecar_path, "w") as f:
             json.dump(result, f)
         return
```
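The classification added above reduces to a keyword scan over the truncated exception text. A minimal standalone sketch of that rule; `classify_extractor_error` and `DIST_KEYWORDS` are illustrative names, not part of the package:

```python
DIST_KEYWORDS = ("init_process_group", "nccl", "gloo", "distributed",
                 "master_addr", "master_port", "rendezvouserror")

def classify_extractor_error(exc: Exception) -> dict:
    """Sketch of the sidecar payload written when the extractor fails."""
    error_msg = str(exc)[:500]
    is_dist = any(kw in error_msg.lower() for kw in DIST_KEYWORDS)
    return {"status": "error_distributed" if is_dist else "error", "error": error_msg}

# classify_extractor_error(RuntimeError("NCCL error: unhandled system error"))
# -> {"status": "error_distributed", "error": "NCCL error: unhandled system error"}
```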
```diff
@@ -277,7 +281,30 @@ def main():
             "activation_method": activation_result.get("activation_method"),
         }
     else:
-
+        # No model found — check if this is a distributed training script
+        # that hides the model inside __main__ guard or main()
+        _is_dist = False
+        try:
+            import torch.distributed as _dist_mod
+            if _dist_mod.is_initialized():
+                _is_dist = True
+        except Exception:
+            pass
+        if not _is_dist:
+            # Check if module imported distributed primitives
+            for attr_name in dir(module):
+                try:
+                    obj = getattr(module, attr_name)
+                    mod_name = getattr(obj, "__module__", "") or ""
+                    if "torch.distributed" in mod_name or "torch.nn.parallel" in mod_name:
+                        _is_dist = True
+                        break
+                except Exception:
+                    continue
+        if _is_dist:
+            result = {"status": "error_distributed", "error": "no model found — script uses distributed training"}
+        else:
+            result = {"status": "no_model"}
 
     with open(sidecar_path, "w") as f:
         json.dump(result, f)
```
{alloc-0.0.7 → alloc-0.0.9}/src/alloc/model_extractor.py

```diff
@@ -108,6 +108,10 @@ def _extract_via_subprocess(
     env.setdefault("WORLD_SIZE", "1")
     env.setdefault("MASTER_ADDR", "127.0.0.1")
     env.setdefault("MASTER_PORT", "29500")
+    # Suppress pynvml/torch.cuda deprecation warnings in subprocess
+    existing = env.get("PYTHONWARNINGS", "")
+    filters = "ignore::FutureWarning,ignore::DeprecationWarning"
+    env["PYTHONWARNINGS"] = f"{existing},{filters}" if existing else filters
 
     subprocess.run(
         [sys.executable, "-m", "alloc.extractor_runner", sidecar_path, script_abs],
```
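PYTHONWARNINGS accepts a comma-separated list of `action::category` filters, which is why the hunk appends to any pre-existing value instead of overwriting it. A quick standalone check of the composed value, assuming nothing about the package itself:

```python
import os
import subprocess
import sys

# Compose the filter string the same way the hunk above does, then confirm a
# child interpreter suppresses FutureWarning/DeprecationWarning under it.
env = dict(os.environ)
existing = env.get("PYTHONWARNINGS", "")
filters = "ignore::FutureWarning,ignore::DeprecationWarning"
env["PYTHONWARNINGS"] = f"{existing},{filters}" if existing else filters

child = (
    "import warnings; "
    "warnings.warn('future', FutureWarning); "
    "warnings.warn('deprecated', DeprecationWarning); "
    "print('filters applied: nothing should have been printed to stderr')"
)
subprocess.run([sys.executable, "-c", child], env=env, check=True)
```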
```diff
@@ -137,11 +141,16 @@
     )
 
     # Structured degradation for distributed scripts
-
-
-
-
-
+    status = data.get("status", "")
+    if status in ("error", "error_distributed"):
+        is_distributed = status == "error_distributed"
+        if not is_distributed:
+            # Fallback keyword match for older sidecar format
+            error_msg = data.get("error", "")
+            _dist_keywords = ("init_process_group", "NCCL", "gloo", "distributed",
+                              "MASTER_ADDR", "MASTER_PORT", "RendezvousError")
+            is_distributed = any(kw.lower() in error_msg.lower() for kw in _dist_keywords)
+        if is_distributed:
             return ModelInfo(
                 param_count=0,
                 dtype="float16",
```
{alloc-0.0.7 → alloc-0.0.9}/src/alloc/upload.py

```diff
@@ -123,6 +123,7 @@ def upload_artifact(artifact_path: str, api_url: str, token: str) -> dict:
         "dataloader_wait_pct": probe.get("dataloader_wait_pct"),
         "comm_overhead_pct": probe.get("comm_overhead_pct"),
         "per_rank_peak_vram_mb": probe.get("per_rank_peak_vram_mb"),
+        "process_map": probe.get("process_map"),
         # Architecture fields: probe (callbacks) takes priority over ghost defaults
         "batch_size": probe.get("batch_size") or (ghost.get("batch_size") if ghost else None),
         "seq_length": ghost.get("seq_length") if ghost else None,
```
{alloc-0.0.7 → alloc-0.0.9}/src/alloc.egg-info/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.7
+Version: 0.0.9
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
@@ -40,7 +40,7 @@ alloc run python train.py
 ```
 
 ```
-alloc v0.0.
+alloc v0.0.8 — Calibrate
 
 Run Summary
 Peak VRAM 31.2 GB / 40.0 GB (A100)
````
{alloc-0.0.7 → alloc-0.0.9}/tests/test_auth.py

```diff
@@ -68,6 +68,34 @@ def test_whoami_not_logged_in_json(tmp_path: Path):
     assert data["api_url"] == "https://api.example.com"
 
 
+def test_whoami_stale_token_json(tmp_path: Path):
+    """Stale token should exit 0 with token_status: expired."""
+    mock_resp = MagicMock()
+    mock_resp.status_code = 401
+    mock_resp.raise_for_status.side_effect = httpx.HTTPStatusError(
+        "Unauthorized", request=MagicMock(), response=mock_resp,
+    )
+    mock_client = MagicMock()
+    mock_client.__enter__.return_value = mock_client
+    mock_client.__exit__.return_value = False
+    mock_client.get.return_value = mock_resp
+
+    env = {
+        "HOME": str(tmp_path),
+        "ALLOC_API_URL": "https://api.example.com",
+        "ALLOC_TOKEN": "stale-token",
+    }
+
+    with patch("httpx.Client", return_value=mock_client), \
+         patch("alloc.cli.try_refresh_access_token", return_value=None):
+        result = runner.invoke(app, ["whoami", "--json"], env=env)
+
+    assert result.exit_code == 0
+    data = json.loads(result.output)
+    assert data["logged_in"] is False
+    assert data["token_status"] == "expired"
+
+
 def test_whoami_logged_in_json(tmp_path: Path):
     profile_resp = MagicMock()
     profile_resp.raise_for_status.return_value = None
@@ -110,6 +138,7 @@ def test_whoami_logged_in_json(tmp_path: Path):
     assert result.exit_code == 0
     data = json.loads(result.output)
     assert data["logged_in"] is True
+    assert data["token_status"] == "valid"
     assert data["token_source"] == "env"
     assert data["email"] == "user@example.com"
     assert data["fleet_count"] == 1
```
{alloc-0.0.7 → alloc-0.0.9}/tests/test_cli.py

```diff
@@ -239,7 +239,7 @@ def test_status_json_no_artifact(tmp_path, monkeypatch):
     assert result.exit_code == 0
     data = json.loads(result.output.strip())
     assert data["artifact"] is None
-    assert data["
+    assert data["has_token"] is False
     assert "version" in data
 
 
```
alloc-0.0.9/tests/test_topology_strategy.py (new file)

```diff
@@ -0,0 +1,93 @@
+"""Tests for strategy inference from topology degrees (P0-B)."""
+
+from __future__ import annotations
+
+import os
+from unittest.mock import patch
+
+from alloc.cli import _infer_parallel_topology_from_env
+
+
+class TestStrategyInference:
+    """Strategy should be inferred from TP/PP/DP degrees when present."""
+
+    def _topo(self, env=None, num_gpus=4):
+        env = env or {}
+        with patch.dict(os.environ, env, clear=False):
+            return _infer_parallel_topology_from_env(
+                num_gpus_detected=num_gpus,
+            )
+
+    def test_no_degrees_multi_gpu_infers_ddp(self):
+        """When no degree env vars but multiple GPUs detected, infer DDP."""
+        result = self._topo({}, num_gpus=4)
+        assert result["strategy"] == "ddp"
+        assert result["dp_degree"] == 4
+
+    def test_single_gpu_no_degrees_strategy_none(self):
+        """Single GPU with no degrees → strategy stays None."""
+        result = self._topo({}, num_gpus=1)
+        assert result["strategy"] is None
+
+    def test_dp_only_is_ddp(self):
+        """WORLD_SIZE=4 with no TP/PP → dp inferred → strategy=ddp."""
+        result = self._topo({"WORLD_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+        assert result["dp_degree"] == 4
+
+    def test_tp_only(self):
+        """TP_SIZE=4 alone → strategy=tp."""
+        result = self._topo({"TP_SIZE": "4"})
+        assert result["strategy"] == "tp"
+
+    def test_pp_only(self):
+        """PP_SIZE=4 alone → strategy=pp."""
+        result = self._topo({"PP_SIZE": "4"})
+        assert result["strategy"] == "pp"
+
+    def test_tp_dp(self):
+        """TP_SIZE=2 with DP_SIZE=2 → strategy=tp+dp."""
+        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "tp+dp"
+
+    def test_pp_dp(self):
+        """PP_SIZE=2 with DP_SIZE=2 → strategy=pp+dp."""
+        result = self._topo({"PP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "pp+dp"
+
+    def test_tp_pp_dp(self):
+        """All three degrees → strategy=tp+pp+dp."""
+        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "tp+pp+dp"
+
+    def test_tp_pp_no_dp(self):
+        """TP+PP without explicit DP → strategy=tp+pp+dp."""
+        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2"})
+        assert result["strategy"] == "tp+pp+dp"
+
+    def test_tp_size_1_not_counted(self):
+        """TP_SIZE=1 should not count as tensor parallelism."""
+        result = self._topo({"TP_SIZE": "1", "DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+
+    def test_pp_size_1_not_counted(self):
+        """PP_SIZE=1 should not count as pipeline parallelism."""
+        result = self._topo({"PP_SIZE": "1", "DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+
+    def test_dp_inferred_from_world_size(self):
+        """DP inferred from WORLD_SIZE / (TP * PP) → strategy includes dp."""
+        result = self._topo({"WORLD_SIZE": "8", "TP_SIZE": "2"})
+        assert result["dp_degree"] == 4
+        assert result["strategy"] == "tp+dp"
+
+
+class TestProcessMapInProbeDictAssembly:
+    """process_map should reach probe_dict from ProbeResult."""
+
+    def test_process_map_present_in_topology_return(self):
+        """Topology dict now includes strategy field."""
+        with patch.dict(os.environ, {"WORLD_SIZE": "4"}, clear=False):
+            topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
+            assert "strategy" in topo
+            assert topo["strategy"] == "ddp"
```