PyPI - alloc - Versions diffs - 0.0.9__tar.gz → 0.0.11__tar.gz - Mend

alloc 0.0.9tar.gz → 0.0.11tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

{alloc-0.0.9 → alloc-0.0.11}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.9
+Version: 0.0.11
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
@@ -40,7 +40,7 @@ alloc run python train.py
 ```
 ```
-alloc v0.0.8 — Calibrate
+alloc v0.0.9 — Calibrate
  Run Summary
   Peak VRAM       31.2 GB / 40.0 GB (A100)

{alloc-0.0.9 → alloc-0.0.11}/README.md RENAMED Viewed

@@ -12,7 +12,7 @@ alloc run python train.py
 ```
 ```
-alloc v0.0.8 — Calibrate
+alloc v0.0.9 — Calibrate
  Run Summary
   Peak VRAM       31.2 GB / 40.0 GB (A100)

{alloc-0.0.9 → alloc-0.0.11}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "alloc"
-version = "0.0.9"
+version = "0.0.11"
 description = "Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints."
 readme = "README.md"
 license = "Apache-2.0"

{alloc-0.0.9 → alloc-0.0.11}/src/alloc/__init__.py RENAMED Viewed

@@ -9,7 +9,7 @@ _warnings.filterwarnings("ignore", category=FutureWarning, module=r"torch\.cuda"
 _warnings.filterwarnings("ignore", category=DeprecationWarning, module=r"torch\.cuda")
 del _warnings
-__version__ = "0.0.9"
+__version__ = "0.0.11"
 from alloc.ghost import ghost, GhostReport
 from alloc.callbacks import AllocCallback as HuggingFaceCallback

{alloc-0.0.9 → alloc-0.0.11}/src/alloc/browser_auth.py RENAMED Viewed

@@ -121,8 +121,9 @@ def browser_login(
     })
     authorize_url = f"{supabase_url}/auth/v1/authorize?{authorize_params}"
-    # Bind to 0.0.0.0 so both localhost and 127.0.0.1 reach the server.
-    server = HTTPServer(("0.0.0.0", port), _CallbackHandler)
+    # Bind to 127.0.0.1 only — the auth callback server should never be
+    # reachable from the network.
+    server = HTTPServer(("127.0.0.1", port), _CallbackHandler)
     server.auth_code = None  # type: ignore[attr-defined]
     server.auth_error = None  # type: ignore[attr-defined]
     server.timeout = 1  # poll interval for handle_request()

{alloc-0.0.9 → alloc-0.0.11}/src/alloc/callbacks.py RENAMED Viewed

@@ -501,7 +501,10 @@ class _NvmlMonitor:
             self._hw_context["nvlink_active_links"] = active_links
         except Exception:
-            pass
+            # NVLink detection code failed after entering the try block.
+            # We know NVML is functional (handles exist), so fall back to
+            # generic "nvlink" rather than leaving interconnect_type unset.
+            self._hw_context["interconnect_type"] = "nvlink"
         self._thread = threading.Thread(target=self._sample_loop, daemon=True)
         self._thread.start()

{alloc-0.0.9 → alloc-0.0.11}/src/alloc/cli.py RENAMED Viewed

@@ -328,6 +328,7 @@ def run(
     no_config: bool = typer.Option(False, "--no-config", help="Skip .alloc.yaml (use catalog defaults)"),
     after: Optional[str] = typer.Option(None, "--after", help="Previous run ID to compare against (outcome tracking)"),
     experiment: Optional[str] = typer.Option(None, "--experiment", "-e", help="Experiment group name"),
+    strategy: Optional[str] = typer.Option(None, "--strategy", help="Override detected strategy (ddp, fsdp, deepspeed, tp, pp, etc.)"),
 ):
     """Run a training command with GPU monitoring."""
     from alloc.probe import probe_command
@@ -342,6 +343,18 @@ def run(
         console.print("Usage: alloc run python train.py")
         raise typer.Exit(1)
+    # Validate --strategy against API-accepted values
+    _VALID_STRATEGIES = {
+        "ddp", "fsdp", "deepspeed", "tp", "pp",
+        "tp+dp", "pp+dp", "tp+pp+dp", "tp+pp+fsdp",
+    }
+    if strategy and strategy.lower() not in _VALID_STRATEGIES:
+        console.print(
+            f"[red]Invalid --strategy '{strategy}'. "
+            f"Valid values: {', '.join(sorted(_VALID_STRATEGIES))}[/red]"
+        )
+        raise typer.Exit(1)
     # ALLOC_POLICY: "warn" or "enforce" forces full monitoring
     alloc_policy = os.environ.get("ALLOC_POLICY", "").lower().strip()
     if alloc_policy and alloc_policy not in ("warn", "enforce"):
@@ -425,10 +438,25 @@ def run(
     # Discover environment context (git, container, Ray)
     from alloc.context import discover_context
     env_context = discover_context()
+    # AST strategy hint: detect FSDP/DDP/DeepSpeed from script source
+    ast_hint = None  # type: Optional[str]
+    try:
+        from alloc.code_analyzer import detect_strategy_hint
+        # Find the .py script in the command (e.g. "python train.py" or "torchrun ... train.py")
+        for arg in command:
+            if arg.endswith(".py") and os.path.isfile(arg):
+                ast_hint = detect_strategy_hint(arg)
+                break
+    except Exception:
+        pass  # Never crash on AST analysis failure
     topology = _infer_parallel_topology_from_env(
         num_gpus_detected=result.num_gpus_detected,
         config_interconnect=gpu_context.get("interconnect") if gpu_context else None,
         detected_interconnect=result.detected_interconnect,
+        strategy_override=strategy,
+        ast_strategy_hint=ast_hint,
     )
     objective = os.environ.get("ALLOC_OBJECTIVE", "").strip().lower() or _objective_from_context(gpu_context)
     max_budget_hourly = _max_budget_hourly_from_context(gpu_context)
@@ -456,6 +484,7 @@ def run(
             "pp_degree": topology.get("pp_degree"),
             "dp_degree": topology.get("dp_degree"),
             "strategy": topology.get("strategy"),
+            "strategy_detection_method": topology.get("strategy_detection_method"),
             "interconnect_type": topology.get("interconnect_type"),
             "process_map": result.process_map,
             "objective": objective,
@@ -3522,8 +3551,22 @@ def _print_gpu_context_detail(ctx: dict) -> None:
     console.print(Panel("\n".join(lines), title="GPU Context (.alloc.yaml)", border_style="cyan", padding=(1, 0)))
-def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_interconnect: Optional[str] = None, detected_interconnect: Optional[str] = None) -> dict:
-    """Infer distributed topology hints from common launcher env vars."""
+def _infer_parallel_topology_from_env(
+    *,
+    num_gpus_detected: int,
+    config_interconnect: Optional[str] = None,
+    detected_interconnect: Optional[str] = None,
+    strategy_override: Optional[str] = None,
+    ast_strategy_hint: Optional[str] = None,
+) -> dict:
+    """Infer distributed topology hints from common launcher env vars.
+    Strategy precedence:
+      1. --strategy override (user explicit)
+      2. Env var inference (explicit TP/PP/DP degree vars; a dp degree
+         derived only from WORLD_SIZE never implies ddp)
+      3. AST hint (code_analyzer detected FSDP/DDP/DeepSpeed; multi-GPU only)
+      4. None (unknown — never silently default to ddp)
+    """
     def _get_int(name: str) -> Optional[int]:
         val = os.environ.get(name)
@@ -3545,8 +3588,12 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
     tp = _get_int("TP_SIZE") or _get_int("TENSOR_PARALLEL_SIZE")
     pp = _get_int("PP_SIZE") or _get_int("PIPELINE_PARALLEL_SIZE")
-    dp = _get_int("DP_SIZE") or _get_int("DATA_PARALLEL_SIZE")
+    dp_explicit = _get_int("DP_SIZE") or _get_int("DATA_PARALLEL_SIZE")
+    dp = dp_explicit
+    # Derive dp from WORLD_SIZE when not explicitly set.
+    # This gives us the degree but does NOT imply strategy=ddp
+    # (WORLD_SIZE is set for both DDP and FSDP).
     if dp is None and world_size is not None:
         denom = (tp or 1) * (pp or 1)
         if denom > 0 and world_size % denom == 0:
@@ -3563,26 +3610,41 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
     if interconnect not in ("pcie", "nvlink", "nvlink_switch", "nvlink_p2p", "infiniband", "unknown"):
         interconnect = "unknown"
-    # Infer strategy from degrees — only when evidence exists
+    # Strategy detection with strict precedence and provenance tracking
     strategy = None
-    has_tp = tp is not None and tp > 1
-    has_pp = pp is not None and pp > 1
-    if has_tp and has_pp:
+    strategy_detection_method = None  # type: Optional[str]
+    # 1. Explicit --strategy override
+    if strategy_override:
+        strategy = strategy_override.lower()
+        strategy_detection_method = "user_override"
+    # 2. Env var inference from explicit TP/PP/DP degree env vars
+    #    TP_SIZE/PP_SIZE unambiguously identify the strategy.
+    #    DP_SIZE (explicit) implies DDP. But WORLD_SIZE-derived dp does NOT
+    #    imply DDP — FSDP uses the same WORLD_SIZE.
+    elif tp is not None and tp > 1 and pp is not None and pp > 1:
         strategy = "tp+pp+dp"
-    elif has_tp:
+        strategy_detection_method = "env_degrees"
+    elif tp is not None and tp > 1:
         strategy = "tp+dp" if (dp is not None and dp > 1) else "tp"
-    elif has_pp:
+        strategy_detection_method = "env_degrees"
+    elif pp is not None and pp > 1:
         strategy = "pp+dp" if (dp is not None and dp > 1) else "pp"
-    elif dp is not None and dp > 1:
-        strategy = "ddp"
-    elif strategy is None and num_gpus_detected > 1 and not has_tp and not has_pp:
-        # Multiple GPUs detected via NVML with no TP/PP env vars →
-        # DDP is PyTorch's default and the only realistic inference.
-        # This is NOT the old `or "ddp"` — it only fires when probe
-        # actually observed multiple GPU processes.
+        strategy_detection_method = "env_degrees"
+    elif dp_explicit is not None and dp_explicit > 1:
         strategy = "ddp"
+        strategy_detection_method = "env_degrees"
+    # 3. AST hint (code_analyzer detected FSDP/DDP/DeepSpeed in script)
+    elif ast_strategy_hint and num_gpus_detected > 1:
+        strategy = ast_strategy_hint
+        strategy_detection_method = "ast_analysis"
         if dp is None:
             dp = num_gpus_detected
+    # 4. No trustworthy signal — leave strategy=None
+    # (Never silently collapse unknown distributed runs to ddp)
+    if strategy and dp is None and num_gpus_detected > 1:
+        dp = num_gpus_detected
     return {
         "num_nodes": nnodes or 1,
@@ -3592,6 +3654,7 @@ def _infer_parallel_topology_from_env(*, num_gpus_detected: int, config_intercon
         "dp_degree": dp,
         "interconnect_type": interconnect,
         "strategy": strategy,
+        "strategy_detection_method": strategy_detection_method,
     }

{alloc-0.0.9 → alloc-0.0.11}/src/alloc/code_analyzer.py RENAMED Viewed

@@ -132,6 +132,34 @@ def analyze_script(script_path: str) -> CodeFindings:
     return findings
+def detect_strategy_hint(script_path: str) -> Optional[str]:
+    """Lightweight AST check: return strategy kind if detectable, else None.
+    Parses ``script_path`` and inspects the findings returned by
+    ``_find_distributed``.
+    Returns one of: 'fsdp', 'deepspeed', 'ddp', or None. When several kinds
+    are found, the first match in that order wins; any other kind (e.g.
+    'data_parallel') is ignored and yields None.
+    Never crashes — returns None on any error, including a missing file
+    or a script that fails to parse.
+    """
+    try:
+        if not os.path.isfile(script_path):
+            return None
+        with open(script_path, "r") as f:
+            source = f.read()
+        tree = ast.parse(source, filename=script_path)
+        imports = _walk_imports(tree)
+        distributed = _find_distributed(tree, imports, source.splitlines(), script_path)
+        # Priority: fsdp > deepspeed > ddp
+        # data_parallel is single-process (not a distributed strategy) — ignored.
+        kinds = {d.kind for d in distributed}
+        if "fsdp" in kinds:
+            return "fsdp"
+        if "deepspeed" in kinds:
+            return "deepspeed"
+        if "ddp" in kinds:
+            return "ddp"
+        return None
+    except Exception:
+        # Deliberate best-effort: strategy detection must never break a run.
+        return None
 # ---------------------------------------------------------------------------
 # Import resolution
 # ---------------------------------------------------------------------------

{alloc-0.0.9 → alloc-0.0.11}/src/alloc/diagnosis_engine.py RENAMED Viewed

@@ -403,7 +403,7 @@ def _estimate_model_params(model_name: str) -> Optional[float]:
         "whisper-large": 1.55,
     }
-    for key, params in estimates.items():
+    for key, params in sorted(estimates.items(), key=lambda x: len(x[0]), reverse=True):
         if key in name:
             return params

{alloc-0.0.9 → alloc-0.0.11}/src/alloc/probe.py RENAMED Viewed

@@ -215,8 +215,19 @@ def _discover_gpu_indices(proc_pid, pynvml, fallback_index=0, expected_gpus=None
                     if 0 <= idx < device_count:
                         visible_physical.append(idx)
                 except ValueError:
-                    visible_physical = list(range(device_count))
-                    break
+                    # UUID-style device identifiers — try NVML UUID matching
+                    try:
+                        for phys_idx in range(device_count):
+                            handle = pynvml.nvmlDeviceGetHandleByIndex(phys_idx)
+                            uuid = pynvml.nvmlDeviceGetUUID(handle)
+                            if isinstance(uuid, bytes):
+                                uuid = uuid.decode("utf-8", errors="replace")
+                            if d in uuid:
+                                visible_physical.append(phys_idx)
+                                break
+                    except Exception:
+                        visible_physical = list(range(device_count))
+                        break
         search_indices = visible_physical if visible_physical else list(range(device_count))
     else:
         search_indices = list(range(device_count))
@@ -363,12 +374,27 @@ def probe_command(
     """
     pynvml = _try_import_pynvml()
-    # Launch the user's training subprocess — do NOT modify env (their warnings matter)
+    # Launch the user's training subprocess.
+    # Suppress only pynvml/torch.cuda FutureWarning noise — these come from
+    # Alloc's own callbacks or from torch internals, not from user code.
+    # Propagates to torchrun children and most Ray workers via env inheritance.
+    child_env = os.environ.copy()
+    existing_pw = child_env.get("PYTHONWARNINGS", "")
+    alloc_filters = (
+        "ignore::FutureWarning:pynvml,"
+        "ignore::DeprecationWarning:pynvml,"
+        "ignore::FutureWarning:torch.cuda,"
+        "ignore::DeprecationWarning:torch.cuda"
+    )
+    child_env["PYTHONWARNINGS"] = (
+        f"{existing_pw},{alloc_filters}" if existing_pw else alloc_filters
+    )
     try:
         proc = subprocess.Popen(
             command,
             stdout=sys.stdout,
             stderr=sys.stderr,
+            env=child_env,
         )
     except Exception as e:
         return ProbeResult(
@@ -507,11 +533,11 @@ def probe_command(
                         power_vals.append(pw)
                         total_mb = mi.total / (1024 * 1024)
-                    # Track per-GPU peak VRAM for multi-GPU runs
-                    if len(handles) > 1:
-                        pgp = per_gpu_peaks_ref[0]
-                        for gi, vm in enumerate(vram_vals):
-                            pgp[gi] = max(pgp.get(gi, 0.0), vm)
+                    # Track per-GPU peak VRAM (always, even single GPU —
+                    # discovery may expand handles later, and we need history from sample 0)
+                    pgp = per_gpu_peaks_ref[0]
+                    for gi, vm in enumerate(vram_vals):
+                        pgp[gi] = max(pgp.get(gi, 0.0), vm)
                     samples.append(ProbeSample(
                         timestamp=time.time(),
@@ -664,7 +690,7 @@ def probe_command(
         process_map=process_map_ref[0],
         per_gpu_peak_vram_mb=(
             [round(per_gpu_peaks_ref[0].get(i, 0), 1) for i in range(num_gpus_ref[0])]
-            if len(per_gpu_peaks_ref[0]) > 1 else None
+            if num_gpus_ref[0] > 1 and per_gpu_peaks_ref[0] else None
         ),
         detected_interconnect=detected_ic_ref[0],
     )

{alloc-0.0.9 → alloc-0.0.11}/src/alloc.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alloc
-Version: 0.0.9
+Version: 0.0.11
 Summary: Engineer-first training calibration: estimate VRAM fit, profile short runs, and pick GPU configs under real budget constraints.
 Author-email: Alloc Labs <hello@alloclabs.com>
 License-Expression: Apache-2.0
@@ -40,7 +40,7 @@ alloc run python train.py
 ```
 ```
-alloc v0.0.8 — Calibrate
+alloc v0.0.9 — Calibrate
  Run Summary
   Peak VRAM       31.2 GB / 40.0 GB (A100)

{alloc-0.0.9 → alloc-0.0.11}/tests/test_callbacks.py RENAMED Viewed

@@ -1189,3 +1189,71 @@ class TestNvmlMonitorThreadSafety:
         assert len(probe["per_rank_peak_vram_mb"]) == 2
         for peak in probe["per_rank_peak_vram_mb"]:
             assert peak > 0
+class TestNvmlMonitorNvlinkFallback:
+    def test_nvlink_detection_failure_sets_nvlink_fallback(self):
+        """When the outer NVLink detection block raises, fall back to 'nvlink'.
+        The failure is injected by swapping ``monitor._gpu_handles`` for a
+        truthy object whose ``__getitem__`` raises IndexError, so the
+        ``if not self._gpu_handles`` guard passes but indexing the handles
+        blows up. The swap happens as a side effect of the mocked
+        nvmlDeviceGetCudaComputeCapability call made during start().
+        """
+        mock_pynvml = MagicMock()
+        mock_pynvml.nvmlInit.return_value = None
+        mock_pynvml.nvmlShutdown.return_value = None
+        mock_pynvml.nvmlDeviceGetCount.return_value = 2
+        mock_pynvml.nvmlDeviceGetName.return_value = "NVIDIA A100-SXM4-80GB"
+        mem = SimpleNamespace(total=80 * 1024**3, used=1 * 1024**3)
+        mock_pynvml.nvmlDeviceGetMemoryInfo.return_value = mem
+        mock_pynvml.nvmlSystemGetDriverVersion.return_value = "535"
+        mock_pynvml.nvmlSystemGetCudaDriverVersion.return_value = 12000
+        mock_pynvml.nvmlDeviceGetCudaComputeCapability.return_value = (8, 0)
+        util = SimpleNamespace(gpu=75, memory=60)
+        mock_pynvml.nvmlDeviceGetUtilizationRates.return_value = util
+        mock_pynvml.nvmlDeviceGetPowerUsage.return_value = 300000
+        # Use a handle list that passes the `if not self._gpu_handles` guard
+        # (it's truthy) but raises IndexError on `self._gpu_handles[0]`.
+        class BadHandleList:
+            """Truthy but raises on index access."""
+            def __bool__(self):
+                return True
+            def __len__(self):
+                return 2
+            def __iter__(self):
+                return iter([])
+            def __getitem__(self, idx):
+                raise IndexError("corrupted handle list")
+        with patch("alloc.callbacks._try_import_pynvml", return_value=mock_pynvml):
+            monitor = _NvmlMonitor()
+        # start() rebuilds _gpu_handles itself, so the bad list cannot simply
+        # be assigned beforehand, and making nvmlDeviceGetHandleByIndex raise
+        # would leave the list empty and hit the early return. Instead the
+        # swap is done from inside start(): nvmlDeviceGetCudaComputeCapability
+        # (per the original note, the last hw-context call before NVLink
+        # detection) installs the bad handles as a side effect.
+        # NOTE(review): original_sm below is never read — candidate for removal.
+        original_sm = mock_pynvml.nvmlDeviceGetCudaComputeCapability
+        def swap_handles_then_return_sm(handle):
+            monitor._gpu_handles = BadHandleList()
+            return (8, 0)
+        mock_pynvml.nvmlDeviceGetCudaComputeCapability = MagicMock(
+            side_effect=swap_handles_then_return_sm
+        )
+        monitor.start()
+        import time
+        # Brief pause so the sampling thread gets a chance to run before stop().
+        time.sleep(0.02)
+        monitor.stop()
+        hw, _ = monitor.get_results()
+        assert hw.get("interconnect_type") == "nvlink"

{alloc-0.0.9 → alloc-0.0.11}/tests/test_diagnosis_engine.py RENAMED Viewed

@@ -359,3 +359,15 @@ def test_estimate_model_params_known_vision_model():
     result = _estimate_model_params("stable-diffusion")
     assert result == 0.865
+def test_estimate_model_params_gpt2_medium_prefix_match():
+    """gpt2-medium-finetuned should match gpt2-medium (0.355), not gpt2 (0.124)."""
+    result = _estimate_model_params("gpt2-medium-finetuned")
+    assert result == 0.355
+def test_estimate_model_params_gpt2_alone():
+    """Plain gpt2 should still match 0.124."""
+    result = _estimate_model_params("gpt2")
+    assert result == 0.124

{alloc-0.0.9 → alloc-0.0.11}/tests/test_probe_multi.py RENAMED Viewed

@@ -158,6 +158,61 @@ def test_parse_plain_python():
     assert _parse_launcher_gpu_count(["python", "train.py"]) is None
+# ── CVD UUID resolution ──
+def test_cvd_uuid_resolves_to_correct_index():
+    """UUID-style CUDA_VISIBLE_DEVICES should resolve to the matching physical GPU index."""
+    mock = _mock_pynvml_multi_gpu(
+        proc_pid=1000,
+        gpu_process_map={0: [1000], 1: [], 2: []},
+    )
+    mock.nvmlDeviceGetCount.return_value = 3
+    # Set up UUID resolution: GPU 0 → UUID-A, GPU 1 → UUID-B, GPU 2 → UUID-C
+    uuid_map = {0: "GPU-aaaa-1111", 1: "GPU-bbbb-2222", 2: "GPU-cccc-3333"}
+    handles = {}
+    for idx in range(3):
+        handles[idx] = MagicMock(name=f"handle_{idx}")
+    def get_handle(idx):
+        return handles[idx]
+    def get_uuid(handle):
+        # Reverse-lookup the physical index for a handle, then map it to its UUID.
+        for idx, h in handles.items():
+            if handle == h:
+                return uuid_map[idx]
+        return "GPU-unknown"
+    mock.nvmlDeviceGetHandleByIndex = MagicMock(side_effect=get_handle)
+    mock.nvmlDeviceGetUUID = MagicMock(side_effect=get_uuid)
+    # CVD set to GPU 2's UUID
+    with patch("alloc.probe._get_child_pids", return_value=[]):
+        with patch.dict("os.environ", {"CUDA_VISIBLE_DEVICES": "GPU-cccc-3333"}):
+            result = _discover_gpu_indices(1000, mock, fallback_index=0)
+    # Should only search GPU index 2
+    # NOTE(review): the process map puts PID 1000 on GPU 0 while CVD selects
+    # GPU 2, and the assertion also accepts the fallback `[0]`. As written it
+    # likely cannot distinguish correct UUID resolution from no resolution at
+    # all — consider placing the PID on GPU 2 and asserting `result == [2]`.
+    assert 2 in result or result == [0]  # either found on idx 2, or fallback if no PID match
+def test_cvd_invalid_uuid_falls_back_to_all_gpus():
+    """If NVML UUID lookup raises for a UUID-style CVD entry, fall back to all GPUs."""
+    mock = _mock_pynvml_multi_gpu(
+        proc_pid=1000,
+        gpu_process_map={0: [1000], 1: []},
+    )
+    mock.nvmlDeviceGetCount.return_value = 2
+    # UUID lookup raises for all devices
+    mock.nvmlDeviceGetUUID = MagicMock(side_effect=RuntimeError("no UUID support"))
+    # NOTE(review): only the raising path is exercised here; the case where
+    # lookup succeeds but no device UUID matches is not covered by this test.
+    with patch("alloc.probe._get_child_pids", return_value=[]):
+        with patch.dict("os.environ", {"CUDA_VISIBLE_DEVICES": "GPU-nonexistent"}):
+            result = _discover_gpu_indices(1000, mock, fallback_index=0)
+    # Should fall back to searching all GPUs and find PID 1000 on GPU 0
+    assert 0 in result
 def test_parse_torch_distributed_launch():
     assert _parse_launcher_gpu_count([
         "python", "-m", "torch.distributed.launch", "--nproc_per_node=2", "train.py"

alloc-0.0.11/tests/test_topology_strategy.py ADDED Viewed

@@ -0,0 +1,215 @@
+"""Tests for strategy inference from topology degrees (P0-B)."""
+from __future__ import annotations
+import os
+from unittest.mock import patch
+from alloc.cli import _infer_parallel_topology_from_env
+class TestStrategyInference:
+    """Strategy should be inferred from TP/PP/DP degrees when present."""
+    def _topo(self, env=None, num_gpus=4, **kwargs):
+        env = env or {}
+        with patch.dict(os.environ, env, clear=False):
+            return _infer_parallel_topology_from_env(
+                num_gpus_detected=num_gpus,
+                **kwargs,
+            )
+    def test_no_degrees_multi_gpu_strategy_none(self):
+        """When no degree env vars and no AST hint, strategy stays None."""
+        result = self._topo({}, num_gpus=4)
+        assert result["strategy"] is None
+        assert result["strategy_detection_method"] is None
+    def test_single_gpu_no_degrees_strategy_none(self):
+        """Single GPU with no degrees → strategy stays None."""
+        result = self._topo({}, num_gpus=1)
+        assert result["strategy"] is None
+    def test_world_size_only_strategy_none(self):
+        """WORLD_SIZE=4 with no explicit DP_SIZE → strategy=None (ambiguous)."""
+        result = self._topo({"WORLD_SIZE": "4"})
+        assert result["strategy"] is None
+        assert result["dp_degree"] == 4  # degree still derived for topology
+    def test_explicit_dp_size_is_ddp(self):
+        """Explicit DP_SIZE=4 → strategy=ddp."""
+        result = self._topo({"DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+        assert result["dp_degree"] == 4
+    def test_tp_only(self):
+        """TP_SIZE=4 alone → strategy=tp."""
+        result = self._topo({"TP_SIZE": "4"})
+        assert result["strategy"] == "tp"
+    def test_pp_only(self):
+        """PP_SIZE=4 alone → strategy=pp."""
+        result = self._topo({"PP_SIZE": "4"})
+        assert result["strategy"] == "pp"
+    def test_tp_dp(self):
+        """TP_SIZE=2 with DP_SIZE=2 → strategy=tp+dp."""
+        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "tp+dp"
+    def test_pp_dp(self):
+        """PP_SIZE=2 with DP_SIZE=2 → strategy=pp+dp."""
+        result = self._topo({"PP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "pp+dp"
+    def test_tp_pp_dp(self):
+        """All three degrees → strategy=tp+pp+dp."""
+        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2", "DP_SIZE": "2"})
+        assert result["strategy"] == "tp+pp+dp"
+    def test_tp_pp_no_dp(self):
+        """TP+PP without explicit DP → strategy=tp+pp+dp."""
+        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2"})
+        assert result["strategy"] == "tp+pp+dp"
+    def test_tp_size_1_not_counted(self):
+        """TP_SIZE=1 should not count as tensor parallelism."""
+        result = self._topo({"TP_SIZE": "1", "DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+    def test_pp_size_1_not_counted(self):
+        """PP_SIZE=1 should not count as pipeline parallelism."""
+        result = self._topo({"PP_SIZE": "1", "DP_SIZE": "4"})
+        assert result["strategy"] == "ddp"
+    def test_dp_inferred_from_world_size(self):
+        """DP inferred from WORLD_SIZE / (TP * PP) → strategy includes dp."""
+        result = self._topo({"WORLD_SIZE": "8", "TP_SIZE": "2"})
+        assert result["dp_degree"] == 4
+        assert result["strategy"] == "tp+dp"
+class TestStrategyOverride:
+    """--strategy override takes highest precedence."""
+    def _topo(self, env=None, num_gpus=4, **kwargs):
+        env = env or {}
+        with patch.dict(os.environ, env, clear=False):
+            return _infer_parallel_topology_from_env(
+                num_gpus_detected=num_gpus,
+                **kwargs,
+            )
+    def test_override_beats_env_degrees(self):
+        """Explicit --strategy overrides env var inference."""
+        result = self._topo({"WORLD_SIZE": "4"}, strategy_override="fsdp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "user_override"
+    def test_override_beats_ast_hint(self):
+        """Explicit --strategy overrides AST hint."""
+        result = self._topo({}, num_gpus=2, strategy_override="fsdp", ast_strategy_hint="ddp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "user_override"
+    def test_ast_hint_used_when_multi_gpu(self):
+        """AST hint used when no env degrees and multi-GPU."""
+        result = self._topo({}, num_gpus=2, ast_strategy_hint="fsdp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "ast_analysis"
+        assert result["dp_degree"] == 2
+    def test_ast_hint_ignored_when_single_gpu(self):
+        """AST hint ignored for single GPU — no distributed strategy applies."""
+        result = self._topo({}, num_gpus=1, ast_strategy_hint="fsdp")
+        assert result["strategy"] is None
+    def test_env_degrees_beat_ast_hint(self):
+        """Env var TP/PP degrees take precedence over AST hint."""
+        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"}, ast_strategy_hint="fsdp")
+        assert result["strategy"] == "tp+dp"
+        assert result["strategy_detection_method"] == "env_degrees"
+    def test_world_size_plus_ast_fsdp_returns_fsdp(self):
+        """WORLD_SIZE=2 + ast_hint='fsdp' → strategy='fsdp' (real torchrun FSDP case)."""
+        result = self._topo({"WORLD_SIZE": "2"}, num_gpus=2, ast_strategy_hint="fsdp")
+        assert result["strategy"] == "fsdp"
+        assert result["strategy_detection_method"] == "ast_analysis"
+        assert result["dp_degree"] == 2
+    def test_world_size_plus_ast_deepspeed_returns_deepspeed(self):
+        """WORLD_SIZE=4 + ast_hint='deepspeed' → strategy='deepspeed'."""
+        result = self._topo({"WORLD_SIZE": "4"}, num_gpus=4, ast_strategy_hint="deepspeed")
+        assert result["strategy"] == "deepspeed"
+        assert result["strategy_detection_method"] == "ast_analysis"
+    def test_world_size_only_no_hint_stays_none(self):
+        """WORLD_SIZE=2 with no AST hint → strategy=None (ambiguous)."""
+        result = self._topo({"WORLD_SIZE": "2"}, num_gpus=2)
+        assert result["strategy"] is None
+        assert result["dp_degree"] == 2
+    def test_unknown_multi_gpu_stays_none(self):
+        """Multi-GPU with no hint and no env vars → strategy=None, not ddp."""
+        result = self._topo({}, num_gpus=4)
+        assert result["strategy"] is None
+        assert result["strategy_detection_method"] is None
+    def test_strategy_detection_method_in_result(self):
+        """strategy_detection_method is always present in result."""
+        result = self._topo({"DP_SIZE": "4"})
+        assert "strategy_detection_method" in result
+        assert result["strategy_detection_method"] == "env_degrees"
+class TestDetectStrategyHint:
+    """code_analyzer.detect_strategy_hint returns correct strategy from AST."""
+    def test_fsdp_script(self, tmp_path):
+        script = tmp_path / "train_fsdp.py"
+        script.write_text(
+            "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
+            "model = FSDP(model)\n"
+        )
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) == "fsdp"
+    def test_ddp_script(self, tmp_path):
+        script = tmp_path / "train_ddp.py"
+        script.write_text(
+            "from torch.nn.parallel import DistributedDataParallel as DDP\n"
+            "model = DDP(model)\n"
+        )
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) == "ddp"
+    def test_fsdp_beats_ddp_when_both_present(self, tmp_path):
+        script = tmp_path / "train_both.py"
+        script.write_text(
+            "from torch.nn.parallel import DistributedDataParallel as DDP\n"
+            "from torch.distributed.fsdp import FullyShardedDataParallel as FSDP\n"
+            "model = FSDP(model)\n"
+        )
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) == "fsdp"
+    def test_no_distributed_returns_none(self, tmp_path):
+        script = tmp_path / "train_simple.py"
+        script.write_text("import torch\nmodel = torch.nn.Linear(10, 10)\n")
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint(str(script)) is None
+    def test_nonexistent_file_returns_none(self):
+        from alloc.code_analyzer import detect_strategy_hint
+        assert detect_strategy_hint("/nonexistent/train.py") is None
+class TestProcessMapInProbeDictAssembly:
+    """The topology dict returned by the inference helper exposes ``strategy``.
+    NOTE(review): the class and test names mention process_map, but nothing
+    here touches process_map or probe_dict — consider renaming, or adding a
+    test that actually covers process_map propagation.
+    """
+    def test_process_map_present_in_topology_return(self):
+        """Topology dict now includes strategy field."""
+        with patch.dict(os.environ, {"DP_SIZE": "4"}, clear=False):
+            topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
+        assert "strategy" in topo
+        assert topo["strategy"] == "ddp"

alloc-0.0.9/tests/test_topology_strategy.py DELETED Viewed

@@ -1,93 +0,0 @@
-"""Tests for strategy inference from topology degrees (P0-B)."""
-from __future__ import annotations
-import os
-from unittest.mock import patch
-from alloc.cli import _infer_parallel_topology_from_env
-class TestStrategyInference:
-    """Strategy should be inferred from TP/PP/DP degrees when present."""
-    def _topo(self, env=None, num_gpus=4):
-        env = env or {}
-        with patch.dict(os.environ, env, clear=False):
-            return _infer_parallel_topology_from_env(
-                num_gpus_detected=num_gpus,
-            )
-    def test_no_degrees_multi_gpu_infers_ddp(self):
-        """When no degree env vars but multiple GPUs detected, infer DDP."""
-        result = self._topo({}, num_gpus=4)
-        assert result["strategy"] == "ddp"
-        assert result["dp_degree"] == 4
-    def test_single_gpu_no_degrees_strategy_none(self):
-        """Single GPU with no degrees → strategy stays None."""
-        result = self._topo({}, num_gpus=1)
-        assert result["strategy"] is None
-    def test_dp_only_is_ddp(self):
-        """WORLD_SIZE=4 with no TP/PP → dp inferred → strategy=ddp."""
-        result = self._topo({"WORLD_SIZE": "4"})
-        assert result["strategy"] == "ddp"
-        assert result["dp_degree"] == 4
-    def test_tp_only(self):
-        """TP_SIZE=4 alone → strategy=tp."""
-        result = self._topo({"TP_SIZE": "4"})
-        assert result["strategy"] == "tp"
-    def test_pp_only(self):
-        """PP_SIZE=4 alone → strategy=pp."""
-        result = self._topo({"PP_SIZE": "4"})
-        assert result["strategy"] == "pp"
-    def test_tp_dp(self):
-        """TP_SIZE=2 with DP_SIZE=2 → strategy=tp+dp."""
-        result = self._topo({"TP_SIZE": "2", "DP_SIZE": "2"})
-        assert result["strategy"] == "tp+dp"
-    def test_pp_dp(self):
-        """PP_SIZE=2 with DP_SIZE=2 → strategy=pp+dp."""
-        result = self._topo({"PP_SIZE": "2", "DP_SIZE": "2"})
-        assert result["strategy"] == "pp+dp"
-    def test_tp_pp_dp(self):
-        """All three degrees → strategy=tp+pp+dp."""
-        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2", "DP_SIZE": "2"})
-        assert result["strategy"] == "tp+pp+dp"
-    def test_tp_pp_no_dp(self):
-        """TP+PP without explicit DP → strategy=tp+pp+dp."""
-        result = self._topo({"TP_SIZE": "2", "PP_SIZE": "2"})
-        assert result["strategy"] == "tp+pp+dp"
-    def test_tp_size_1_not_counted(self):
-        """TP_SIZE=1 should not count as tensor parallelism."""
-        result = self._topo({"TP_SIZE": "1", "DP_SIZE": "4"})
-        assert result["strategy"] == "ddp"
-    def test_pp_size_1_not_counted(self):
-        """PP_SIZE=1 should not count as pipeline parallelism."""
-        result = self._topo({"PP_SIZE": "1", "DP_SIZE": "4"})
-        assert result["strategy"] == "ddp"
-    def test_dp_inferred_from_world_size(self):
-        """DP inferred from WORLD_SIZE / (TP * PP) → strategy includes dp."""
-        result = self._topo({"WORLD_SIZE": "8", "TP_SIZE": "2"})
-        assert result["dp_degree"] == 4
-        assert result["strategy"] == "tp+dp"
-class TestProcessMapInProbeDictAssembly:
-    """process_map should reach probe_dict from ProbeResult."""
-    def test_process_map_present_in_topology_return(self):
-        """Topology dict now includes strategy field."""
-        with patch.dict(os.environ, {"WORLD_SIZE": "4"}, clear=False):
-            topo = _infer_parallel_topology_from_env(num_gpus_detected=4)
-        assert "strategy" in topo
-        assert topo["strategy"] == "ddp"