PyPI - gpu-dev - Versions diffs - 0.7.10__tar.gz → 0.7.12__tar.gz - Mend

gpu-dev 0.7.10__tar.gz → 0.7.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (232) hide show

{gpu_dev-0.7.10 → gpu_dev-0.7.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.7.10
+Version: 0.7.12
 Summary: CLI + Python SDK for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py RENAMED Viewed

@@ -1724,6 +1724,47 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
                      "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"]
+def _build_submit_remote_script(workdir: str, remote_cmd: str, ref: Optional[str],
+                                no_build: bool, *, staging_timeout_s: int = 3600) -> str:
+    """Build the remote shell script `submit` runs over SSH (under `bash -lc`).
+    Without --ref the script is just `cd <workdir> && <cmd>`. With --ref the
+    pytorch tree is staged in the *background* in-pod (stage-pytorch &), and the
+    tree is only chowned to dev + the ref fully checked out at the very end.
+    Running the user command before that finishes yields a root-owned tree (git
+    "dubious ownership") and a source/installed-torch mismatch (the ref is
+    checked out but the prebuilt .so is the stale base build -> `import torch`
+    fails). So with --ref the script starts with a preamble that:
+      1. waits up to `staging_timeout_s` seconds for staging to finish (the
+         `.pytorch-staging` marker is removed at the end) and exits 91 if the
+         marker is still there afterwards, instead of silently running the
+         command against a half-staged tree,
+      2. marks /home/dev/pytorch a git safe.directory for the dev user,
+      3. unless --no-build, rebuilds incrementally so installed torch == the
+         checked-out source (a rebuild failure exits 90 before the user
+         command runs).
+    Steps 2-3 only run when `.pytorch-ready` is present, i.e. staging actually
+    ran, so --disk reservations (ref ignored, no staging) are unaffected.
+    Args:
+        workdir: Remote directory to `cd` into; shell-quoted by this function.
+        remote_cmd: User command line, inserted verbatim (callers must quote it).
+        ref: The --ref value; a falsy value disables the whole preamble.
+        no_build: When true, skip the `pip install -e .` rebuild step.
+        staging_timeout_s: Maximum number of one-second polls to wait for the
+            staging marker to disappear; values below 1 are clamped to 1.
+    Returns:
+        The script text, one shell statement per line.
+    """
+    import shlex
+    cd_run = f"cd {shlex.quote(workdir)} && {remote_cmd}"
+    if not ref:
+        return cd_run
+    # `seq 1 0` would produce no iterations and skip the wait entirely.
+    wait_s = max(int(staging_timeout_s), 1)
+    lines = [
+        'if [ -e /home/dev/.pytorch-staging ]; then',
+        '  echo "[gpu-dev] waiting for background pytorch --ref staging to finish…"',
+        f'  for _i in $(seq 1 {wait_s}); do [ -e /home/dev/.pytorch-staging ] || break; sleep 1; done',
+        # Marker still present after the loop => the wait timed out. Fail loudly
+        # rather than run the user command on a tree that is still being staged.
+        '  if [ -e /home/dev/.pytorch-staging ]; then',
+        f'    echo "[gpu-dev] pytorch --ref staging still running after {wait_s}s; aborting" >&2',
+        '    exit 91',
+        '  fi',
+        'fi',
+        'if [ -f /home/dev/.pytorch-ready ]; then',
+        '  git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true',
+    ]
+    if not no_build:
+        lines += [
+            '  echo "[gpu-dev] rebuilding torch to match --ref (pip install -e . --no-build-isolation)…"',
+            '  ( cd /home/dev/pytorch && pip install -e . --no-build-isolation ) || { echo "[gpu-dev] torch rebuild failed"; exit 90; }',
+        ]
+    lines += ['fi', cd_run]
+    return "\n".join(lines)
 @main.command(context_settings={"ignore_unknown_options": True})
 @click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
 @click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
@@ -1743,6 +1784,8 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
 @click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
               help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
 @click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
+@click.option("--no-build", is_flag=True,
+              help="With --ref, skip the incremental torch rebuild before the command (Python-only PRs / quick checks). Default: rebuild so `import torch` reflects the ref.")
 @click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
 @click.option("--name", type=str, default=None, help="Reservation name.")
 @click.option("--timeout", type=int, default=24 * 60, show_default=True,
@@ -1750,7 +1793,7 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
 @click.argument("command", nargs=-1, required=True)
 @click.pass_context
 def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dockerfile, dockerimage, preserve_entrypoint,
-           runtime, no_pull, keep_alive, name, timeout, command):
+           runtime, no_pull, no_build, keep_alive, name, timeout, command):
     """Submit a job: reserve, sync code, run, sync results back, auto-cancel.
     \b
@@ -1961,11 +2004,15 @@ def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dock
         else:
             workdir = "/home/dev"
-        # Run remote command via login shell so MULTINODE_* etc. are loaded
+        # Run remote command via login shell so MULTINODE_* etc. are loaded. With
+        # --ref, the script first waits for background pytorch staging + rebuilds
+        # so `import torch` matches the checked-out ref (see helper docstring).
         remote_cmd = " ".join(shlex.quote(c) for c in command)
         rprint(f"[cyan]🚀 Running on {ssh_alias}: {remote_cmd}[/cyan]\n")
-        ssh_run = ssh_base + [ssh_alias,
-                              f"cd {shlex.quote(workdir)} && bash -lc {shlex.quote(remote_cmd)}"]
+        if ref and not no_build:
+            rprint("[dim]   (--ref: will wait for staging + rebuild torch first; pass --no-build to skip)[/dim]")
+        remote_script = _build_submit_remote_script(workdir, remote_cmd, ref, no_build)
+        ssh_run = ssh_base + [ssh_alias, f"bash -lc {shlex.quote(remote_script)}"]
         rc = subprocess.call(ssh_run)
         rprint(f"\n[dim]Job exited with code {rc}[/dim]")

{gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py RENAMED Viewed

@@ -55,11 +55,23 @@ async def tunnel_ssh(target_host: str, target_port: int):
     # WebSocket URL - wss:// for secure WebSocket
     ws_url = f"wss://{proxy_host}/tunnel/{target_host}"
+    # Verify TLS against certifi's CA bundle. The default SSL context uses the OS
+    # trust store, which on macOS python.org builds is often empty
+    # ("unable to get local issuer certificate" / CERTIFICATE_VERIFY_FAILED).
+    # certifi ships the Mozilla bundle, so this works without the manual
+    # "Install Certificates.command" step.
+    ssl_ctx = ssl_module.create_default_context()
+    try:
+        import certifi
+        ssl_ctx.load_verify_locations(certifi.where())
+    except Exception:
+        pass  # fall back to the default trust store
     last_exc = None
     for attempt in range(MAX_RETRIES):
         try:
             async with websockets.connect(
-                ws_url, open_timeout=20,
+                ws_url, ssl=ssl_ctx, open_timeout=20,
                 ping_interval=30, ping_timeout=10,
             ) as websocket:
                 # Set up stdin/stdout for SSH

gpu_dev-0.7.12/docs/GPU_DEV_SUBMIT.md ADDED Viewed

@@ -0,0 +1,89 @@
+# `gpu-dev submit` — guide & footguns
+`gpu-dev submit` reserves a box, (optionally) rsyncs a local dir up, runs your
+command over SSH, syncs results back, and auto-cancels. It's the non-interactive
+sibling of `gpu-dev reserve` — good for CI-style validation, one-shot test runs,
+and scripted repros.
+```bash
+# run a script in a local dir on 1x H100, sync results back, auto-cancel
+gpu-dev submit --runtime ./ --gpu-type h100 -- bash run.sh
+# validate a PyTorch PR's tests on H100 (stages + builds the PR for you)
+gpu-dev submit --gpu-type h100 --no-persistent-disk --ref pr/186015 -- \
+    python test/test_foo.py -k some_test
+# keep the box after the job (debug a failure interactively)
+gpu-dev submit --keep-alive --gpu-type h100 -- pytest test/test_x.py
+```
+Exit code = your command's exit code (so it composes in scripts/CI). Note that with `--ref`, a failed torch rebuild exits 90 before your command runs (see footgun 2).
+---
+## Footguns (read before your first `--ref` run)
+### 1. `--ref` stages PyTorch in the background — `submit` now waits for it
+With `--ref`, the in-pod startup checks out your ref into `/home/dev/pytorch`
+**in the background** and only chowns the tree to `dev` + finishes the checkout
+at the very end. Historically `submit` could SSH in and run your command before
+that finished, so you'd hit:
+- a **root-owned** `/home/dev/pytorch` (git: *"detected dubious ownership"*), and
+- a **source/installed-torch mismatch** → `import torch` fails (the ref source is
+  checked out but the importable `.so` is still the stale prebuilt base).
+`submit` now **waits (up to one hour) for staging to complete**, marks the tree
+a git `safe.directory`, and (by default) **rebuilds incrementally** so the
+installed torch matches the checked-out ref before your command runs. You don't
+need the `sudo chown` / `safe.directory` workaround anymore.
+### 2. `--ref` rebuilds torch by default — use `--no-build` to skip
+The dropped-in `build/` + `.so` come from the **base** tree, not your ref. To make
+`import torch` reflect your ref's compiled (C++/CUDA) changes, `submit --ref`
+runs `pip install -e . --no-build-isolation` (incremental, warm `build/` →
+typically tens of seconds; a cold/cross-arch build is much longer).
+- Pass **`--no-build`** for Python-only PRs or quick checks — skips the rebuild
+  (import still works; it just won't include compiled changes).
+- A rebuild failure exits **90** *before* your command runs (so a broken build
+  doesn't masquerade as a test failure).
+### 3. Prebuilt fast path is **prod-arch only** (H100 / B200)
+The by-SHA / viable-strict prebuilt trees are compiled for `sm_90;sm_100`
+(H100/B200). On other GPU types (t4, a100, l4, …) or staging there's no matching
+prebuilt, so `--ref` falls back to a **full from-scratch build** — slow. Validate
+ref-based jobs on `--gpu-type h100` (or `b200`).
+### 4. `--ref` is ignored with `--disk`
+A persistent disk brings its own `/home/dev/pytorch`; `--ref` does **not** stage
+onto a `--disk` reservation (and `submit` won't rebuild it). Use
+`--no-persistent-disk` (or omit `--disk`) when you want a ref staged.
+### 5. `--preserve-entrypoint` needs SSH
+`submit` runs your command over SSH, so a custom image with
+`--preserve-entrypoint` must still expose the SSH harness or `submit` can't reach
+it. For pure entrypoint containers, use `reserve`, not `submit`.
+### 6. Results sync-back is best-effort
+With `--runtime`, output is rsync'd back to your local dir when the job exits
+(unless `--no-pull`). If the box dies mid-job (spot reclaim, expiry) the sync-back
+may be partial — you'll see a warning. For long jobs prefer `--keep-alive` and
+pull manually, or write important artifacts to `/shared-personal` (persists
+across reservations).
+### 7. `--hours` is a ceiling, not the runtime
+It's the reservation lifetime cap; the job auto-cancels as soon as your command
+exits (unless `--keep-alive`). Set it high enough that queueing + build + run fit.
+---
+## Finding footguns early
+- `gpu-dev submit --keep-alive … -- true` then `gpu-dev connect <id>` — get a
+  box in the exact submit state and poke around before committing a real run.
+- With `--ref`, watch staging directly: `tail -f /home/dev/.pytorch-staging.log`
+  in the pod; `.pytorch-ready` (HEAD sha) is written when staging is done.
+- `python -c "import torch; print(torch.__file__, torch.version.git_version)"`
+  confirms which torch you're actually importing vs. the ref you asked for.
+Found a new one? Add it here and ping `oncall:pytorch_release_engineering`.

{gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.7.10
+Version: 0.7.12
 Summary: CLI + Python SDK for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/SOURCES.txt RENAMED Viewed

@@ -24,6 +24,7 @@ cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py
 cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py
 cli-tools/scripts/clear_stale_disk_locks.py
 docs/FAST_REPRO_DESIGN.md
+docs/GPU_DEV_SUBMIT.md
 docs/SDK_REPRO.md
 docs/USER_GUIDE.md
 docs/devgpu-features.html
@@ -212,6 +213,7 @@ tests/unit/lambda_fn/__init__.py
 tests/unit/lambda_fn/test_availability.py
 tests/unit/lambda_fn/test_cancellation.py
 tests/unit/lambda_fn/test_claim.py
+tests/unit/lambda_fn/test_finalize_no_ssh.py
 tests/unit/lambda_fn/test_mig_gpu_config.py
 tests/unit/lambda_fn/test_pod_resources.py
 tests/unit/lambda_fn/test_ref_staging.py

{gpu_dev-0.7.10 → gpu_dev-0.7.12}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gpu-dev"
-version = "0.7.10"
+version = "0.7.12"
 description = "CLI + Python SDK for PyTorch GPU developer server reservations"
 authors = [{name = "PyTorch Team"}]
 readme = "cli-tools/gpu-dev-cli/README.md"

{gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/Dockerfile RENAMED Viewed

@@ -148,12 +148,13 @@ COPY ssh_config /etc/ssh/sshd_config
 # Bump CLAUDE_CODE_BUILD to bust the layer cache and re-fetch the latest Claude Code
 # (the installer always grabs latest; without a bump Docker reuses the cached layer).
 USER root
-ARG CLAUDE_CODE_BUILD=2026-05-29
+ARG CLAUDE_CODE_BUILD=2026-06-09
 RUN echo "Claude Code build marker: $CLAUDE_CODE_BUILD" && \
     curl -fsSL https://claude.ai/install.sh | HOME=/opt/claude bash || echo "Claude native install failed (non-fatal at build time)"
 RUN if [ -e /opt/claude/.local/bin/claude ]; then \
         ln -sf /opt/claude/.local/bin/claude /usr/local/bin/claude; \
         chmod -R a+rX /opt/claude; \
+        echo "Installed Claude Code (native): $(/usr/local/bin/claude --version 2>/dev/null || echo unknown)"; \
     fi
 # Set up npm global directory for dev user (kept for ad-hoc dev-installed CLIs).
@@ -176,7 +177,7 @@ RUN npm install -g --prefix /usr/local @openai/codex || echo "Codex CLI install
 # leaves /usr/local/bin/codex as a SYMLINK to that codex.js, so we must `rm -f` it first —
 # writing through the symlink would clobber codex.js itself, making the wrapper exec itself
 # (infinite recursion -> codex hangs on launch).
-RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IHdpcmVkIHRvIEdQVC01LjUgb24gQVdTIEJlZHJvY2sgKHVzLWVhc3QtMiBtYW50bGUgZW5kcG9pbnQpLgojIEF1dGggdmlhIHRoZSBwb2QgSVJTQSAtPiBzaG9ydC1saXZlZCAofjEyaCkgQmVkcm9jayBiZWFyZXIgdG9rZW47IG5vIHBlci11c2VyIGtleS4KIyBSZWFzb25pbmcgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgZW52IChkZWZhdWx0IGhpZ2gpLiBUaGUgY29uZmlnIGlzIChyZSl3cml0dGVuIG9uCiMgZXZlcnkgbGF1bmNoOiAvaG9tZS9kZXYgaXMgZXBoZW1lcmFsLCBhbmQgdGhpcyBhbHNvIHNlbGYtaGVhbHMgYSAvbW9kZWwgbWlzaGFwCiMgKHRoZSBwaWNrZXIgY2FuIGNvcnJ1cHQgdGhlIG1vZGVsIGlkOyByZXN0YXJ0aW5nIGNvZGV4IHJlc2V0cyBpdCkuCnNldCArZQpSRUFMPS91c3IvbG9jYWwvbGliL25vZGVfbW9kdWxlcy9Ab3BlbmFpL2NvZGV4L2Jpbi9jb2RleC5qcwpFRkZPUlQ9IiR7Q09ERVhfRUZGT1JUOi1oaWdofSIKbWtkaXIgLXAgIiRIT01FLy5jb2RleCIgMj4vZGV2L251bGwKY2F0ID4gIiRIT01FLy5jb2RleC9jb25maWcudG9tbCIgPDxDRkcKbW9kZWwgPSAib3BlbmFpLmdwdC01LjUiCm1vZGVsX3Byb3ZpZGVyID0gImJlZHJvY2siCndlYl9zZWFyY2ggPSAiZGlzYWJsZWQiCm1vZGVsX2NvbnRleHRfd2luZG93ID0gMjcyMDAwCm1vZGVsX21heF9vdXRwdXRfdG9rZW5zID0gMTI4MDAwCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKClttb2RlbF9wcm92aWRlcnMuYmVkcm9ja10KbmFtZSA9ICJBV1MgQmVkcm9jayAoR1BULTUuNSkiCmJhc2VfdXJsID0gImh0dHBzOi8vYmVkcm9jay1tYW50bGUudXMtZWFzdC0yLmFwaS5hd3Mvb3BlbmFpL3YxIgplbnZfa2V5ID0gIk9QRU5BSV9BUElfS0VZIgp3aXJlX2FwaSA9ICJyZXNwb25zZXMiCkNGRwpUT0s9IiQocHl0aG9uMyAtYyAnZnJvbSBhd3NfYmVkcm9ja190b2tlbl9nZW5lcmF0b3IgaW1wb3J0IHByb3ZpZGVfdG9rZW47IHByaW50KHByb3ZpZGVfdG9rZW4ocmVnaW9uPSJ1cy1lYXN0LTIiKSknIDI+L2Rldi9udWxsKSIKWyAtbiAiJFRPSyIgXSAmJiBleHBvcnQgT1BFTkFJX0FQSV9LRVk9IiRUT0siCmV4ZWMgIiRSRUFMIiAiJEAiCg==' | base64 -d > /usr/local/bin/codex && chmod 0755 /usr/local/bin/codex
+RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IHdpcmVkIHRvIEdQVC01LjUgb24gQVdTIEJlZHJvY2sgKHVzLWVhc3QtMiBtYW50bGUgZW5kcG9pbnQpLgojIEF1dGggdmlhIHRoZSBwb2QgSVJTQSAtPiBzaG9ydC1saXZlZCAofjEyaCkgQmVkcm9jayBiZWFyZXIgdG9rZW47IG5vIHBlci11c2VyIGtleS4KIyBSZWFzb25pbmcgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgZW52IChkZWZhdWx0IGhpZ2gpLiBUaGUgY29uZmlnIGlzIChyZSl3cml0dGVuIG9uCiMgZXZlcnkgbGF1bmNoOiAvaG9tZS9kZXYgaXMgZXBoZW1lcmFsLCBhbmQgdGhpcyBhbHNvIHNlbGYtaGVhbHMgYSAvbW9kZWwgbWlzaGFwCiMgKHRoZSBwaWNrZXIgY2FuIGNvcnJ1cHQgdGhlIG1vZGVsIGlkOyByZXN0YXJ0aW5nIGNvZGV4IHJlc2V0cyBpdCkuCnNldCArZQpSRUFMPS91c3IvbG9jYWwvbGliL25vZGVfbW9kdWxlcy9Ab3BlbmFpL2NvZGV4L2Jpbi9jb2RleC5qcwpFRkZPUlQ9IiR7Q09ERVhfRUZGT1JUOi1oaWdofSIKbWtkaXIgLXAgIiRIT01FLy5jb2RleCIgMj4vZGV2L251bGwKY2F0ID4gIiRIT01FLy5jb2RleC9jb25maWcudG9tbCIgPDxDRkcKbW9kZWwgPSAib3BlbmFpLmdwdC01LjUiCm1vZGVsX3Byb3ZpZGVyID0gImJlZHJvY2siCndlYl9zZWFyY2ggPSAiZGlzYWJsZWQiCm1vZGVsX2NvbnRleHRfd2luZG93ID0gMjcyMDAwCm1vZGVsX21heF9vdXRwdXRfdG9rZW5zID0gMTI4MDAwCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKClttb2RlbF9wcm92aWRlcnMuYmVkcm9ja10KbmFtZSA9ICJBV1MgQmVkcm9jayAoR1BULTUuNSkiCmJhc2VfdXJsID0gImh0dHBzOi8vYmVkcm9jay1tYW50bGUudXMtZWFzdC0yLmFwaS5hd3Mvb3BlbmFpL3YxIgplbnZfa2V5ID0gIk9QRU5BSV9BUElfS0VZIgp3aXJlX2FwaSA9ICJyZXNwb25zZXMiCkNGRwpUT0s9IiQoL3Vzci9iaW4vcHl0aG9uMyAtYyAnZnJvbSBhd3NfYmVkcm9ja190b2tlbl9nZW5lcmF0b3IgaW1wb3J0IHByb3ZpZGVfdG9rZW47IHByaW50KHByb3ZpZGVfdG9rZW4ocmVnaW9uPSJ1cy1lYXN0LTIiKSknIDI+L2Rldi9udWxsKSIKWyAtbiAiJFRPSyIgXSAmJiBleHBvcnQgT1BFTkFJX0FQSV9LRVk9IiRUT0siCmV4ZWMgIiRSRUFMIiAiJEAiCg==' | base64 -d > /usr/local/bin/codex && chmod 0755 /usr/local/bin/codex
 USER dev

{gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_processor/index.py RENAMED Viewed

@@ -3832,40 +3832,73 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
             f"MAIN FLOW: Pod is ready, checking SSH daemon status from logs for {reservation_id}"
         )
+        # Let the user know we're past pod creation and waiting on the service.
+        # On persistent-disk reservations the entrypoint restores the disk before
+        # sshd binds, so this can legitimately take a few minutes.
+        update_reservation_status(
+            reservation_id,
+            "preparing",
+            "Container running — restoring your environment and starting SSH…"
+            if use_persistent_disk
+            else "Container running — starting SSH service…",
+        )
         record_trace_event(trace_data, "ssh_ready_check_start")
         ssh_ready = False
         try:
             v1 = client.CoreV1Api(k8s_client)
-            # Poll for SSH daemon: 100ms for first 8s, then backoff to 5s
-            # Default image starts SSH in ~2-5s, so rapid polling catches it instantly
-            # Custom images may take longer, backoff keeps API load reasonable
-            max_attempts = 60
+            # Poll pod logs for the sshd-ready marker. Fast (100ms) for the first
+            # 8s to catch the common fast path instantly, then back off to 5s.
+            # Slow-disk startups restore the disk *before* sshd binds, so allow up
+            # to ~150s. If the marker never appears we finalize anyway below —
+            # routing is already in place and the SSH proxy retries until sshd binds.
+            deadline = time.time() + 150.0
             elapsed = 0.0
-            for attempt in range(max_attempts):
+            attempt = 0
+            logs = ""
+            while time.time() < deadline:
                 logs = v1.read_namespaced_pod_log(
-                    name=pod_name, namespace="gpu-dev", container="gpu-dev", tail_lines=100
+                    name=pod_name, namespace="gpu-dev", container="gpu-dev", tail_lines=200
                 )
                 if "SSH daemon starting on port 22" in logs or "Server listening on" in logs:
                     logger.info(
                         f"SSH daemon confirmed running in pod logs for {pod_name} (attempt {attempt + 1}, {elapsed:.1f}s elapsed)")
                     ssh_ready = True
                     break
-                else:
-                    if attempt < max_attempts - 1:
-                        delay = 0.1 if elapsed < 8.0 else min(1.0 + (elapsed - 8.0) * 0.3, 5.0)
-                        time.sleep(delay)
-                        elapsed += delay
-                    else:
-                        logger.warning(
-                            f"SSH daemon not detected after {max_attempts} attempts, logs preview: {logs[-200:]}")
+                delay = 0.1 if elapsed < 8.0 else min(1.0 + (elapsed - 8.0) * 0.3, 5.0)
+                time.sleep(delay)
+                elapsed += delay
+                attempt += 1
+            if not ssh_ready:
+                logger.warning(
+                    f"SSH daemon marker not seen for {pod_name} after {elapsed:.1f}s, logs preview: {logs[-200:]}")
         except Exception as e:
             logger.warning(f"Could not check SSH daemon logs: {e}")
             # Assume ready if pod is running (NLB will handle routing)
             ssh_ready = True
         record_trace_event(trace_data, "ssh_ready_check_end")
+        # If the sshd marker never showed, don't orphan the reservation in
+        # 'preparing'. Only a genuinely broken pod should fail here; otherwise the
+        # pod is just slow to bind sshd (disk restore) — routing is already stored,
+        # so we finalize anyway and let the SSH proxy retry until sshd is up.
+        if not ssh_ready:
+            logger.warning(
+                f"MAIN FLOW: SSH daemon not confirmed for reservation {reservation_id}, checking pod status for errors")
+            pod_info = update_pod_status_and_events(k8s_client, pod_name, reservation_id)
+            if not should_finalize_without_ssh_marker(pod_info):
+                update_reservation_status(
+                    reservation_id,
+                    "failed",
+                    f"Pod failed to start properly: {pod_info['display_message']}",
+                )
+                raise RuntimeError(f"Pod failed: {pod_info['display_message']}")
+            logger.warning(
+                f"SSH daemon not confirmed for {pod_name}, but pod is healthy — "
+                f"finalizing connection anyway (SSH proxy retries until sshd binds)")
+            ssh_ready = True
         if ssh_ready:
             # Update status: Finalizing connection
             update_reservation_status(
@@ -3985,28 +4018,6 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
                     f"Failed to trigger availability update: {update_error}")
                 # Don't fail the reservation for this
-        else:
-            logger.warning(
-                f"MAIN FLOW: SSH connectivity test FAILED for reservation {reservation_id}, checking pod status for errors")
-            # Check pod status using our consolidated monitoring function
-            pod_info = update_pod_status_and_events(
-                k8s_client, pod_name, reservation_id)
-            if pod_info["has_errors"]:
-                update_reservation_status(
-                    reservation_id,
-                    "failed",
-                    f"Pod failed to start properly: {pod_info['display_message']}",
-                )
-                raise RuntimeError(
-                    f"Pod failed: {pod_info['display_message']}")
-            else:
-                # Pod is running but SSH not ready yet - keep as preparing
-                # Status message already updated by update_pod_status_and_events
-                pass
-                logger.warning(
-                    f"SSH not ready yet for {pod_name}, keeping reservation in preparing state"
-                )
         # GPU allocation handled automatically by K8s scheduler
         # Store trace data in DynamoDB if tracing is enabled
@@ -4057,6 +4068,18 @@ def delete_sqs_message(record: dict[str, Any]) -> None:
         logger.error(f"Error deleting SQS message: {str(e)}")
+def should_finalize_without_ssh_marker(pod_info: dict) -> bool:
+    """Return True when a reservation should be finalized despite no sshd marker.
+    Used after the readiness poll ended without seeing the sshd-ready log line.
+    Routing (the domain mapping) is stored before that poll, so a pod that is
+    merely slow to bind sshd (e.g. a persistent-disk restore running first) can
+    still be finalized: the CLI's SSH proxy keeps retrying until sshd is up, and
+    the reservation is not orphaned in 'preparing'. Only a pod that reports
+    errors should fail; a missing `has_errors` key counts as "no errors".
+    """
+    pod_reports_errors = pod_info.get("has_errors", False)
+    return not pod_reports_errors
 def update_reservation_status(reservation_id: str, status: str, detailed_status: str = None, failure_reason: str = None) -> None:
     """
     Update reservation status with unified status tracking.
@@ -6328,7 +6351,7 @@ EOF
                             # Only start Jupyter if enabled at creation time
                             if [ "$JUPYTER_ENABLED" = "true" ]; then
                                 echo "[STARTUP] Starting Jupyter Lab in background..."
-                                nohup su - dev -c "cd /workspace && /opt/conda/bin/jupyter-lab --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
+                                nohup su - dev -c "cd /workspace && $(command -v jupyter-lab || echo /usr/local/bin/jupyter-lab) --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
                                 echo "[STARTUP] Jupyter Lab started (check /tmp/jupyter.log for details)"
                             else
                                 echo "[STARTUP] Jupyter Lab configured but not started (use 'gpu-dev edit --enable-jupyter' to enable)"
@@ -8487,7 +8510,7 @@ def update_pod_status_and_events(k8s_client, pod_name: str, reservation_id: str)
             if pod_phase == "Pending":
                 display_message = "⏳ Pod pending"
             elif pod_phase == "Running":
-                display_message = "🚀 Container running"
+                display_message = "🚀 Container running — starting SSH service…"
             else:
                 display_message = f"Pod phase: {pod_phase}"
@@ -9296,7 +9319,7 @@ def enable_jupyter_in_pod(
             # Start Jupyter as dev user in background (config already exists)
             echo "Starting Jupyter Lab with existing config..."
-            nohup su - dev -c "cd /workspace && /opt/conda/bin/jupyter-lab --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
+            nohup su - dev -c "cd /workspace && $(command -v jupyter-lab || echo /usr/local/bin/jupyter-lab) --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
             # Wait for startup
             sleep 3

{gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_submit.py RENAMED Viewed

@@ -19,12 +19,58 @@ from unittest.mock import MagicMock, patch
 import pytest
-from gpu_dev_cli.cli import main
+from gpu_dev_cli.cli import main, _build_submit_remote_script
 USER_INFO = {"user_id": "u-123", "github_user": "octocat"}
+# ---------------------------------------------------------------------------
+# _build_submit_remote_script — the --ref staging-gate + rebuild preamble
+# (regression for Driss's footguns: root-owned tree + source/installed mismatch)
+# ---------------------------------------------------------------------------
+def test_remote_script_no_ref_is_plain_cd_run():
+    """Without a ref the script is only `cd <workdir> && <cmd>`, no preamble."""
+    script = _build_submit_remote_script("/workspace/x", "python a.py", ref=None, no_build=False)
+    assert script == "cd /workspace/x && python a.py"
+    # Neither the staging gate nor the rebuild step may leak into the plain form.
+    for preamble_fragment in ("pytorch-staging", "no-build-isolation"):
+        assert preamble_fragment not in script
+def test_remote_script_with_ref_waits_and_rebuilds():
+    """With --ref (and no --no-build) the preamble gates on staging and rebuilds."""
+    script = _build_submit_remote_script("/home/dev", "pytest q.py", ref="pr/123", no_build=False)
+    required_fragments = (
+        # waits for the background staging marker
+        "/home/dev/.pytorch-staging",
+        # only acts once staging actually completed
+        "/home/dev/.pytorch-ready",
+        # marks safe.directory for the dev user (fixes git "dubious ownership")
+        "safe.directory /home/dev/pytorch",
+        # rebuilds so installed torch matches the checked-out ref
+        "pip install -e . --no-build-isolation",
+    )
+    for fragment in required_fragments:
+        assert fragment in script
+    # user command still runs last, in the workdir
+    assert script.rstrip().endswith("cd /home/dev && pytest q.py")
+def test_remote_script_ref_no_build_skips_rebuild():
+    """--no-build keeps the staging wait and safe.directory but drops the rebuild."""
+    script = _build_submit_remote_script("/home/dev", "pytest q.py", ref="pr/123", no_build=True)
+    # still waits for staging
+    assert "/home/dev/.pytorch-staging" in script
+    # still fixes ownership
+    assert "safe.directory /home/dev/pytorch" in script
+    # but no rebuild
+    assert "no-build-isolation" not in script
+    assert script.rstrip().endswith("cd /home/dev && pytest q.py")
+def test_remote_script_quotes_workdir():
+    """A workdir containing a space must come out shell-quoted."""
+    script = _build_submit_remote_script("/work space/x", "echo hi", ref=None, no_build=False)
+    quoted_workdir = "'/work space/x'"
+    assert quoted_workdir in script
+def test_no_build_flag_threaded_and_defaults_false(cli_runner):
+    """`--no-build` is accepted next to `--ref` and a reservation is still requested."""
+    # Only flag parsing is checked here: the reservation returns None, so the
+    # command exits with code 2 before any remote script would be run.
+    argv = ["--ref", "pr/1", "--no-build", "--", "x"]
+    result, reservation_mgr = _run(cli_runner, argv)
+    assert result.exit_code == 2
+    reservation_mgr.create_reservation.assert_called_once()
 # ---------------------------------------------------------------------------
 # patch harness
 # ---------------------------------------------------------------------------

gpu_dev-0.7.12/tests/unit/lambda_fn/test_finalize_no_ssh.py ADDED Viewed

@@ -0,0 +1,24 @@
+"""Unit tests for the slow-sshd finalize decision.
+Regression for the orphaned-`preparing` bug: a persistent-disk reservation
+restores its disk *before* sshd binds, so the readiness poll's log marker never
+shows within the window. The main flow used to leave such reservations stuck in
+`preparing` forever. It now finalizes anyway (routing is already stored, the SSH
+proxy retries) and only fails when the pod itself reports errors.
+"""
+def test_finalize_when_pod_healthy_but_no_ssh_marker(lambda_index):
+    """A running, error-free pod is finalized even though the sshd marker never showed."""
+    healthy_pod = {
+        "has_errors": False,
+        "display_message": "🚀 Container running — starting SSH service…",
+    }
+    decision = lambda_index.should_finalize_without_ssh_marker(healthy_pod)
+    assert decision is True
+def test_do_not_finalize_when_pod_has_errors(lambda_index):
+    """A pod that reports errors must not be finalized."""
+    broken_pod = {"has_errors": True, "display_message": "❌ ImagePullBackOff"}
+    decision = lambda_index.should_finalize_without_ssh_marker(broken_pod)
+    assert decision is False
+def test_missing_has_errors_key_defaults_to_finalize(lambda_index):
+    """A pod_info dict without `has_errors` is treated as healthy."""
+    # Defensive: a partial pod_info dict shouldn't strand the reservation.
+    partial_pod_info = {}
+    assert lambda_index.should_finalize_without_ssh_marker(partial_pod_info) is True