PyPI - gpu-dev - Versions diffs - 0.7.12__tar.gz → 0.7.14__tar.gz - Mend

gpu-dev 0.7.12__tar.gz → 0.7.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (235) hide show

{gpu_dev-0.7.12 → gpu_dev-0.7.14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.7.12
+Version: 0.7.14
 Summary: CLI + Python SDK for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py RENAMED Viewed

@@ -1521,19 +1521,27 @@ def reserve(
 @main.command(context_settings={"ignore_unknown_options": True})
-@click.argument("ref")
-@click.argument("test_args", nargs=-1, required=True)
-@click.option("--gpu-type", default="b200", show_default=True, help="GPU type for the repro box.")
+@click.argument("ref", required=False)
+@click.argument("test_args", nargs=-1, required=False)
+@click.option("--lint", is_flag=True, default=False,
+              help="Run a PyTorch lint job (lintrunner) on a CPU box instead of a python test — "
+                   "mirrors CI's lint (.github/scripts/lintrunner.sh): regenerates version/type "
+                   "stubs then runs the python/general linters. Defaults to --gpu-type cpu-x86, "
+                   "no torch build. PR ref lints its diff; main lints all files; extra args override scope.")
+@click.option("--clang", is_flag=True, default=False,
+              help="With --lint, also run the C++ linters (CLANGTIDY/CLANGFORMAT). CI runs these in a "
+                   "separate job — they generate clang build files and are heavy on a full tree.")
+@click.option("--gpu-type", default=None, help="GPU type for the repro box (default: b200; cpu-x86 with --lint).")
 @click.option("--gpus", type=int, default=1, show_default=True)
 @click.option("--hours", type=float, default=3.0, show_default=True,
               help="Lifetime ceiling for the box.")
 @click.option("--no-connect", is_flag=True, default=False,
-              help="CI mode: run the test, auto-cancel, exit code = test result. Default (on a TTY) drops you into the box to iterate.")
+              help="CI mode: run the test/lint, auto-cancel, exit code = result. Default (on a TTY) drops you into the box to iterate.")
 @click.option("--keep", is_flag=True, default=False,
               help="Never cancel the box (skip the cancel prompt / auto-cancel).")
 @click.pass_context
-def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
-    """Reserve a GPU, check out a PR/commit, run a test, then drop you into the box.
+def repro(ctx, ref, test_args, lint, clang, gpu_type, gpus, hours, no_connect, keep):
+    """Reserve a box, check out a PR/commit, run a test (or lint), then drop you in.
     By default (in a terminal) repro runs the test and then **connects you into the
     box** at ~/pytorch — the ref is checked out, so you can fix and re-run. The box
@@ -1546,10 +1554,32 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
     TEST_ARGS are passed straight to `python` inside ~/pytorch, e.g.
       gpu-dev repro pr/185264 test/inductor/test_flex_attention.py TestFlexAttentionCUDA.test_large_kv_int64_pointer_math_cuda
+    --lint runs lintrunner on a CPU box instead (no GPU, no torch build), mirroring
+    CI's lint (regenerate version/type stubs, then the python/general linters), e.g.
+      gpu-dev repro --lint                          # lint main (all files)
+      gpu-dev repro --lint pr/185264                # lint the PR diff (CI-equivalent)
+      gpu-dev repro --lint pr/185264 --all-files    # lint everything
+      gpu-dev repro --lint --clang pr/185264        # also run C++ clang-tidy/format
+    The box stays up after the run: on a TTY you're dropped in and prompted to
+    cancel on exit (use --keep to leave it running; --no-connect auto-cancels).
     """
     import shlex
     import subprocess
     import sys
+    if not ref:
+        if not lint:
+            rprint("[red]❌ Provide a REF (pr/N, branch, or commit) — or use --lint to lint main.[/red]")
+            sys.exit(2)
+        ref = "main"  # bare `repro --lint` lints current main
+    if not lint and not test_args:
+        rprint("[red]❌ Provide a test, e.g. gpu-dev repro pr/123 test/foo.py — or pass --lint for a lint job.[/red]")
+        sys.exit(2)
+    gpu_type = (gpu_type or ("cpu-x86" if lint else "b200")).lower()
+    if gpu_type.startswith("cpu"):
+        gpus = 0  # CPU reservations must have gpu_count=0
     config = load_config()
     reservation_mgr = ReservationManager(config)
     try:
@@ -1637,9 +1667,58 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
         f"PYTHONPATH=/home/dev/pytorch python {testcmd}"
     )
+    runlabel, rerun_hint = "test", f"python {testcmd}"
+    if lint:
+        # Mirror pytorch CI's lint (.github/scripts/lintrunner.sh): regenerate version +
+        # type stubs (so mypy/pyrefly are accurate), then run the python/general linters.
+        # CLANGTIDY/CLANGFORMAT are a separate CI job (need generated build files, very
+        # heavy on a full tree) -> opt-in via --clang. No torch build. Source-only tree
+        # (cloned if a CPU pod doesn't have one). Scope mirrors CI: a PR lints its diff
+        # (merge-base), main lints all files; extra args override the scope.
+        if test_args:
+            scope = " ".join(test_args)
+        elif prnum:
+            scope = "--merge-base-with origin/main"
+        else:
+            scope = "--all-files"
+        runlabel = "lint"
+        rerun_hint = f"lintrunner --skip CLANGTIDY,CLANGTIDY_EXECUTORCH_COMPATIBILITY,CLANGFORMAT {scope}"
+        clang_block = (
+            "echo '[lint] === C++ linters (CLANGTIDY/CLANGFORMAT) — generating clang build files (heavy)… ==='; "
+            "python -m tools.linter.clang_tidy.generate_build_files 2>/dev/null || true; "
+            f"lintrunner --force-color --take CLANGTIDY,CLANGFORMAT {scope}; rr=$?; [ $rr -ne 0 ] && RC=$rr; "
+        ) if clang else (
+            "echo '[lint] C++ linters (CLANGTIDY/CLANGFORMAT) skipped — add --clang to run them'; "
+        )
+        remote = (
+            "set +e; "
+            "git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true; "
+            "if [ ! -d /home/dev/pytorch/.git ]; then echo '[lint] no pytorch tree on this pod — cloning (partial)…'; "
+            "rm -rf /home/dev/pytorch; git clone --filter=blob:none https://github.com/pytorch/pytorch.git /home/dev/pytorch; fi; "
+            "cd /home/dev/pytorch; "
+            + resolve +
+            "echo \"[lint] target ${WANT:-?}\"; "
+            "git fetch origin main 2>/dev/null || true; "
+            "echo \"[lint] checking out $FREF\"; " + checkout + "; "
+            "echo \"[lint] HEAD $(git rev-parse --short HEAD)\"; "
+            "command -v lintrunner >/dev/null 2>&1 || pip install --break-system-packages -q lintrunner; "
+            # CI codegen so mypy/pyrefly see generated files (version.py + type stubs)
+            "echo '[lint] regenerating version + type stubs (CI parity)…'; "
+            "python -m tools.generate_torch_version --is-debug=false 2>/dev/null || true; "
+            "python -m tools.pyi.gen_pyi --native-functions-path aten/src/ATen/native/native_functions.yaml "
+            "--tags-path aten/src/ATen/native/tags.yaml --deprecated-functions-path tools/autograd/deprecated.yaml 2>/dev/null || true; "
+            "python torch/utils/data/datapipes/gen_pyi.py 2>/dev/null || true; "
+            "echo '[lint] lintrunner init…'; lintrunner init; RC=0; "
+            f"echo '[lint] === python/general linters: lintrunner {scope} ==='; "
+            f"lintrunner --force-color --skip CLANGTIDY,CLANGTIDY_EXECUTORCH_COMPATIBILITY,CLANGFORMAT {scope}; rr=$?; [ $rr -ne 0 ] && RC=$rr; "
+            + clang_block +
+            "exit $RC"
+        )
     # Reserve — warm claim (instant) first, else cold ephemeral. Always no-persist
     # (so the prebuilt tree is staged; a default disk would skip staging).
-    rprint(f"[cyan]🔬 repro: reserving {gpus}x {gpu_type} (warm if available)…[/cyan]")
+    desc = f"{gpus}x {gpu_type}" if gpus else gpu_type
+    rprint(f"[cyan]🔬 repro: reserving {desc} (warm if available)…[/cyan]")
     rid = ssh_cmd = None
     try:
         res = reservation_mgr.claim_direct(
@@ -1675,14 +1754,14 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
     except KeyboardInterrupt:
         rprint("\n[yellow]interrupted[/yellow]"); rc = 130
-    verdict = "[green]✓ test passed[/green]" if rc == 0 else f"[red]✗ test failed (exit {rc})[/red]"
+    verdict = f"[green]✓ {runlabel} passed[/green]" if rc == 0 else f"[red]✗ {runlabel} failed (exit {rc})[/red]"
     # Default (TTY): drop into the box so you can fix and re-run. --no-connect is the
     # CI path: auto-cancel and exit with the test's code.
     connect = (not no_connect) and sys.stdout.isatty()
     if connect:
         rprint(f"\n{verdict} — dropping you into the box at ~/pytorch ({ref} checked out).")
-        rprint(f"[dim]  re-run:  python {testcmd}[/dim]")
+        rprint(f"[dim]  re-run:  {rerun_hint}[/dim]")
         rprint(f"[dim]  finish:  gpu-dev cancel  (from inside)  •  or exit this shell[/dim]\n")
         shell_cmd = f"{ssh_cmd} -t {shlex.quote('cd /home/dev/pytorch 2>/dev/null; exec ${SHELL:-bash} -l')}"
         try:
@@ -3232,6 +3311,172 @@ def show(ctx: click.Context, reservation_id: Optional[str]) -> None:
         rprint(f"[red]❌ Error: {str(e)}[/red]")
+def _print_recovery_hints(connection_info: dict) -> None:
+    """Tell the user how to unblock/recover their own reservation based on status."""
+    status = (connection_info.get("status") or "").lower()
+    disk_name = connection_info.get("disk_name") or ""
+    res_id = connection_info.get("reservation_id", "") or ""
+    short_id = res_id[:8] if res_id else "<id>"
+    hints = []
+    if status in ("failed", "expired", "cancelled"):
+        if disk_name:
+            hints.append(
+                f"Your data on disk '{disk_name}' is preserved — re-reserve with: "
+                f"gpu-dev reserve --disk {disk_name}")
+            hints.append(f"If that disk is stuck locked: gpu-dev disk unlock {disk_name}")
+        else:
+            hints.append("Re-reserve a new box with: gpu-dev reserve")
+    elif status == "active":
+        hints.append(
+            f"If status is 'active' but you can't SSH, the pod likely died (e.g. OOM). "
+            f"Free it (and your disk) with: gpu-dev cancel {short_id}  — then re-reserve.")
+        if disk_name:
+            hints.append(f"If the disk stays locked after cancel: gpu-dev disk unlock {disk_name}")
+    if hints:
+        rprint("\n[bold]Recovery:[/bold]")
+        for h in hints:
+            rprint(f"  • {h}")
+def _show_diagnostics(connection_info: dict) -> None:
+    """Render the extra diagnostics `gpu-dev debug` adds on top of the status panel:
+    failure reason, OOM events, the full status-history timeline, captured pod logs,
+    and recovery hints. All sourced from data the lambdas write to DynamoDB, so it
+    needs no cluster/lambda access."""
+    from rich.text import Text
+    status = (connection_info.get("status") or "").lower()
+    # Failure reason / latest detailed status — shown for ANY status (the normal
+    # `show` only surfaces failure_reason on 'failed'; for an active-but-dead pod
+    # this is exactly what the user needs).
+    failure_reason = (connection_info.get("failure_reason") or "").strip()
+    detailed = (connection_info.get("current_detailed_status") or "").strip()
+    if failure_reason:
+        rprint(f"\n[bold red]Why it ended:[/bold red] {failure_reason}")
+    elif detailed and status != "active":
+        rprint(f"\n[bold]Latest status:[/bold] {detailed}")
+    # OOM events
+    oom_count = int(connection_info.get("oom_count", 0) or 0)
+    if oom_count > 0:
+        last = connection_info.get("last_oom_at") or "unknown"
+        cont = connection_info.get("oom_container") or "?"
+        rprint(f"[red]⚠️  OOM:[/red] {oom_count} event(s) — last {last} (container: {cont})")
+    # Status-history timeline (the gold for "what happened to my reservation")
+    history = connection_info.get("status_history") or []
+    if history:
+        table = Table(title="Status timeline (most recent last)", show_header=True,
+                      header_style="bold", box=None, pad_edge=False)
+        table.add_column("Time", style="dim", no_wrap=True)
+        table.add_column("Event")
+        for entry in history[-40:]:
+            if isinstance(entry, dict):
+                table.add_row(str(entry.get("timestamp", "")), str(entry.get("message", "")))
+        console.print("")
+        console.print(table)
+    else:
+        rprint("\n[dim]No status history recorded for this reservation.[/dim]")
+    # Captured pod logs (lambda snapshot — last lines around the failure)
+    pod_logs = (connection_info.get("pod_logs") or "").strip()
+    if pod_logs:
+        console.print(Panel(Text(pod_logs[-4000:]), title="Captured pod logs (snapshot)",
+                            border_style="yellow"))
+    _print_recovery_hints(connection_info)
+def _show_lambda_logs(reservation_mgr, reservation_id: str, user_id: str) -> None:
+    """Fetch + render the raw lambda (CloudWatch) logs for a reservation."""
+    from rich.text import Text
+    rprint("\n[bold]Fetching lambda logs from CloudWatch…[/bold] [dim](a few seconds)[/dim]")
+    result = reservation_mgr.get_reservation_logs(reservation_id, user_id)
+    if result is None:
+        rprint("[yellow]Could not reach the log backend (it may not be deployed yet, "
+               "or you lack lambda:InvokeFunctionUrl access).[/yellow]")
+        return
+    if result.get("error"):
+        rprint(f"[yellow]Log query: {result['error']}[/yellow]")
+    lines = result.get("lines") or []
+    if not lines:
+        rprint("[dim]No lambda log lines found for this reservation (outside the "
+               "retention window, or none recorded).[/dim]")
+        return
+    body = "\n".join(f"{ln.get('timestamp','')}  {ln.get('message','')}".rstrip()
+                     for ln in lines)
+    console.print(Panel(Text(body[-16000:]),
+                        title=f"Lambda logs · {len(lines)} line(s)", border_style="cyan"))
+@main.command()
+@click.argument("reservation_id", required=False)
+@click.option("--logs", "show_logs", is_flag=True,
+              help="Also fetch the raw lambda logs for this reservation from CloudWatch.")
+@click.pass_context
+def debug(ctx: click.Context, reservation_id: Optional[str], show_logs: bool) -> None:
+    """Diagnose your own reservation — why a box died or won't connect.
+    Shows the status timeline, failure reason, OOM events, and captured pod logs,
+    plus recovery steps — all without needing cluster or lambda access. Add --logs
+    to also pull the raw reservation/expiry lambda logs from CloudWatch.
+    \b
+    Examples:
+        gpu-dev debug                 # pick from your active reservations
+        gpu-dev debug abc12345        # a specific reservation (id prefix ok)
+        gpu-dev debug abc12345 --logs # + raw lambda logs from CloudWatch
+    For a recently failed/expired box, find its id with 'gpu-dev list' then
+    'gpu-dev debug <id>'.
+    """
+    try:
+        config = load_config()
+        user_info = authenticate_user(config)
+        reservation_mgr = ReservationManager(config)
+        # In-pod fast path: the pod's own reservation id is on the env.
+        if reservation_id is None:
+            reservation_id = os.environ.get("GPU_DEV_RESERVATION_ID") or None
+        if reservation_id is None:
+            reservations = _fetch_reservations_cross_region(
+                reservation_mgr, user_info["user_id"],
+                ["active", "preparing", "queued", "pending"], config)
+            if not reservations:
+                rprint("[yellow]📋 No active reservations.[/yellow] To debug a recent "
+                       "failed/expired one, find its id with [bold]gpu-dev list[/bold] "
+                       "then run [bold]gpu-dev debug <id>[/bold].")
+                return
+            if len(reservations) == 1:
+                reservation_id = reservations[0].get("reservation_id")
+            else:
+                selected = select_reservation_interactive(reservations, "debug")
+                if not selected or selected in ("__QUIT__", "__ALL__"):
+                    rprint("[yellow]Cancelled.[/yellow]")
+                    return
+                reservation_id = selected
+        connection_info = reservation_mgr.get_connection_info(
+            reservation_id, user_info["user_id"])
+        if not connection_info:
+            rprint(f"[red]❌ No reservation found matching '{reservation_id}'[/red] "
+                   "(try a longer id prefix, or check 'gpu-dev list').")
+            return
+        _show_single_reservation(connection_info)
+        _show_diagnostics(connection_info)
+        if show_logs:
+            _show_lambda_logs(reservation_mgr, connection_info["reservation_id"],
+                              user_info["user_id"])
+    except RuntimeError as e:
+        rprint(f"[red]❌ {str(e)}[/red]")
+    except Exception as e:
+        rprint(f"[red]❌ Error: {str(e)}[/red]")
 def _maybe_show_sdk_tip() -> None:
     """For a user's first few reservations, nudge them toward the Python SDK +

{gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py RENAMED Viewed

@@ -613,7 +613,7 @@ class ReservationManager:
                 pass
         return self._direct_url or None
-    def _signed_post(self, url: str, payload: dict) -> Optional[dict]:
+    def _signed_post(self, url: str, payload: dict, timeout: int = 20) -> Optional[dict]:
         """SigV4-signed POST to the Function URL. Returns parsed JSON or None."""
         try:
             creds = self.config.session.get_credentials()
@@ -623,13 +623,29 @@ class ReservationManager:
             aws_req = AWSRequest(method="POST", url=url, data=data,
                                  headers={"Content-Type": "application/json"})
             SigV4Auth(creds, "lambda", self.config.aws_region).add_auth(aws_req)
-            resp = requests.post(url, data=data, headers=dict(aws_req.headers), timeout=20)
+            resp = requests.post(url, data=data, headers=dict(aws_req.headers), timeout=timeout)
             if resp.status_code != 200:
                 return None
             return resp.json()
         except Exception:
             return None
+    def get_reservation_logs(self, reservation_id: str, user_id: str) -> Optional[Dict[str, Any]]:
+        """Fetch a reservation's lambda logs (CloudWatch Logs Insights) via the
+        processor Function URL. Returns {"lines": [...]} / {"error": ...}, or None if
+        the backend/URL is unavailable. Used by `gpu-dev debug --logs`."""
+        url = self._get_direct_url()
+        if not url:
+            return None
+        payload = {
+            "action": "get_logs",
+            "reservation_id": reservation_id,
+            "user_id": user_id,
+            "version": get_version(),
+        }
+        # CloudWatch Logs Insights queries take longer than a claim — allow ~70s.
+        return self._signed_post(url, payload, timeout=70)
     def claim_direct(self, *, user_id: str, gpu_count: int, gpu_type: str,
                      duration_hours: Union[int, float], name: Optional[str] = None,
                      github_user: Optional[str] = None, ref: Optional[str] = None) -> Optional[Dict[str, Any]]:
@@ -999,11 +1015,19 @@ class ReservationManager:
                 "jupyter_enabled": reservation.get("jupyter_enabled", False),
                 "jupyter_error": reservation.get("jupyter_error", ""),
                 "ebs_volume_id": reservation.get("ebs_volume_id", ""),
+                "disk_name": reservation.get("disk_name", ""),
                 "secondary_users": reservation.get("secondary_users", []),
                 "warning": reservation.get("warning", ""),
                 "is_multinode": is_multinode,
                 "pod_ip": reservation.get("pod_ip", ""),
+                "node_ip": reservation.get("node_ip", ""),
+                "node_name": reservation.get("node_name", ""),
                 "fqdn": reservation.get("fqdn", ""),
+                # Health/diagnostics (surfaced by `gpu-dev debug`); written by the
+                # reservation + expiry lambdas. Present off the raw item, not always set.
+                "oom_count": int(reservation.get("oom_count", 0) or 0),
+                "last_oom_at": reservation.get("last_oom_at", ""),
+                "oom_container": reservation.get("oom_container", ""),
             }
             # If multi-node, fetch all nodes in the group

{gpu_dev-0.7.12 → gpu_dev-0.7.14}/gpu_dev.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.7.12
+Version: 0.7.14
 Summary: CLI + Python SDK for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.7.12 → gpu_dev-0.7.14}/gpu_dev.egg-info/SOURCES.txt RENAMED Viewed

@@ -198,6 +198,7 @@ tests/unit/cli/test_cancel.py
 tests/unit/cli/test_config_cmd.py
 tests/unit/cli/test_config_module.py
 tests/unit/cli/test_connect.py
+tests/unit/cli/test_debug.py
 tests/unit/cli/test_disks.py
 tests/unit/cli/test_edit.py
 tests/unit/cli/test_interactive.py
@@ -213,7 +214,9 @@ tests/unit/lambda_fn/__init__.py
 tests/unit/lambda_fn/test_availability.py
 tests/unit/lambda_fn/test_cancellation.py
 tests/unit/lambda_fn/test_claim.py
+tests/unit/lambda_fn/test_dead_pod_cleanup.py
 tests/unit/lambda_fn/test_finalize_no_ssh.py
+tests/unit/lambda_fn/test_get_logs.py
 tests/unit/lambda_fn/test_mig_gpu_config.py
 tests/unit/lambda_fn/test_pod_resources.py
 tests/unit/lambda_fn/test_ref_staging.py

{gpu_dev-0.7.12 → gpu_dev-0.7.14}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gpu-dev"
-version = "0.7.12"
+version = "0.7.14"
 description = "CLI + Python SDK for PyTorch GPU developer server reservations"
 authors = [{name = "PyTorch Team"}]
 readme = "cli-tools/gpu-dev-cli/README.md"

{gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/Dockerfile RENAMED Viewed

@@ -46,8 +46,12 @@ RUN for attempt in 1 2 3; do \
 RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     apt-get install -y nodejs
-# Install older CUDA toolkits alongside base CUDA 13.2
+# Install additional CUDA toolkits alongside base CUDA 13.2
 # Base image already has NVIDIA repo configured, no need for cuda-keyring
+# NOTE: cuda-toolkit-13-3 is intentionally NOT here. CUDA 13.3 ships a unified
+# `cccl-13-3` package that `Breaks` `cuda-cccl-12-8`/`-12-9`, so 13.3 cannot coexist
+# with the 12.8/12.9 toolkits in one image. To add 13.3 we'd have to drop 12.8/12.9
+# (or hand-curate 13.3 sub-packages that exclude cccl). Kept 12.8-13.2 for now.
 RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-toolkit-12-8 \
         cuda-toolkit-12-9 \
@@ -163,21 +167,32 @@ WORKDIR /home/dev
 RUN mkdir -p ~/.npm-global && \
     npm config set prefix ~/.npm-global
-# OpenAI Codex CLI on GPT-5.5 via AWS Bedrock (GA 2026-06-01). Installed system-wide
-# (parallels Claude above), then /usr/local/bin/codex is replaced with a thin wrapper that
-# auths via the pod IRSA — it mints a short-lived Bedrock bearer token (no per-user OpenAI
-# key) and pins the bedrock-mantle provider + GPT-5.5 metadata. Reasoning effort is set with
-# the CODEX_EFFORT env var (default high); the wrapper rewrites ~/.codex/config.toml each
-# launch (home is ephemeral) so a /model picker mishap self-heals on restart. IAM is already
-# in place (pod IRSA: bedrock-mantle:* + aws-marketplace:Subscribe).
+# OpenAI Codex CLI on OpenAI gpt-5.x via AWS Bedrock. Installed system-wide (parallels
+# Claude above), then /usr/local/bin/codex is replaced with a thin wrapper that auths via
+# the pod IRSA — it mints a short-lived Bedrock bearer token (AWS_BEARER_TOKEN_BEDROCK), no
+# per-user key. The wrapper uses codex's NATIVE `amazon-bedrock` model provider (the Bedrock
+# Mantle path serves the OpenAI Responses API for supported OpenAI models — per the official
+# Codex/Bedrock docs), so NO custom endpoint/wire_api config is needed. Model via CODEX_MODEL
+# (default openai.gpt-5.4), effort via CODEX_EFFORT (default high). The wrapper forces
+# AWS_REGION=us-east-1.
+#
+# Why gpt-5.4 default (2026-06-16): gpt-5.5 is mid-rollout on Bedrock us-east-1 — it works
+# intermittently but ~30% of calls still 404 "Engine not found" (us-east-2 fails outright).
+# gpt-5.4 is rock-solid in us-east-1. To switch to 5.5 once AWS's rollout stabilizes, change
+# the default above to openai.gpt-5.5 (one line) — region is already us-east-1. Users can opt
+# in early with CODEX_MODEL=openai.gpt-5.5. The wrapper rewrites ~/.codex/config.toml each
+# launch. IAM already in place (pod IRSA: bedrock-mantle:* — native Mantle path does NOT need
+# bedrock:CallWithBearerToken).
 USER root
+# Always install the latest codex (the native amazon-bedrock provider is stable across
+# releases, so no need to pin — each image rebuild tracks latest). Validated on 0.140.0.
 RUN npm install -g --prefix /usr/local @openai/codex || echo "Codex CLI install failed (non-fatal at build time)"
 # Bedrock wrapper, base64-embedded to avoid heredoc/quoting fragility. It execs the real
 # launcher at /usr/local/lib/node_modules/@openai/codex/bin/codex.js. CRITICAL: `npm install`
 # leaves /usr/local/bin/codex as a SYMLINK to that codex.js, so we must `rm -f` it first —
 # writing through the symlink would clobber codex.js itself, making the wrapper exec itself
 # (infinite recursion -> codex hangs on launch).
-RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IHdpcmVkIHRvIEdQVC01LjUgb24gQVdTIEJlZHJvY2sgKHVzLWVhc3QtMiBtYW50bGUgZW5kcG9pbnQpLgojIEF1dGggdmlhIHRoZSBwb2QgSVJTQSAtPiBzaG9ydC1saXZlZCAofjEyaCkgQmVkcm9jayBiZWFyZXIgdG9rZW47IG5vIHBlci11c2VyIGtleS4KIyBSZWFzb25pbmcgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgZW52IChkZWZhdWx0IGhpZ2gpLiBUaGUgY29uZmlnIGlzIChyZSl3cml0dGVuIG9uCiMgZXZlcnkgbGF1bmNoOiAvaG9tZS9kZXYgaXMgZXBoZW1lcmFsLCBhbmQgdGhpcyBhbHNvIHNlbGYtaGVhbHMgYSAvbW9kZWwgbWlzaGFwCiMgKHRoZSBwaWNrZXIgY2FuIGNvcnJ1cHQgdGhlIG1vZGVsIGlkOyByZXN0YXJ0aW5nIGNvZGV4IHJlc2V0cyBpdCkuCnNldCArZQpSRUFMPS91c3IvbG9jYWwvbGliL25vZGVfbW9kdWxlcy9Ab3BlbmFpL2NvZGV4L2Jpbi9jb2RleC5qcwpFRkZPUlQ9IiR7Q09ERVhfRUZGT1JUOi1oaWdofSIKbWtkaXIgLXAgIiRIT01FLy5jb2RleCIgMj4vZGV2L251bGwKY2F0ID4gIiRIT01FLy5jb2RleC9jb25maWcudG9tbCIgPDxDRkcKbW9kZWwgPSAib3BlbmFpLmdwdC01LjUiCm1vZGVsX3Byb3ZpZGVyID0gImJlZHJvY2siCndlYl9zZWFyY2ggPSAiZGlzYWJsZWQiCm1vZGVsX2NvbnRleHRfd2luZG93ID0gMjcyMDAwCm1vZGVsX21heF9vdXRwdXRfdG9rZW5zID0gMTI4MDAwCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKClttb2RlbF9wcm92aWRlcnMuYmVkcm9ja10KbmFtZSA9ICJBV1MgQmVkcm9jayAoR1BULTUuNSkiCmJhc2VfdXJsID0gImh0dHBzOi8vYmVkcm9jay1tYW50bGUudXMtZWFzdC0yLmFwaS5hd3Mvb3BlbmFpL3YxIgplbnZfa2V5ID0gIk9QRU5BSV9BUElfS0VZIgp3aXJlX2FwaSA9ICJyZXNwb25zZXMiCkNGRwpUT0s9IiQoL3Vzci9iaW4vcHl0aG9uMyAtYyAnZnJvbSBhd3NfYmVkcm9ja190b2tlbl9nZW5lcmF0b3IgaW1wb3J0IHByb3ZpZGVfdG9rZW47IHByaW50KHByb3ZpZGVfdG9rZW4ocmVnaW9uPSJ1cy1lYXN0LTIiKSknIDI+L2Rldi9udWxsKSIKWyAtbiAiJFRPSyIgXSAmJiBleHBvcnQgT1BFTkFJX0FQSV9LRVk9IiRUT0siCmV4ZWMgIiRSRUFMIiAiJEAiCg==' | base64 -d > /usr/local/bin/codex && chmod 0755 /usr/local/bin/codex
+RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IG9uIE9wZW5BSSBncHQtNS54IHZpYSBBV1MgQmVkcm9jayB1c2luZyBjb2RleCdzIE5BVElWRSBgYW1hem9uLWJlZHJvY2tgCiMgcHJvdmlkZXIuIFJlZ2lvbiB1cy1lYXN0LTEgKGdwdC01LnggTWFudGxlIHJlZ2lvbikuIEF1dGg6IGEgc2hvcnQtbGl2ZWQgQmVkcm9jawojIGJlYXJlciB0b2tlbiBtaW50ZWQgZnJvbSB0aGUgcG9kIElSU0EgKG5vIHBlci11c2VyIGtleSkuIE1vZGVsIHZpYSBDT0RFWF9NT0RFTAojIChkZWZhdWx0IG9wZW5haS5ncHQtNS40KSwgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgKGhpZ2gpLgojCiMgbW9kZWxfY29udGV4dF93aW5kb3cgaXMgc2V0IGV4cGxpY2l0bHkgYmVjYXVzZSBjb2RleCdzIGNhdGFsb2cgZG9lc24ndCBrbm93IHRoZQojIEJlZHJvY2stcHJlZml4ZWQgaWQgIm9wZW5haS5ncHQtNS54IiBhbmQgb3RoZXJ3aXNlIHdhcm5zICJNb2RlbCBtZXRhZGF0YSBub3QgZm91bmQsCiMgZGVmYXVsdGluZyB0byBmYWxsYmFjayBtZXRhZGF0YSIuIDI3MjAwMCBpcyBncHQtNS41J3MgYnVuZGxlZCBjb250ZXh0IHdpbmRvdy4KIwojIGdwdC01LjUgbm90ZSAoMjAyNi0wNi0xNik6IHByb3Zpc2lvbmVkIGluIHVzLWVhc3QtMSBidXQgbWlkLXJvbGxvdXQg4oCUIH4zMCUgb2YgY2FsbHMKIyBzdGlsbCA0MDQgIkVuZ2luZSBub3QgZm91bmQiLiBEZWZhdWx0IHN0YXlzIGdwdC01LjQgKHNvbGlkKTsgc3dpdGNoIHRoZSBkZWZhdWx0IHRvCiMgb3BlbmFpLmdwdC01LjUgb25jZSBBV1Mgc3RhYmlsaXplcywgb3Igb3B0IGluIG5vdyB3aXRoIENPREVYX01PREVMPW9wZW5haS5ncHQtNS41LgpzZXQgK2UKTU9ERUw9IiR7Q09ERVhfTU9ERUw6LW9wZW5haS5ncHQtNS40fSIKRUZGT1JUPSIke0NPREVYX0VGRk9SVDotaGlnaH0iCmV4cG9ydCBBV1NfUkVHSU9OPXVzLWVhc3QtMSBBV1NfREVGQVVMVF9SRUdJT049dXMtZWFzdC0xCm1rZGlyIC1wICIkSE9NRS8uY29kZXgiCmNhdCA+ICIkSE9NRS8uY29kZXgvY29uZmlnLnRvbWwiIDw8Q0ZHCm1vZGVsX3Byb3ZpZGVyID0gImFtYXpvbi1iZWRyb2NrIgptb2RlbCA9ICIkTU9ERUwiCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKbW9kZWxfY29udGV4dF93aW5kb3cgPSAyNzIwMDAKd2ViX3NlYXJjaCA9ICJkaXNhYmxlZCIKQ0ZHClRPSz0iJCgvdXNyL2Jpbi9weXRob24zIC1jICdmcm9tIGF3c19iZWRyb2NrX3Rva2VuX2dlbmVyYXRvciBpbXBvcnQgcHJvdmlkZV90b2tlbjsgcHJpbnQocHJvdmlkZV90b2tlbihyZWdpb249InVzLWVhc3QtMSIpKScgMj4vZGV2L251bGwpIgpbIC1uICIkVE9LIiBdICYmIGV4cG9ydCBBV1NfQkVBUkVSX1RPS0VOX0JFRFJPQ0s9IiRUT0siCmV4ZWMgL3Vzci9sb2NhbC9saWIvbm9kZV9tb2R1bGVzL0BvcGVuYWkvY29kZXgvYmluL2NvZGV4LmpzICIkQCIK' | base64 -d > /usr/local/bin/codex && chmod 
0755 /usr/local/bin/codex
 USER dev

gpu-dev 0.7.12__tar.gz → 0.7.14__tar.gz

gpu-dev 0.7.12__tar.gz → 0.7.14__tar.gz