PyPI - gpu-dev - Versions diffs - 0.5.17__tar.gz → 0.5.19__tar.gz - Mend

gpu-dev 0.5.17.tar.gz → 0.5.19.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (121) hide show

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.5.17
+Version: 0.5.19
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.5.17
+Version: 0.5.19
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt RENAMED Viewed

@@ -112,4 +112,8 @@ terraform-gpu-devservers/ssh-proxy/requirements.txt
 terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
 terraform-gpu-devservers/templates/al2023-user-data.sh
 terraform-gpu-devservers/templates/user-data-self-managed.sh
-terraform-gpu-devservers/templates/user-data.sh
+terraform-gpu-devservers/templates/user-data.sh
+tests/submit/README.md
+tests/submit/fail/run.sh
+tests/submit/multinode/run.sh
+tests/submit/success/run.sh

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py RENAMED Viewed

@@ -1349,6 +1349,254 @@ def reserve(
         rprint(f"[red]❌ Error: {str(e)}[/red]")
+_SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
+                     "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
+                     "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
+@main.command(context_settings={"ignore_unknown_options": True})
+@click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
+@click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
+@click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling — job auto-cancels well before this if it finishes.")
+@click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
+@click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
+@click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
+              help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
+@click.option("--dockerimage", type=str, default=None,
+              help="Pre-built container image reference (e.g. ghcr.io/me/img:tag) to run instead of the default.")
+@click.option("--preserve-entrypoint", is_flag=True,
+              help="Keep the custom image's ENTRYPOINT/CMD instead of letting gpu-dev wrap with the SSH harness. Note: submit needs SSH to work.")
+@click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
+              help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
+@click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
+@click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
+@click.option("--name", type=str, default=None, help="Reservation name.")
+@click.option("--timeout", type=int, default=24 * 60, show_default=True,
+              help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
+@click.argument("command", nargs=-1, required=True)
+@click.pass_context
+def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
+           runtime, no_pull, keep_alive, name, timeout, command):
+    """Submit a job: reserve, sync code, run, sync results back, auto-cancel.
+    \b
+    Examples:
+      gpu-dev submit --runtime ./ -- python train.py
+      gpu-dev submit --gpus 16 --gpu-type h100 --runtime . -- bash run.sh
+      gpu-dev submit --keep-alive -- nvidia-smi
+    The job runs on rank 0 (master pod). For multinode jobs, MULTINODE_HOSTS / RANK /
+    SIZE / MASTER_ADDR / MASTER_PORT are exported on every pod so torchrun and friends
+    work without manual wiring. Exit code mirrors the remote command's exit code.
+    """
+    import subprocess
+    import shlex
+    import sys
+    from pathlib import Path
+    if not command:
+        rprint("[red]❌ Provide a command after --, e.g. gpu-dev submit --runtime ./ -- python train.py[/red]")
+        sys.exit(2)
+    gt = gpu_type.lower()
+    # Per-type max GPUs (mirrors gpu_configs in reserve flow)
+    max_per_node = {
+        "t4": 4, "l4": 4, "a10g": 4, "rtxpro6000": 4, "t4-small": 1,
+        "a100": 8, "h100": 8, "h200": 8, "b200": 8,
+        "h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8,
+        "b200-mig-1g": 4, "b200-mig-2g": 2, "b200-mig-3g": 2,
+        "cpu-arm": 0, "cpu-x86": 0,
+    }.get(gt)
+    if max_per_node is None:
+        rprint(f"[red]❌ Unknown gpu-type '{gpu_type}'[/red]")
+        sys.exit(2)
+    is_multinode = gt not in ("cpu-arm", "cpu-x86") and gpus > max_per_node
+    if is_multinode and gpus % max_per_node != 0:
+        rprint(f"[red]❌ For multinode {gt}, --gpus must be a multiple of {max_per_node}[/red]")
+        sys.exit(2)
+    config = load_config()
+    try:
+        user_info = authenticate_user(config)
+    except RuntimeError as e:
+        rprint(f"[red]❌ {str(e)}[/red]")
+        sys.exit(2)
+    rm = ReservationManager(config)
+    # Determine effective disk handling. Multinode: only master gets persistent disk; we always
+    # SSH into rank 0, so passing --disk is fine.
+    disk_name = None if no_persistent_disk else disk
+    # Build dockerfile context if provided (mirrors the reserve-flow logic)
+    dockerfile_payload = None
+    if dockerfile:
+        import os, tarfile, tempfile, base64
+        if os.path.getsize(dockerfile) > 512 * 1024:
+            rprint("[red]❌ Dockerfile too large (max 512KB)[/red]")
+            sys.exit(2)
+        ctx_dir = os.path.dirname(os.path.abspath(dockerfile))
+        rprint(f"[cyan]📦 Building tar.gz context from {ctx_dir}[/cyan]")
+        with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
+            with tarfile.open(tmp.name, "w:gz") as tar:
+                for root, _, files in os.walk(ctx_dir):
+                    for f in files:
+                        full = os.path.join(root, f)
+                        tar.add(full, arcname=os.path.relpath(full, ctx_dir))
+                if os.path.basename(dockerfile).lower() != "dockerfile":
+                    tar.add(dockerfile, arcname="Dockerfile")
+            tar_size = os.path.getsize(tmp.name)
+            if tar_size > 700 * 1024:
+                os.unlink(tmp.name)
+                rprint(f"[red]❌ Build context too large: {tar_size}B (max ~700KB compressed)[/red]")
+                sys.exit(2)
+            with open(tmp.name, "rb") as fh:
+                dockerfile_payload = base64.b64encode(fh.read()).decode("utf-8")
+            os.unlink(tmp.name)
+        rprint(f"[green]✅ Dockerfile context: {tar_size}B compressed[/green]")
+    if dockerimage and not preserve_entrypoint:
+        rprint("[dim]Note: passing --dockerimage without --preserve-entrypoint, so gpu-dev wraps the image with the SSH harness.[/dim]")
+    if preserve_entrypoint and not (dockerfile or dockerimage):
+        rprint("[red]❌ --preserve-entrypoint requires --dockerfile or --dockerimage[/red]")
+        sys.exit(2)
+    rprint(f"[cyan]🎫 Reserving {gpus}x {gpu_type.upper()} for up to {hours}h...[/cyan]")
+    if is_multinode:
+        reservation_ids = rm.create_multinode_reservation(
+            user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
+            duration_hours=hours, name=name, github_user=user_info["github_user"],
+            no_persistent_disk=no_persistent_disk, disk_name=disk_name,
+            dockerfile=dockerfile_payload, dockerimage=dockerimage,
+            preserve_entrypoint=preserve_entrypoint)
+        if not reservation_ids:
+            rprint("[red]❌ Failed to create multinode reservation[/red]")
+            sys.exit(2)
+        primary_id = reservation_ids[0]
+    else:
+        primary_id = rm.create_reservation(
+            user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
+            duration_hours=hours, name=name, github_user=user_info["github_user"],
+            no_persistent_disk=no_persistent_disk, disk_name=disk_name,
+            dockerfile=dockerfile_payload, dockerimage=dockerimage,
+            preserve_entrypoint=preserve_entrypoint)
+        if not primary_id:
+            rprint("[red]❌ Failed to create reservation[/red]")
+            sys.exit(2)
+        reservation_ids = [primary_id]
+    short_id = primary_id[:8]
+    cancelled = {"done": False}
+    def maybe_cancel(reason: str):
+        if cancelled["done"] or keep_alive:
+            return
+        cancelled["done"] = True
+        rprint(f"[yellow]🛑 Cancelling reservation {short_id} ({reason})[/yellow]")
+        for rid in reservation_ids:
+            try:
+                rm.cancel_reservation(rid, user_info["user_id"])
+            except Exception as ce:
+                rprint(f"[dim]   cancel {rid[:8]} failed: {ce}[/dim]")
+    try:
+        if timeout >= 60:
+            wait_str = f"up to {timeout//60}h{(" " + str(timeout%60) + "m") if timeout%60 else ""}"
+        else:
+            wait_str = f"up to {timeout}m"
+        rprint(f"[cyan]⏳ Waiting for reservation {short_id} to become active ({wait_str}; can queue when cluster is full)...[/cyan]")
+        if is_multinode:
+            results = rm.wait_for_multinode_reservation_completion(reservation_ids, timeout_minutes=timeout)
+        else:
+            single = rm.wait_for_reservation_completion(primary_id, timeout_minutes=timeout)
+            results = [single] if single else None
+        if not results:
+            rprint("[red]❌ Reservation never became active[/red]")
+            maybe_cancel("activation timeout")
+            sys.exit(1)
+        # Resolve master pod (rank 0)
+        conn = rm.get_connection_info(primary_id, user_info["user_id"])
+        if not conn:
+            rprint("[red]❌ Could not fetch connection info[/red]")
+            maybe_cancel("no connection info")
+            sys.exit(1)
+        if conn.get("is_multinode"):
+            nodes = sorted(conn["nodes"], key=lambda n: n.get("node_index", 0))
+            master = nodes[0]
+            master_id, master_pod, master_fqdn, master_name = (
+                master["reservation_id"], master["pod_name"],
+                master.get("fqdn"), master.get("name"))
+        else:
+            master_id, master_pod, master_fqdn, master_name = (
+                primary_id, conn["pod_name"], conn.get("fqdn"), conn.get("name"))
+        # Ensure SSH config exists
+        gpu_dev_dir = Path.home() / ".gpu-dev"
+        config_file = gpu_dev_dir / f"{master_id[:8]}-sshconfig"
+        if not config_file.exists():
+            if not (master_fqdn and master_pod):
+                rprint("[red]❌ Master pod has no FQDN yet — can't SSH[/red]")
+                maybe_cancel("no fqdn")
+                sys.exit(1)
+            create_ssh_config_for_reservation(master_fqdn, master_pod, master_id, master_name)
+        ssh_alias = master_pod
+        ssh_base = ["ssh", "-F", str(config_file), "-o", "StrictHostKeyChecking=accept-new"]
+        rsync_e = " ".join(shlex.quote(x) for x in ssh_base)
+        # Working directory and rsync up
+        if runtime:
+            workdir = f"/workspace/submit-{master_id[:8]}"
+            rprint(f"[cyan]📦 Syncing {runtime} → {ssh_alias}:{workdir}[/cyan]")
+            r = subprocess.run(ssh_base + [ssh_alias, f"mkdir -p {shlex.quote(workdir)}"])
+            if r.returncode != 0:
+                rprint("[red]❌ Failed to create remote workspace[/red]")
+                maybe_cancel("mkdir failed"); sys.exit(2)
+            r = subprocess.run([
+                "rsync", "-az", "--delete", "-e", rsync_e,
+                f"{runtime.rstrip('/')}/", f"{ssh_alias}:{workdir}/",
+            ])
+            if r.returncode != 0:
+                rprint("[red]❌ Upload rsync failed[/red]")
+                maybe_cancel("upload failed"); sys.exit(2)
+        else:
+            workdir = "/home/dev"
+        # Run remote command via login shell so MULTINODE_* etc. are loaded
+        remote_cmd = " ".join(shlex.quote(c) for c in command)
+        rprint(f"[cyan]🚀 Running on {ssh_alias}: {remote_cmd}[/cyan]\n")
+        ssh_run = ssh_base + [ssh_alias,
+                              f"cd {shlex.quote(workdir)} && bash -lc {shlex.quote(remote_cmd)}"]
+        rc = subprocess.call(ssh_run)
+        rprint(f"\n[dim]Job exited with code {rc}[/dim]")
+        # Sync back results before cancelling
+        if runtime and not no_pull:
+            rprint(f"[cyan]📥 Syncing {ssh_alias}:{workdir}/ → {runtime}[/cyan]")
+            pull = subprocess.run([
+                "rsync", "-az", "-e", rsync_e,
+                f"{ssh_alias}:{workdir}/", f"{runtime.rstrip('/')}/",
+            ])
+            if pull.returncode != 0:
+                rprint(f"[yellow]⚠️  Result rsync exited with {pull.returncode} — your output may be incomplete[/yellow]")
+        maybe_cancel("job complete")
+        sys.exit(rc)
+    except KeyboardInterrupt:
+        rprint("\n[yellow]Interrupted — cancelling[/yellow]")
+        maybe_cancel("user interrupt")
+        sys.exit(130)
+    except SystemExit:
+        raise
+    except Exception as e:
+        rprint(f"[red]❌ Submit error: {e}[/red]")
+        maybe_cancel("submit error")
+        sys.exit(2)
 @main.command()
 @click.option(
     "--user",

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gpu-dev"
-version = "0.5.17"
+version = "0.5.19"
 description = "CLI tool for PyTorch GPU developer server reservations"
 authors = [{name = "PyTorch Team"}]
 readme = "cli-tools/gpu-dev-cli/README.md"

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/index.py RENAMED Viewed

@@ -1423,6 +1423,11 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
             logger.info(
                 f"Starting parallel processing for {total_nodes} nodes")
+            # Deterministic peer pod names by node_index so MULTINODE_RANK aligns with the
+            # position of this pod in MULTINODE_HOSTS across all replicas.
+            nodes_sorted = sorted(nodes, key=lambda n: int(n.get("node_index", 0)))
+            peer_pod_names = [f"gpu-dev-{n['reservation_id'][:8]}" for n in nodes_sorted]
             def process_single_node(node_data):
                 """Process a single node - to be run in parallel"""
                 i, node = node_data
@@ -1435,7 +1440,8 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
                         'action': 'process_multinode_individual',
                         'node_index': int(node_index),
                         'total_nodes': int(total_nodes),
-                        'master_reservation_id': str(master_reservation_id)
+                        'master_reservation_id': str(master_reservation_id),
+                        'multinode_peer_pods': peer_pod_names,
                     }
                     logger.info(
@@ -1541,6 +1547,12 @@ def process_multinode_individual_node(message_body: dict) -> bool:
         node_data = response["Item"]
+        # Forward peer pod list from coordinator into request dict so create_pod can
+        # bake MULTINODE_HOSTS / MASTER_ADDR / MULTINODE_RANK env vars into the pod.
+        peer_pods = message_body.get("multinode_peer_pods")
+        if peer_pods:
+            node_data["multinode_peer_pods"] = peer_pods
         # Update status to preparing pod
         update_multinode_pod_status(
             reservation_id, "preparing pod", node_index, total_nodes)
@@ -2888,6 +2900,8 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
             dockerimage=dockerimage,
             target_az=target_az,
             target_node=target_node,
+            multinode_peer_pods=request.get("multinode_peer_pods"),
+            multinode_rank=int(request.get("node_index", 0)) if is_multinode else 0,
             preserve_entrypoint=preserve_entrypoint,
             node_labels=node_labels,
             trace_data=trace_data,
@@ -3429,6 +3443,8 @@ def create_kubernetes_resources(
     efs_filesystem_id: str = None,
     is_multinode: bool = False,
     target_node: str = None,
+    multinode_peer_pods: list = None,
+    multinode_rank: int = 0,
     dockerfile_base64_data: str = None,
     dockerimage: str = None,
     target_az: str = None,
@@ -3533,6 +3549,8 @@ def create_kubernetes_resources(
                         dockerimage=dockerimage,
                         target_az=target_az,
                         target_node=target_node,
+                        multinode_peer_pods=multinode_peer_pods,
+                        multinode_rank=multinode_rank,
                         preserve_entrypoint=preserve_entrypoint,
                         node_labels=node_labels,
                         trace_data=trace_data,
@@ -3620,6 +3638,8 @@ def create_kubernetes_resources(
                         dockerimage=dockerimage,
                         target_az=target_az,
                         target_node=target_node,
+                        multinode_peer_pods=multinode_peer_pods,
+                        multinode_rank=multinode_rank,
                         preserve_entrypoint=preserve_entrypoint,
                         node_labels=node_labels,
                         trace_data=trace_data,
@@ -3722,6 +3742,30 @@ def find_available_node_port(k8s_client) -> int:
         return random.randint(30000, 32767)
+def _mig_slice_fraction(gpu_type: str) -> float:
+    """For MIG SKUs return slice fraction of a single GPU (1g=1/7, 2g=2/7, ..., 7g=1).
+    Slice naming counts GPCs (compute slices). H100 and B200 both have 7 GPCs per GPU
+    in the typical all-balanced profile, so a 1g slice is 1/7 of a GPU regardless of
+    family. Used to size CPU/memory requests proportional to the GPU fraction the pod
+    actually consumes — the older `gpu_count/max_gpus` ratio over-claimed node resources
+    (a 1g slice would claim 1/4 or 1/16 of the host instead of 1/56).
+    """
+    if "mig" not in gpu_type:
+        return 1.0
+    try:
+        slices = int(gpu_type.split("-mig-")[1].rstrip("g"))
+    except (IndexError, ValueError):
+        return 1.0
+    return slices / 7.0
+# Number of full GPUs on the underlying instance — used to convert the slice fraction
+# into a fraction of the host's CPU/memory. Both p5.48xlarge (H100) and p6-b200.48xlarge
+# (B200) have 8 GPUs, which matches every MIG-capable instance type we currently run.
+_FULL_GPUS_PER_MIG_NODE = 8
 def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool = False) -> dict:
     """Get resource limits for pod based on GPU type and deployment mode"""
     gpu_count = int(gpu_count)
@@ -3741,13 +3785,19 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
             resource_name = config.get("k8s_resource", "nvidia.com/gpu")
             limits[resource_name] = str(gpu_count)
-            gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
-            # Calculate proportional limits with CPU overprovisioning for burst capacity
-            # Give 1.5x CPU limit to allow burst, capped at node total
-            fractional_cpu = config["cpus"] * gpu_ratio
-            proportional_cpu_limit = min(config["cpus"], int(fractional_cpu * 1.5))
-            proportional_memory_limit = int(config["memory_gb"] * gpu_ratio)
+            if "mig" in gpu_type:
+                # Scale by GPC fraction (slice of one GPU), not slice count over max slices.
+                slice_fraction = _mig_slice_fraction(gpu_type)
+                cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
+                mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
+                fractional_cpu = cpu_per_full_gpu * slice_fraction * gpu_count
+                proportional_cpu_limit = max(1, min(config["cpus"], int(fractional_cpu * 1.5)))
+                proportional_memory_limit = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count))
+            else:
+                gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
+                fractional_cpu = config["cpus"] * gpu_ratio
+                proportional_cpu_limit = min(config["cpus"], int(fractional_cpu * 1.5))
+                proportional_memory_limit = int(config["memory_gb"] * gpu_ratio)
             limits.update({
                 "cpu": str(proportional_cpu_limit),
@@ -3787,13 +3837,16 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
         if gpu_count > 0:
             resource_name = config.get("k8s_resource", "nvidia.com/gpu")
             requests[resource_name] = str(gpu_count)
-            gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
-            # Calculate proportional requests (reserve 10% for system overhead)
-            # This ensures requests don't exceed node allocatable resources
-            # Limits can be higher for burst capacity (Burstable QoS)
-            proportional_cpu_request = int(config["cpus"] * gpu_ratio * 0.9)
-            proportional_memory_request = int(config["memory_gb"] * gpu_ratio * 0.9)
+            if "mig" in gpu_type:
+                slice_fraction = _mig_slice_fraction(gpu_type)
+                cpu_per_full_gpu = config["cpus"] / _FULL_GPUS_PER_MIG_NODE
+                mem_per_full_gpu = config["memory_gb"] / _FULL_GPUS_PER_MIG_NODE
+                proportional_cpu_request = max(1, int(cpu_per_full_gpu * slice_fraction * gpu_count * 0.9))
+                proportional_memory_request = max(1, int(mem_per_full_gpu * slice_fraction * gpu_count * 0.9))
+            else:
+                gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
+                proportional_cpu_request = int(config["cpus"] * gpu_ratio * 0.9)
+                proportional_memory_request = int(config["memory_gb"] * gpu_ratio * 0.9)
             requests.update({
                 "cpu": str(proportional_cpu_request),
@@ -3896,6 +3949,30 @@ def get_nccl_env_vars(gpu_type: str) -> list:
     return env_vars
+def _get_multinode_env_vars(peer_pods: list, rank: int) -> list:
+    """Build env vars exposing peer hostnames/rank/master to the pod.
+    Hostnames use the per-pod headless service we already create elsewhere, so they
+    resolve to the current pod IP via cluster DNS even if a pod is recreated. We
+    don\'t inject IPs at pod-creation time (they aren\'t known until kube schedules
+    everyone) — the bashrc/zshrc helper resolves and exports MULTINODE_IPS at shell
+    start, and a /usr/local/bin/multinode-ips helper is available for non-interactive
+    callers.
+    """
+    if not peer_pods or len(peer_pods) <= 1:
+        return []
+    namespace = "gpu-dev"
+    hosts = [f"{p}-headless.{namespace}.svc.cluster.local" for p in peer_pods]
+    return [
+        client.V1EnvVar(name="MULTINODE_HOSTS", value=",".join(hosts)),
+        client.V1EnvVar(name="MULTINODE_PEER_PODS", value=",".join(peer_pods)),
+        client.V1EnvVar(name="MULTINODE_RANK", value=str(rank)),
+        client.V1EnvVar(name="MULTINODE_SIZE", value=str(len(peer_pods))),
+        client.V1EnvVar(name="MASTER_ADDR", value=hosts[0]),
+        client.V1EnvVar(name="MASTER_PORT", value="29500"),
+    ]
 def create_pod(
     k8s_client,
     pod_name: str,
@@ -3913,6 +3990,8 @@ def create_pod(
     dockerimage: str = None,
     target_az: str = None,
     target_node: str = None,
+    multinode_peer_pods: list = None,
+    multinode_rank: int = 0,
     preserve_entrypoint: bool = False,
     node_labels: dict = None,
     trace_data: dict = None,
@@ -4397,6 +4476,16 @@ EOF_PROFILE
 # User identification
 export GPU_DEV_USER_ID="{user_id or 'dev'}"
+# Multinode peer info — inlined from container env at pod startup. sshd strips
+# container env vars from login shells, so we materialize the values into rc files.
+# Skipped (empty exports) for single-node reservations where MULTINODE_* aren't set.
+export MULTINODE_HOSTS="$MULTINODE_HOSTS"
+export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
+export MULTINODE_RANK="$MULTINODE_RANK"
+export MULTINODE_SIZE="$MULTINODE_SIZE"
+export MASTER_ADDR="$MASTER_ADDR"
+export MASTER_PORT="$MASTER_PORT"
 # Function to check for GPU reservation expiry warnings and startup script status
 check_warnings() {{
     # Check for startup script still running
@@ -4415,6 +4504,22 @@ check_warnings() {{
 # Run warning check before every command prompt
 PROMPT_COMMAND="check_warnings; \$PROMPT_COMMAND"
+# Multinode peer IP resolution: MULTINODE_HOSTS is baked at pod creation, but per-pod
+# IPs are only known once kube schedules them. Resolve at shell start so users can do
+# torchrun --master_addr=\$MASTER_ADDR or mpirun -H "\$MULTINODE_IPS" without extra steps.
+if [ -n "\$MULTINODE_HOSTS" ]; then
+    _MULTINODE_IPS=""
+    for _h in \$(echo "\$MULTINODE_HOSTS" | tr ',' ' '); do
+        _ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
+        if [ -n "\$_ip" ]; then
+            _MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
+        fi
+    done
+    export MULTINODE_IPS="\$_MULTINODE_IPS"
+    [ -n "\$MULTINODE_IPS" ] && export MASTER_IP=\$(echo "\$MULTINODE_IPS" | cut -d, -f1)
+    unset _MULTINODE_IPS _h _ip
+fi
 EOF_BASHRC_EXT
                         cat > /home/dev/.zshrc_ext << EOF_ZSHRC_EXT
@@ -4425,6 +4530,15 @@ EOF_BASHRC_EXT
 # User identification
 export GPU_DEV_USER_ID="{user_id or 'dev'}"
+# Multinode peer info — inlined from container env at pod startup. sshd strips
+# container env vars from login shells, so we materialize the values into rc files.
+export MULTINODE_HOSTS="$MULTINODE_HOSTS"
+export MULTINODE_PEER_PODS="$MULTINODE_PEER_PODS"
+export MULTINODE_RANK="$MULTINODE_RANK"
+export MULTINODE_SIZE="$MULTINODE_SIZE"
+export MASTER_ADDR="$MASTER_ADDR"
+export MASTER_PORT="$MASTER_PORT"
 # Function to check for GPU reservation expiry warnings and startup script status
 check_warnings() {{
     # Check for startup script still running
@@ -4444,6 +4558,20 @@ check_warnings() {{
 # Run warning check before every command prompt (zsh hook)
 precmd() {{ check_warnings }}
+# Multinode peer IP resolution (see .bashrc_ext for rationale)
+if [[ -n "\$MULTINODE_HOSTS" ]]; then
+    _MULTINODE_IPS=""
+    for _h in \${{(s:,:)MULTINODE_HOSTS}}; do
+        _ip=\$(getent hosts "\$_h" 2>/dev/null | awk '{{print \$1}}' | head -1)
+        if [[ -n "\$_ip" ]]; then
+            _MULTINODE_IPS="\${{_MULTINODE_IPS:+\$_MULTINODE_IPS,}}\$_ip"
+        fi
+    done
+    export MULTINODE_IPS="\$_MULTINODE_IPS"
+    [[ -n "\$MULTINODE_IPS" ]] && export MASTER_IP="\${{MULTINODE_IPS%%,*}}"
+    unset _MULTINODE_IPS _h _ip
+fi
 EOF_ZSHRC_EXT
                         chown 1081:1081 /home/dev/.bashrc_ext /home/dev/.zshrc_ext
@@ -5174,7 +5302,7 @@ EOF
                         client.V1EnvVar(
                             name="NVIDIA_DRIVER_CAPABILITIES", value="compute,utility"
                         )
-                    ] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type),
+                    ] + get_nccl_env_vars(gpu_type) + get_cpu_thread_env_vars(gpu_count, gpu_type) + _get_multinode_env_vars(multinode_peer_pods, multinode_rank),
                     resources=client.V1ResourceRequirements(
                         limits=get_pod_resource_limits(
                             gpu_count, gpu_type, is_multinode),
@@ -6319,6 +6447,9 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
             "nvidia.com/mig-3g.40gb": "h100-mig-3g",
             "nvidia.com/mig-4g.40gb": "h100-mig-4g",
             "nvidia.com/mig-7g.80gb": "h100-mig-7g",
+            "nvidia.com/mig-1g.23gb": "b200-mig-1g",
+            "nvidia.com/mig-2g.45gb": "b200-mig-2g",
+            "nvidia.com/mig-3g.90gb": "b200-mig-3g",
         }
         if pod.spec.containers:
             for c in pod.spec.containers:

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_client.py RENAMED Viewed

@@ -31,9 +31,14 @@ def get_bearer_token() -> str:
     """
     Create a k8s-aws-v1 bearer token by presigning STS:GetCallerIdentity.
     IMPORTANT: base64url-encode the FULL presigned URL, then strip padding.
+    expires_in must match _EFFECTIVE_TOKEN_TTL: previously this was 60s while the cache
+    held the token for 14 min, so warm Lambda containers handed EKS expired URLs and got
+    401s for ~13 min until the next refresh. 900s is the typical EKS get-token default
+    and the max for IAM-role-derived presigned URLs.
     """
     logger.info("Starting bearer token generation")
-    STS_TOKEN_EXPIRES_IN = 60
+    STS_TOKEN_EXPIRES_IN = 900
     session = boto3.session.Session(region_name=REGION)
     logger.info(f"Created boto3 session for region {REGION}")

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda.tf RENAMED Viewed

@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
       HOSTED_ZONE_ID                     = local.effective_domain_name != "" ? local.hosted_zone_id : ""
       SSH_DOMAIN_MAPPINGS_TABLE          = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
       SSL_CERTIFICATE_ARN                = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
-      LAMBDA_VERSION                     = "0.5.17"
+      LAMBDA_VERSION                     = "0.5.22"
       MIN_CLI_VERSION                    = "0.5.16"
       DISK_CONTENTS_BUCKET               = aws_s3_bucket.disk_contents.bucket
       OPERATIONS_TABLE                   = aws_dynamodb_table.operations.name

gpu_dev-0.5.19/tests/submit/README.md ADDED Viewed

@@ -0,0 +1,63 @@
+# `gpu-dev submit` smoke tests
+Three tests that exercise `gpu-dev submit` end-to-end. Each test lives in its
+own folder so you can pass it to `--runtime` directly. Output files written by the
+script are pulled back into the same folder via the post-run rsync.
+> Requires `gpu-dev >= 0.5.19`. No Lambda update needed.
+## 1. success — single T4 GPU, exit 0
+```bash
+cd tests/submit/success
+gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
+echo $?    # 0
+ls         # nvidia-info.txt, compute.txt, status.txt all created
+```
+## 2. fail — single T4 GPU, exit 7
+Writes a partial file before exploding so you can confirm rsync still pulls
+output on failure and the local exit code is the remote's.
+```bash
+cd tests/submit/fail
+gpu-dev submit --gpu-type t4 --gpus 1 --runtime ./ -- bash run.sh
+echo $?    # 7
+ls         # step1.txt, step2.txt, gpus-before-fail.txt — but no step3.txt
+```
+## 3. multinode — 2x H100 nodes, exit 0
+Reserves 16 H100s (= 2 nodes), verifies env vars + peer ssh + NCCL all_reduce
+across the whole cluster via mpirun (orchestrated entirely from rank 0).
+```bash
+cd tests/submit/multinode
+gpu-dev submit --gpu-type h100 --gpus 16 --runtime ./ -- bash run.sh
+echo $?    # 0
+cat multinode-env.txt resolved-ips.txt peer-ssh.txt nccl-all_reduce.log
+```
+## What each test proves
+| Test       | Proves                                                                        |
+|------------|-------------------------------------------------------------------------------|
+| success    | reserve → rsync up → exec → rsync back → cancel → exit 0                      |
+| fail       | exit code propagation; rsync-back still runs on non-zero exit; cancel fires   |
+| multinode  | MULTINODE_* env vars; peer DNS / passwordless ssh; cross-node NCCL via mpirun |
+After every run, `gpu-dev list` should show none of the test reservations — each one is auto-cancelled.
+Use `--keep-alive` on any of them if you want to debug interactively afterward.
+## Other submit flags (forwarded to `reserve`)
+- `--hours N` — reservation lifetime ceiling (default 1.0)
+- `--disk NAME` — attach a persistent disk to the master node
+- `--no-persistent-disk` — skip persistent disk
+- `--dockerfile PATH` — build a custom image from this Dockerfile
+- `--dockerimage REF` — use a pre-built container image
+- `--preserve-entrypoint` — keep the custom image's ENTRYPOINT (you must run sshd yourself for submit to work)
+- `--timeout MINUTES` — wait-for-active timeout (default 1440 = 24h, since reservations may queue)
+- `--no-pull` — skip the post-run sync-back
+- `--keep-alive` — skip auto-cancel

gpu_dev-0.5.19/tests/submit/fail/run.sh ADDED Viewed

@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Failure test for `gpu-dev submit`: writes a partial output, then exits 7.
+# Verifies the post-run rsync still pulls the partial files even on failure,
+# the auto-cancel runs on non-zero exit, and the local exit code is preserved.
+set -e
+echo "=== host ==="
+hostname
+date -u
+# Write a partial file so we can verify it was synced back
+echo "step1 done at $(date -u)" > step1.txt
+nvidia-smi -L > gpus-before-fail.txt
+# Now error out
+echo "About to fail..." > step2.txt
+python3 -c "import sys; sys.exit(7)"
+# Should not reach here
+echo "should-not-appear" > step3.txt

gpu_dev-0.5.19/tests/submit/multinode/run.sh ADDED Viewed

@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# Multinode test for `gpu-dev submit`: runs on rank 0 only and orchestrates the
+# whole cluster via mpirun (uses passwordless ssh + the headless service DNS we
+# already set up). Verifies env vars, peer connectivity, and an actual NCCL
+# all_reduce across all nodes.
+#
+# Files written into this script's directory: multinode-env.txt,
+# resolved-ips.txt, peer-ssh.txt, gpus-per-node.txt, nccl-all_reduce.log,
+# summary.txt.
+# Exit status: 0 on success; 2 when MULTINODE_HOSTS is unset/empty or a host
+# cannot be resolved; otherwise the status of the first failing command.
+set -euo pipefail
+cd "$(dirname "$0")"
+echo "=== rank 0 host: $(hostname) at $(date -u) ==="
+echo "=== multinode env ==="
+# Every variable is expanded with a `:-` default so that `set -u` cannot abort
+# the script inside this dump; a missing MULTINODE_HOSTS is reported by the
+# explicit check right after it instead of a bare "unbound variable" error.
+{
+  echo "MULTINODE_HOSTS=${MULTINODE_HOSTS:-}"
+  echo "MULTINODE_PEER_PODS=${MULTINODE_PEER_PODS:-}"
+  echo "MULTINODE_RANK=${MULTINODE_RANK:-}"
+  echo "MULTINODE_SIZE=${MULTINODE_SIZE:-}"
+  echo "MASTER_ADDR=${MASTER_ADDR:-}"
+  echo "MASTER_PORT=${MASTER_PORT:-}"
+  echo "MULTINODE_IPS=${MULTINODE_IPS:-(not set)}"
+} | tee multinode-env.txt
+if [[ -z "${MULTINODE_HOSTS:-}" ]]; then
+    echo "ERROR: MULTINODE_HOSTS empty — submit with --gpus >= 16 on h100" >&2
+    exit 2
+fi
+# Resolve IPs even if the bashrc helper didn't run (defensive)
+IPS=""
+for h in $(echo "$MULTINODE_HOSTS" | tr ',' ' '); do
+    # With pipefail a failed lookup fails the whole pipeline; catch it here so
+    # the abort carries a message instead of being a silent `set -e` exit.
+    if ! ip=$(getent hosts "$h" | awk '{print $1}' | head -1); then
+        echo "ERROR: could not resolve host '$h'" >&2
+        exit 2
+    fi
+    if [[ -n "$ip" ]]; then
+        IPS="${IPS:+$IPS,}$ip"
+    fi
+done
+echo "Resolved IPS=$IPS" | tee resolved-ips.txt
+echo "=== peer ssh check (port 2222 inside cluster) ==="
+# The second entry of the comma-separated host list is used as the peer.
+peer_host=$(echo "$MULTINODE_HOSTS" | cut -d, -f2)
+ssh -o StrictHostKeyChecking=no -p 2222 "$peer_host" 'hostname; nvidia-smi -L | wc -l' \
+    | tee peer-ssh.txt
+# GPU count is taken from this node and applied to every host in HOST_ARG.
+GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
+echo "GPUS_PER_NODE=$GPUS_PER_NODE" | tee gpus-per-node.txt
+# Build --host arg: ip1:N,ip2:N,...
+HOST_ARG=$(echo "$IPS" | awk -v g="$GPUS_PER_NODE" -F, '{out=""; for(i=1;i<=NF;i++){out=out ($i ":" g) (i<NF?",":"")}; print out}')
+echo "HOST_ARG=$HOST_ARG"
+echo "=== NCCL all_reduce_perf via mpirun ==="
+# Note: -g 1 = 1 GPU per process, -n 20 iterations. Sweep 1M..1G in factor-of-2 steps.
+mpirun --host "$HOST_ARG" \
+    --mca plm_rsh_args "-p 2222 -o StrictHostKeyChecking=no" \
+    -x PATH -x LD_LIBRARY_PATH \
+    -x FI_PROVIDER -x FI_EFA_USE_DEVICE_RDMA \
+    -x NCCL_NET_GDR_LEVEL -x NCCL_ALGO \
+    -x NCCL_SOCKET_IFNAME -x NCCL_DEBUG -x NCCL_IB_HCA \
+    /opt/nccl-tests/build/all_reduce_perf -b 1M -e 1G -f 2 -g 1 -n 20 \
+    2>&1 | tee nccl-all_reduce.log
+echo "=== summary ==="
+# Defaults again: an unset rank/size must not fail the run after NCCL succeeded.
+{
+    echo "rank=${MULTINODE_RANK:-} size=${MULTINODE_SIZE:-}"
+    echo "host_arg=$HOST_ARG"
+    echo "completed at $(date -u)"
+} | tee summary.txt
+echo "DONE"

gpu_dev-0.5.19/tests/submit/success/run.sh ADDED Viewed

@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Smoke test for `gpu-dev submit`: runs on a single GPU, expected exit 0.
+set -euo pipefail
+echo "=== host ==="
+hostname
+date -u
+echo "=== nvidia-smi ==="
+nvidia-smi | tee nvidia-info.txt
+echo "=== compute ==="
+python3 - <<'PY' | tee compute.txt
+import torch
+assert torch.cuda.is_available(), "CUDA not available"
+n = torch.cuda.device_count()
+x = torch.arange(1_000_000, device="cuda", dtype=torch.float32)
+s = x.sum().item()
+print(f"devices={n} sum(0..999_999)={s}")
+PY
+echo "ok at $(date -u)" > status.txt
+echo "DONE"

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/.github/workflows/no-gitlinks.yml RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/.github/workflows/publish.yml RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/.gitignore RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/CLAUDE.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/PROGRESS.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/PR_DESCRIPTION.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/TODO.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/admin/README.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/admin/generate_stats.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/admin/requirements.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/README.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/gpu-dev-cli/minimal-iam-policy.json RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/cli-tools/scripts/clear_stale_disk_locks.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/USER_GUIDE.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/devgpu-features.html RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/docker-mark-blue.svg RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/docs/icons8-cursor-ai.svg RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/post.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/setup.cfg RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/.claude/skills/deploy.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/.terraform.lock.hcl RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/README.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/alb.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/availability.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/backend.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/.dockerignore RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/Dockerfile RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/backup-dotfiles RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bash_profile RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/bashrc_ext RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/build-with-efa.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/list-dotfile-versions RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/motd_script RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/nproc_wrapper RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/profile RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/restore-dotfiles-version RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/setup-dotfiles-persistence RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/shell_env RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/ssh_config RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zprofile RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker/zshrc_ext RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-build.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/Dockerfile RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/docker-example/hello.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ecr.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/efs.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/eks.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/expiry.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/git-cache.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/kubernetes.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/index.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/index.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/__init__.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/alb_utils.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/dns_utils.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/requirements.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/main.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-config.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/mig-parted-config.yaml RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/check_snapshots.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/migrations/run_backfill.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/monitoring.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/outputs.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/pyproject.toml RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/queue.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/route53.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/s3-disk-contents.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/scripts/inspect_user_data.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/Dockerfile RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/proxy.py RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy/requirements.txt RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy-service.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/ssh-proxy.tf RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/switch-to.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/al2023-user-data.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data-self-managed.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/templates/user-data.sh RENAMED Viewed

File without changes

{gpu_dev-0.5.17 → gpu_dev-0.5.19}/terraform-gpu-devservers/variables.tf RENAMED Viewed

File without changes

gpu-dev 0.5.17__tar.gz → 0.5.19__tar.gz

gpu-dev 0.5.17tar.gz → 0.5.19tar.gz