PyPI - gpu-dev - Versions diffs - 0.4.1__tar.gz → 0.5.0__tar.gz - Mend

gpu-dev 0.4.1 (tar.gz) → 0.5.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115)

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.4.1
+Version: 0.5.0
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.4.1
+Version: 0.5.0
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py RENAMED Viewed

@@ -498,9 +498,9 @@ def main(ctx: click.Context) -> None:
     "--gpu-type",
     "-t",
     type=click.Choice(
-        ["b200", "h200", "h100", "a100", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
+        ["b200", "h200", "h100", "a100", "g7e", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
     ),
-    help="GPU type to reserve (b200/h200/h100/a100/a10g/t4/l4/t4-small/cpu-arm/cpu-x86)",
+    help="GPU type to reserve (b200/h200/h100/a100/g7e/a10g/t4/l4/t4-small/cpu-arm/cpu-x86)",
 )
 @click.option(
     "--hours",
@@ -652,6 +652,7 @@ def reserve(
             "t4": {"max_gpus": 4, "instance_type": "g4dn.12xlarge"},
             "l4": {"max_gpus": 4, "instance_type": "g6.12xlarge"},
             "a10g": {"max_gpus": 4, "instance_type": "g5.12xlarge"},
+            "g7e": {"max_gpus": 4, "instance_type": "g7e.24xlarge"},
             "t4-small": {"max_gpus": 1, "instance_type": "g4dn.xlarge"},
             "a100": {"max_gpus": 8, "instance_type": "p4d.24xlarge"},
             "h100": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
@@ -2397,6 +2398,7 @@ def _show_availability() -> None:
                 "a100": "Ampere (sm80)",
                 "a10g": "Ampere (sm80)",
                 "l4": "Ada Lovelace (sm89)",
+                "g7e": "Blackwell (sm120)",
                 "t4": "Turing (sm75)",
                 "cpu-x86": "CPU (x86_64)",
                 "cpu-arm": "CPU (arm64)",
@@ -2405,6 +2407,7 @@ def _show_availability() -> None:
             # Sort order: newest GPU architectures first, then CPUs at the bottom
             arch_priority = {
                 "Blackwell (sm100)": 0,
+                "Blackwell (sm120)": 0,
                 "Hopper (sm90)": 1,
                 "Ada Lovelace (sm89)": 2,
                 "Ampere (sm80)": 3,
@@ -2544,6 +2547,7 @@ def _show_availability_watch(interval: int) -> None:
                             "a100": "Ampere (sm80)",
                             "a10g": "Ampere (sm80)",
                             "l4": "Ada Lovelace (sm89)",
+                            "g7e": "Blackwell (sm120)",
                             "t4": "Turing (sm75)",
                             "cpu-x86": "CPU (x86_64)",
                             "cpu-arm": "CPU (arm64)",
@@ -2552,6 +2556,7 @@ def _show_availability_watch(interval: int) -> None:
                         # Sort order: newest GPU architectures first, then CPUs at the bottom
                         arch_priority = {
                             "Blackwell (sm100)": 0,
+                            "Blackwell (sm120)": 0,
                             "Hopper (sm90)": 1,
                             "Ada Lovelace (sm89)": 2,
                             "Ampere (sm80)": 3,

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py RENAMED Viewed

@@ -153,7 +153,7 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
         # CPU instances don't have GPUs, but we still need a "count" for nodes
         valid_counts = [0]  # 0 GPUs for CPU-only instances
         multinode_counts = []  # No multinode for CPU instances
-    elif gpu_type in ["t4", "l4", "a10g"]:
+    elif gpu_type in ["t4", "l4", "a10g", "g7e"]:
         valid_counts = [1, 2, 4]
         # Add multinode options
         multinode_counts = [8, 12, 16, 20, 24]  # multiples of 4

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py RENAMED Viewed

@@ -535,6 +535,7 @@ class ReservationManager:
                 "t4": {"max_gpus": 4},
                 "l4": {"max_gpus": 4},
                 "a10g": {"max_gpus": 4},
+                "g7e": {"max_gpus": 4},
                 "t4-small": {"max_gpus": 1},
                 "g5g": {"max_gpus": 2},
                 "a100": {"max_gpus": 8},

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gpu-dev"
-version = "0.4.1"
+version = "0.5.0"
 description = "CLI tool for PyTorch GPU developer server reservations"
 authors = [{name = "PyTorch Team"}]
 readme = "cli-tools/gpu-dev-cli/README.md"

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/Dockerfile RENAMED Viewed

@@ -1,6 +1,6 @@
 # Custom PyTorch GPU Development Server Image
-# Based on pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel
-FROM pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel
+# Based on pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
+FROM pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
 # Set environment variables for non-interactive installation
 ENV DEBIAN_FRONTEND=noninteractive
@@ -41,23 +41,22 @@ RUN apt-get install -y --no-install-recommends \
 RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     apt-get install -y nodejs
-# Install CUDA 13.0 alongside existing CUDA 12.9
+# Install CUDA 12.9, 13.0, 13.1, 13.2 alongside base CUDA 12.8
+# Base image already has NVIDIA repo configured, no need for cuda-keyring
 RUN apt-get update && apt-get install -y --no-install-recommends \
-        software-properties-common \
-    && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
-    && dpkg -i cuda-keyring_1.0-1_all.deb \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends \
+        cuda-toolkit-12-9 \
         cuda-toolkit-13-0 \
-    && rm cuda-keyring_1.0-1_all.deb \
+        cuda-toolkit-13-1 \
+        cuda-toolkit-13-2 \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
-# Set CUDA paths for both versions - 12.8 as default for PyTorch compatibility
-ENV CUDA_12_PATH=/usr/local/cuda-12.8
-ENV CUDA_13_PATH=/usr/local/cuda-13.0
-ENV PATH=/usr/local/cuda-12.8/bin:/usr/local/cuda-13.0/bin:${PATH}
-ENV LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:/usr/local/cuda-13.0/lib64:${LD_LIBRARY_PATH}
+# CUDA 12.8 is the default (PyTorch compiled against it)
+# All versions available at /usr/local/cuda-{12.8,12.9,13.0,13.1,13.2}/
+# Switch with: export CUDA_HOME=/usr/local/cuda-13.2
+ENV CUDA_HOME=/usr/local/cuda-12.8
+ENV PATH=/usr/local/cuda-12.8/bin:${PATH}
+ENV LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:${LD_LIBRARY_PATH}
 # Install EFA stack (prebuilt libfabric + OpenMPI + aws-ofi-nccl with GPU/RDMA support)
 # Uses AWS EFA installer which bundles tested, compatible versions of all components
@@ -80,10 +79,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && apt-get clean && rm -rf /var/lib/apt/lists/*
 # Clone and build NCCL tests with MPI support for multi-node benchmarking
+RUN apt-get update && apt-get install -y --no-install-recommends libnccl-dev \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
 RUN cd /opt && \
     git clone https://github.com/NVIDIA/nccl-tests.git && \
     cd nccl-tests && \
-    make MPI=1 MPI_HOME=/opt/amazon/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/usr/lib/x86_64-linux-gnu -j$(nproc)
+    make MPI=1 MPI_HOME=/opt/amazon/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/usr -j$(nproc)
 # Set environment variables for EFA and NCCL
 ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}
@@ -101,7 +102,7 @@ ENV NCCL_ASYNC_ERROR_HANDLING=1
 ENV SUPPORTS_EFA=true
 # Install Python packages (Jupyter and common ML packages)
-RUN pip install --no-cache-dir \
+RUN pip install --no-cache-dir --break-system-packages \
         jupyterlab \
         ipywidgets \
         matplotlib \

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/shell_env RENAMED Viewed

@@ -1,10 +1,11 @@
 # Clean PATH setup (no duplicates)
 export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
-# CUDA environment
-export CUDA_HOME=/usr/local/cuda
-export PATH="/usr/local/cuda/bin:$PATH"
-export LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}"
+# CUDA environment (12.8 default, also available: 12.9, 13.0, 13.1, 13.2)
+# Switch with: export CUDA_HOME=/usr/local/cuda-13.2 && export PATH="$CUDA_HOME/bin:$PATH"
+export CUDA_HOME=/usr/local/cuda-12.8
+export PATH="$CUDA_HOME/bin:$PATH"
+export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
 # EFA and OpenMPI environment for multi-node GPU communication
 export PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:$PATH"
@@ -31,5 +32,4 @@ export CCACHE_DIR="/ccache_shared"
 export CCACHE_MAXSIZE="10G"
 # Claude Code configuration for Bedrock
-export CLAUDE_CODE_USE_BEDROCK=1
-export ANTHROPIC_MODEL="us.anthropic.claude-sonnet-4-20250514-v1:0"
+export CLAUDE_CODE_USE_BEDROCK=1

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/eks.tf RENAMED Viewed

@@ -83,13 +83,13 @@ resource "aws_iam_role_policy" "eks_node_bedrock_policy" {
         Effect = "Allow"
         Action = [
           "bedrock:InvokeModel",
-          "bedrock:InvokeModelWithResponseStream"
-        ]
-        Resource = [
-          "arn:aws:bedrock:*:*:foundation-model/anthropic.claude-*",
-          "arn:aws:bedrock:*:*:inference-profile/us.anthropic.claude-*",
-          "arn:aws:bedrock:*:*:inference-profile/global.anthropic.claude-*"
+          "bedrock:InvokeModelWithResponseStream",
+          "bedrock:ListInferenceProfiles",
+          "bedrock:GetInferenceProfile",
+          "bedrock:ListFoundationModels",
+          "bedrock-mantle:*"
         ]
+        Resource = "*"
       },
       {
         Effect = "Allow"
@@ -185,6 +185,7 @@ locals {
     "t4-az2"   = "t4" # Both t4 and t4-az2 should be labeled as "t4" in Kubernetes
     "l4"       = "l4"
     "a10g"     = "a10g"
+    "g7e"      = "g7e"
     "h100"     = "h100"
     "h200"     = "h200"
     "b200"     = "b200"

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/git-cache.tf RENAMED Viewed

@@ -228,29 +228,29 @@ NGINXCONF
                   fi
                 done
-                # Create bare .git tarball (much faster - no checkout needed!)
-                echo "[CACHE] Creating pytorch .git tarball..."
+                # Create tarballs for main repo + ALL submodules
+                # Naming convention: org_repo-git.tar.gz (matches git-clone-cached client)
+                echo "[CACHE] Creating tarballs..."
                 cd /git-cache
-                rm -f pytorch-git.tar.gz.tmp
-                # Just tar up the bare repo (pack files only, no working tree)
-                # Client will do git checkout after download (unavoidable anyway)
-                tar -czf pytorch-git.tar.gz.tmp -C /git-cache pytorch.git
-                mv pytorch-git.tar.gz.tmp pytorch-git.tar.gz
+                # Main pytorch repo — name must match org_repo convention
+                echo "[CACHE]   Creating pytorch_pytorch-git.tar.gz..."
+                rm -f pytorch_pytorch-git.tar.gz.tmp
+                tar -czf pytorch_pytorch-git.tar.gz.tmp -C /git-cache pytorch.git
+                mv pytorch_pytorch-git.tar.gz.tmp pytorch_pytorch-git.tar.gz
+                SIZE=$(du -sh pytorch_pytorch-git.tar.gz | awk '{print $1}')
+                echo "[CACHE]   pytorch_pytorch: $SIZE"
-                SIZE=$(du -sh pytorch-git.tar.gz | awk '{print $1}')
-                echo "[CACHE] Bare .git tarball created: $SIZE"
-                # Create tarballs for largest submodules (top 10 by size)
-                echo "[CACHE] Creating submodule tarballs..."
-                for repo in $(du -s /git-cache/*.git 2>/dev/null | sort -rn | head -11 | tail -10 | awk '{print $2}'); do
+                # All submodule repos (already named org_repo.git by init container)
+                for repo in /git-cache/*.git; do
                   name=$(basename "$repo")
+                  [ "$name" = "pytorch.git" ] && continue
                   tarball="$${name%.git}-git.tar.gz"
                   echo "[CACHE]   Creating $tarball..."
                   rm -f "$tarball.tmp" 2>/dev/null
                   tar -czf "$tarball.tmp" -C /git-cache "$name" 2>/dev/null && mv "$tarball.tmp" "$tarball" || echo "[CACHE]   WARNING: Failed to create $tarball"
                 done
-                echo "[CACHE] Submodule tarballs created"
+                echo "[CACHE] All tarballs created"
               fi
               echo "[CACHE] Refresh complete at $(date). Next in 3600s (1 hour)..."

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/availability_updater/index.py RENAMED Viewed

@@ -150,7 +150,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
             else:
                 available_gpus = total_gpus
         else:
-            # GPU nodes - use existing logic
+            # GPU nodes - use K8s schedulable node count for total if available
             total_gpus = running_instances * gpus_per_instance
             logger.info(
                 f"ASG calculation: {running_instances} instances * {gpus_per_instance} GPUs = {total_gpus} total GPUs")
@@ -181,6 +181,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
                 nodes = v1.list_node(label_selector=f"GpuType={gpu_type}")
                 single_node_max = 0  # Max available on any single node
+                schedulable_total_gpus = 0  # Total GPUs on schedulable (non-cordoned) nodes
                 for node in nodes.items:
                     if is_node_ready_and_schedulable(node):
                         available_on_node = get_available_gpus_on_node(v1, node)
@@ -192,6 +193,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
                             except (ValueError, TypeError):
                                 pass
+                        schedulable_total_gpus += total_on_node
                         # Track max available on any single node
                         single_node_max = max(single_node_max, available_on_node)
@@ -199,6 +202,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
                         if total_on_node > 0 and available_on_node == total_on_node:
                             full_nodes_available += 1
+                total_gpus = schedulable_total_gpus
                 # Calculate max reservable considering multinode scenarios
                 # Only high-end GPU types support multinode (up to 4 nodes = 32 GPUs)
                 multinode_gpu_types = ['h100', 'h200', 'b200', 'a100']

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py RENAMED Viewed

@@ -49,7 +49,7 @@ DEFAULT_TIMEOUT_HOURS = int(os.environ["DEFAULT_TIMEOUT_HOURS"])
 QUEUE_URL = os.environ["QUEUE_URL"]
 PRIMARY_AVAILABILITY_ZONE = os.environ["PRIMARY_AVAILABILITY_ZONE"]
 GPU_DEV_CONTAINER_IMAGE = os.environ.get(
-    "GPU_DEV_CONTAINER_IMAGE", "pytorch/pytorch:2.8.0-cuda12.9-cudnn9-devel")
+    "GPU_DEV_CONTAINER_IMAGE", "pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel")
 EFS_SECURITY_GROUP_ID = os.environ.get("EFS_SECURITY_GROUP_ID")
 EFS_SUBNET_IDS = os.environ.get("EFS_SUBNET_IDS", "").split(
     ",") if os.environ.get("EFS_SUBNET_IDS") else []
@@ -66,6 +66,7 @@ GPU_CONFIG = {
     "t4": {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0},
     "l4": {"instance_type": "g6.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
     "a10g": {"instance_type": "g5.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
+    "g7e": {"instance_type": "g7e.24xlarge", "max_gpus": 4, "cpus": 96, "memory_gb": 1024, "efa_count": 2},
     "t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
     "g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
     "a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
@@ -2150,7 +2151,7 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
     gpu_type = request.get("gpu_type", "")
     # Validate GPU type
-    valid_gpu_types = ["t4", "l4", "a10g", "t4-small", "a100",
+    valid_gpu_types = ["t4", "l4", "a10g", "g7e", "t4-small", "a100",
                        "h100", "h200", "b200", "cpu-arm", "cpu-x86"]
     if gpu_type not in valid_gpu_types:
         error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
@@ -2381,6 +2382,7 @@ def update_gpu_availability_table(
             "t4": {"gpus_per_instance": 4},
             "l4": {"gpus_per_instance": 4},
             "a10g": {"gpus_per_instance": 4},
+            "g7e": {"gpus_per_instance": 4},
             "a100": {"gpus_per_instance": 8},
             "h100": {"gpus_per_instance": 8},
             "h200": {"gpus_per_instance": 8},
@@ -4570,7 +4572,7 @@ EOFREADME
                         cat > /usr/local/bin/git-clone-cached << 'GITCACHESCRIPT'
 #!/bin/bash
-# Clones from bare .git tarball if available in cache (10x faster than git protocol)
+# Clones repo + submodules from in-cluster cache (much faster than GitHub)
 CACHE_URL="http://git-cache.management.svc.cluster.local:8080"
 GIT="/usr/bin/git"
 GITHUB_URL="${{1}}"
@@ -4582,59 +4584,95 @@ if [ -z "$GITHUB_URL" ]; then
     DEST="${{DEST:-pytorch}}"
 fi
+# Handle short names: "pytorch" -> "https://github.com/pytorch/pytorch.git"
+if [[ ! "$GITHUB_URL" =~ ^https?:// ]] && [[ ! "$GITHUB_URL" =~ ^git@ ]]; then
+    GITHUB_URL="https://github.com/pytorch/$GITHUB_URL.git"
+    DEST="${{DEST:-${{1}}}}"
+fi
 # Extract org/repo from GitHub URL and create cache tarball name
-# https://github.com/pytorch/pytorch.git -> pytorch_pytorch-git.tar.gz
-# https://github.com/ROCm/aiter.git -> ROCm_aiter-git.tar.gz
 if [[ "$GITHUB_URL" =~ github\.com[/:]([^/]+)/([^/\.]+) ]]; then
     ORG="${{BASH_REMATCH[1]}}"
     REPO="${{BASH_REMATCH[2]}}"
     TARBALL="${{ORG}}_${{REPO}}-git.tar.gz"
 else
-    # Not a GitHub URL, fall back to direct clone
-    exec "$GIT" clone "$GITHUB_URL" "${{DEST:+"$DEST"}}"
-fi
-# Default destination to repo name if not specified
-if [ -z "$DEST" ]; then
-    DEST="$REPO"
+    exec "$GIT" clone --recurse-submodules --jobs 8 "$GITHUB_URL" "${{DEST:+"$DEST"}}"
 fi
-if [ -d "$DEST" ]; then
-    echo "Error: $DEST already exists"
-    exit 1
-fi
+if [ -z "$DEST" ]; then DEST="$REPO"; fi
+if [ -d "$DEST" ]; then echo "Error: $DEST already exists"; exit 1; fi
-# Try to download from cache
-echo "[git-cache] Checking cache for $ORG/$REPO..."
+echo "[git-cache] Cloning $ORG/$REPO..."
 TOTAL_START=$(date +%s)
+# --- Main repo ---
 mkdir -p "$DEST/.git"
 START=$(date +%s)
 if curl -sf "$CACHE_URL/$TARBALL" | tar -xz -C "$DEST/.git" --strip-components=1 2>/dev/null; then
     END=$(date +%s)
-    echo "[git-cache] Downloaded .git in $((END - START))s"
+    echo "[git-cache] Main repo .git: $((END - START))s"
-    # Configure as non-bare repository and set origin
     cd "$DEST"
     "$GIT" config --file .git/config core.bare false
     "$GIT" config --file .git/config remote.origin.url "$GITHUB_URL"
     "$GIT" config --file .git/config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
-    echo "[git-cache] Checking out working tree..."
     START=$(date +%s)
     "$GIT" checkout -f HEAD 2>/dev/null
     END=$(date +%s)
-    echo "[git-cache] Checkout took $((END - START))s"
+    echo "[git-cache] Checkout: $((END - START))s"
+    # --- Submodules from cache ---
+    if [ -f .gitmodules ]; then
+        echo "[git-cache] Setting up submodules..."
+        SUB_START=$(date +%s)
+        "$GIT" submodule init
+        ABS_ROOT="$(pwd)"
+        "$GIT" config --file .gitmodules --get-regexp 'submodule\..*\.url' | while read key url; do
+            name=$(echo "$key" | sed 's/^submodule\.//;s/\.url$//')
+            path=$("$GIT" config --file .gitmodules "submodule.$name.path")
+            [ -z "$path" ] && continue
+            COMMIT=$("$GIT" ls-tree HEAD "$path" 2>/dev/null | awk '{{print $3}}')
+            [ -z "$COMMIT" ] && continue
+            if [[ "$url" =~ github\.com[/:]([^/]+)/([^/.]+) ]]; then
+                SUB_TARBALL="${{BASH_REMATCH[1]}}_${{BASH_REMATCH[2]}}-git.tar.gz"
+                MODULES_DIR="$ABS_ROOT/.git/modules/$name"
+                mkdir -p "$MODULES_DIR"
+                if curl -sf "$CACHE_URL/$SUB_TARBALL" | tar -xz -C "$MODULES_DIR" --strip-components=1 2>/dev/null; then
+                    "$GIT" -C "$MODULES_DIR" config core.bare false
+                    "$GIT" -C "$MODULES_DIR" config core.worktree "$ABS_ROOT/$path"
+                    "$GIT" -C "$MODULES_DIR" config remote.origin.url "$url"
+                    "$GIT" -C "$MODULES_DIR" config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
+                    mkdir -p "$ABS_ROOT/$path"
+                    echo "gitdir: $MODULES_DIR" > "$ABS_ROOT/$path/.git"
+                    "$GIT" -C "$ABS_ROOT/$path" checkout -f "$COMMIT" 2>/dev/null
+                else
+                    rm -rf "$MODULES_DIR"
+                fi
+            fi
+        done
+        # Fetch remaining/recursive submodules from GitHub
+        "$GIT" -c protocol.file.allow=always submodule update --init --recursive --jobs 8 2>/dev/null
+        SUB_END=$(date +%s)
+        echo "[git-cache] Submodules: $((SUB_END - SUB_START))s"
+    fi
     TOTAL_END=$(date +%s)
-    echo "[git-cache] Total: $((TOTAL_END - TOTAL_START))s (from cache)"
+    echo "[git-cache] Total: $((TOTAL_END - TOTAL_START))s"
     exit 0
 fi
 # Fallback: clone from GitHub
 echo "[git-cache] Cache miss, cloning from GitHub..."
 rm -rf "$DEST"
-"$GIT" clone "$GITHUB_URL" "${{DEST:+"$DEST"}}"
+"$GIT" clone --recurse-submodules --jobs 8 "$GITHUB_URL" "${{DEST:+"$DEST"}}"
 GITCACHESCRIPT
                         chmod +x /usr/local/bin/git-clone-cached
                         echo "[STARTUP] ✓ git-clone-cached available (opt-in: use 'git-clone-cached pytorch' for cache)"

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda.tf RENAMED Viewed

@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
       HOSTED_ZONE_ID                     = local.effective_domain_name != "" ? local.hosted_zone_id : ""
       SSH_DOMAIN_MAPPINGS_TABLE          = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
       SSL_CERTIFICATE_ARN                = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
-      LAMBDA_VERSION                     = "0.4.1"
-      MIN_CLI_VERSION                    = "0.4.0"
+      LAMBDA_VERSION                     = "0.5.0"
+      MIN_CLI_VERSION                    = "0.5.0"
       DISK_CONTENTS_BUCKET               = aws_s3_bucket.disk_contents.bucket
       OPERATIONS_TABLE                   = aws_dynamodb_table.operations.name
     }, local.alb_env_vars)

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/main.tf RENAMED Viewed

@@ -79,7 +79,7 @@ locals {
         "cpu-arm" = {
           instance_type       = "c7g.4xlarge"
           instance_types      = null
-          instance_count      = 3
+          instance_count      = 1
           gpus_per_instance   = 0
           use_placement_group = false
           architecture        = "arm64"
@@ -88,7 +88,7 @@ locals {
         "cpu-x86" = {
           instance_type       = "c7i.4xlarge"
           instance_types      = null
-          instance_count      = 3
+          instance_count      = 1
           gpus_per_instance   = 0
           use_placement_group = false
           architecture        = "x86_64"
@@ -97,7 +97,7 @@ locals {
         "t4" = {
           instance_type       = "g4dn.12xlarge"
           instance_types      = null
-          instance_count      = 2 # 2 instances in primary AZ
+          instance_count      = 1
           gpus_per_instance   = 4
           use_placement_group = true
           architecture        = "x86_64"
@@ -106,7 +106,7 @@ locals {
         "t4-az2" = {
           instance_type       = "g4dn.12xlarge"
           instance_types      = null
-          instance_count      = 2 # 2 instances in secondary AZ
+          instance_count      = 0 # Disabled - use primary AZ only for testing
           gpus_per_instance   = 4
           use_placement_group = true
           architecture        = "x86_64"
@@ -115,7 +115,7 @@ locals {
         "h100" = {
           instance_type       = "p5.48xlarge"
           instance_types      = null
-          instance_count      = 2 # Fallback default (not used when capacity_reservations defined)
+          instance_count      = 0 # Disabled - only use via CR when needed
           gpus_per_instance   = 8
           use_placement_group = false
           architecture        = "x86_64"
@@ -124,7 +124,7 @@ locals {
         "t4-small" = {
           instance_type       = "g4dn.2xlarge"
           instance_types      = null
-          instance_count      = 1
+          instance_count      = 0 # Disabled
           gpus_per_instance   = 1
           use_placement_group = false
           architecture        = "x86_64"
@@ -183,7 +183,7 @@ locals {
         "t4" = {
           instance_type       = "g4dn.12xlarge"
           instance_types      = null
-          instance_count      = 5 # Fallback default (not used when capacity_reservations defined)
+          instance_count      = 2
           gpus_per_instance   = 4
           use_placement_group = true
           architecture        = "x86_64"
@@ -192,7 +192,7 @@ locals {
         "l4" = {
           instance_type       = "g6.12xlarge"
           instance_types      = null
-          instance_count      = 5 # Fallback default (not used when capacity_reservations defined)
+          instance_count      = 2
           gpus_per_instance   = 4 # 4x L4 GPUs
           use_placement_group = false
           architecture        = "x86_64"
@@ -201,16 +201,25 @@ locals {
         "a10g" = {
           instance_type       = "g5.12xlarge"
           instance_types      = null
-          instance_count      = 2
+          instance_count      = 1
           gpus_per_instance   = 4 # 4x A10G GPUs
           use_placement_group = false
           architecture        = "x86_64"
           efa_network_cards   = 1
         }
+        "g7e" = {
+          instance_type       = "g7e.24xlarge"
+          instance_types      = null
+          instance_count      = 2
+          gpus_per_instance   = 4 # 4x RTX PRO 6000 Blackwell GPUs
+          use_placement_group = false
+          architecture        = "x86_64"
+          efa_network_cards   = 2
+        }
         "cpu-arm" = {
           instance_type       = "c7g.8xlarge"
           instance_types      = null
-          instance_count      = 30
+          instance_count      = 10
           gpus_per_instance   = 0
           use_placement_group = false
           architecture        = "arm64"
@@ -219,7 +228,7 @@ locals {
         "cpu-x86" = {
           instance_type       = "c7i.8xlarge"
           instance_types      = null
-          instance_count      = 30
+          instance_count      = 10
           gpus_per_instance   = 0
           use_placement_group = false
           architecture        = "x86_64"
@@ -257,6 +266,7 @@ locals {
       h100 = [
         { key = "cr0", id = "cr-0a3f49b96fe03ca04", instance_count = 4 }, # H100 reservation us-east-2c (p5.48xlarge)
         { key = "cr1", id = null, instance_count = 2 },                   # H100 on-demand (2 instances)
+        { key = "cr2", id = "cr-044bc72b0a6b56062", instance_count = 4 }, # H100 reservation us-east-2a (4 instances)
       ]
       h200 = [
         { key = "cr0", id = "cr-0f6d0766f5d3339e6", instance_count = 2 }, # H200 capacity block (may be expired - keep to prevent ASG destroy)
@@ -264,9 +274,10 @@ locals {
         { key = "cr2", id = null, instance_count = 2 },                   # H200 on-demand (2 instances)
       ]
       b200 = [
-        { key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 0 }, # B200 reservation (disabled - CR expired)
+        { key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 0 }, # B200 reservation us-east-2a (disabled - CR freed)
         { key = "cr1", id = "cr-08e7fee0b8dc3de5e", instance_count = 3 }, # B200 reservation (3 instances)
         { key = "cr2", id = null, instance_count = 2 },                   # B200 on-demand (2 instances)
+        { key = "cr3", id = "cr-0f5f6bb30a8fe3c68", instance_count = 2 }, # B200 reservation us-east-2b (2 instances)
       ]
       # T4 and L4 don't have capacity reservations - managed via supported_gpu_types fallback
     }
@@ -292,6 +303,7 @@ locals {
       t4        = "primary"
       l4        = "secondary"
       a10g      = "secondary"
+      g7e       = "secondary"
       "cpu-arm" = "primary"
       "cpu-x86" = "primary"
     }
@@ -307,11 +319,13 @@ locals {
       "cr-0c366fb8339a10f69" = "primary"   # us-east-2a
       "cr-0122dff5e01d566dc" = "secondary" # us-east-2b
       "cr-08e7fee0b8dc3de5e" = "secondary" # us-east-2b
+      "cr-0f5f6bb30a8fe3c68" = "secondary" # us-east-2b
       # H200 capacity reservations
       "cr-0f6d0766f5d3339e6" = "tertiary" # us-east-2c (may be expired - kept to prevent ASG destroy)
       "cr-06c9c978dea756a26" = "tertiary"  # us-east-2c
-      # H100 capacity reservation
+      # H100 capacity reservations
       "cr-0a3f49b96fe03ca04" = "tertiary" # us-east-2c (p5.48xlarge)
+      "cr-044bc72b0a6b56062" = "primary"  # us-east-2a (p5.48xlarge)
       # A100 capacity reservation
       "cr-01cc0f00f28b095af" = "primary" # us-east-2a
     }

{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/templates/al2023-user-data.sh RENAMED Viewed

@@ -11,7 +11,7 @@ systemctl disable nodeadm-run.service || true
 systemctl stop nodeadm-config.service || true
 systemctl stop nodeadm-run.service || true
-# Install NVIDIA driver 580.82.07 directly on host for CUDA 13 support
+# Install latest NVIDIA driver on host (595.x branch supports CUDA 13.2)
 # GPU Operator will handle toolkit/device-plugin only
 # Configure NVIDIA profiling BEFORE driver installation (driver install auto-loads modules)