gpu-dev 0.4.1__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/PKG-INFO +1 -1
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +7 -2
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +1 -1
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +1 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/pyproject.toml +1 -1
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/Dockerfile +17 -16
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/shell_env +6 -6
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/eks.tf +7 -6
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/git-cache.tf +14 -14
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/availability_updater/index.py +6 -1
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py +62 -24
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda.tf +2 -2
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/main.tf +27 -13
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/templates/al2023-user-data.sh +1 -1
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/.gitignore +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/CLAUDE.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/PROGRESS.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/TODO.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/admin/README.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/admin/generate_stats.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/admin/requirements.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/post.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/setup.cfg +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -498,9 +498,9 @@ def main(ctx: click.Context) -> None:
|
|
|
498
498
|
"--gpu-type",
|
|
499
499
|
"-t",
|
|
500
500
|
type=click.Choice(
|
|
501
|
-
["b200", "h200", "h100", "a100", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
501
|
+
["b200", "h200", "h100", "a100", "g7e", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
502
502
|
),
|
|
503
|
-
help="GPU type to reserve (b200/h200/h100/a100/a10g/t4/l4/t4-small/cpu-arm/cpu-x86)",
|
|
503
|
+
help="GPU type to reserve (b200/h200/h100/a100/g7e/a10g/t4/l4/t4-small/cpu-arm/cpu-x86)",
|
|
504
504
|
)
|
|
505
505
|
@click.option(
|
|
506
506
|
"--hours",
|
|
@@ -652,6 +652,7 @@ def reserve(
|
|
|
652
652
|
"t4": {"max_gpus": 4, "instance_type": "g4dn.12xlarge"},
|
|
653
653
|
"l4": {"max_gpus": 4, "instance_type": "g6.12xlarge"},
|
|
654
654
|
"a10g": {"max_gpus": 4, "instance_type": "g5.12xlarge"},
|
|
655
|
+
"g7e": {"max_gpus": 4, "instance_type": "g7e.24xlarge"},
|
|
655
656
|
"t4-small": {"max_gpus": 1, "instance_type": "g4dn.xlarge"},
|
|
656
657
|
"a100": {"max_gpus": 8, "instance_type": "p4d.24xlarge"},
|
|
657
658
|
"h100": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
|
|
@@ -2397,6 +2398,7 @@ def _show_availability() -> None:
|
|
|
2397
2398
|
"a100": "Ampere (sm80)",
|
|
2398
2399
|
"a10g": "Ampere (sm80)",
|
|
2399
2400
|
"l4": "Ada Lovelace (sm89)",
|
|
2401
|
+
"g7e": "Blackwell (sm120)",
|
|
2400
2402
|
"t4": "Turing (sm75)",
|
|
2401
2403
|
"cpu-x86": "CPU (x86_64)",
|
|
2402
2404
|
"cpu-arm": "CPU (arm64)",
|
|
@@ -2405,6 +2407,7 @@ def _show_availability() -> None:
|
|
|
2405
2407
|
# Sort order: newest GPU architectures first, then CPUs at the bottom
|
|
2406
2408
|
arch_priority = {
|
|
2407
2409
|
"Blackwell (sm100)": 0,
|
|
2410
|
+
"Blackwell (sm120)": 0,
|
|
2408
2411
|
"Hopper (sm90)": 1,
|
|
2409
2412
|
"Ada Lovelace (sm89)": 2,
|
|
2410
2413
|
"Ampere (sm80)": 3,
|
|
@@ -2544,6 +2547,7 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2544
2547
|
"a100": "Ampere (sm80)",
|
|
2545
2548
|
"a10g": "Ampere (sm80)",
|
|
2546
2549
|
"l4": "Ada Lovelace (sm89)",
|
|
2550
|
+
"g7e": "Blackwell (sm120)",
|
|
2547
2551
|
"t4": "Turing (sm75)",
|
|
2548
2552
|
"cpu-x86": "CPU (x86_64)",
|
|
2549
2553
|
"cpu-arm": "CPU (arm64)",
|
|
@@ -2552,6 +2556,7 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2552
2556
|
# Sort order: newest GPU architectures first, then CPUs at the bottom
|
|
2553
2557
|
arch_priority = {
|
|
2554
2558
|
"Blackwell (sm100)": 0,
|
|
2559
|
+
"Blackwell (sm120)": 0,
|
|
2555
2560
|
"Hopper (sm90)": 1,
|
|
2556
2561
|
"Ada Lovelace (sm89)": 2,
|
|
2557
2562
|
"Ampere (sm80)": 3,
|
|
@@ -153,7 +153,7 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
|
|
|
153
153
|
# CPU instances don't have GPUs, but we still need a "count" for nodes
|
|
154
154
|
valid_counts = [0] # 0 GPUs for CPU-only instances
|
|
155
155
|
multinode_counts = [] # No multinode for CPU instances
|
|
156
|
-
elif gpu_type in ["t4", "l4", "a10g"]:
|
|
156
|
+
elif gpu_type in ["t4", "l4", "a10g", "g7e"]:
|
|
157
157
|
valid_counts = [1, 2, 4]
|
|
158
158
|
# Add multinode options
|
|
159
159
|
multinode_counts = [8, 12, 16, 20, 24] # multiples of 4
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.4.1"
|
|
7
|
+
version = "0.5.0"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Custom PyTorch GPU Development Server Image
|
|
2
|
-
# Based on pytorch/pytorch:2.
|
|
3
|
-
FROM pytorch/pytorch:2.
|
|
2
|
+
# Based on pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
|
|
3
|
+
FROM pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
|
|
4
4
|
|
|
5
5
|
# Set environment variables for non-interactive installation
|
|
6
6
|
ENV DEBIAN_FRONTEND=noninteractive
|
|
@@ -41,23 +41,22 @@ RUN apt-get install -y --no-install-recommends \
|
|
|
41
41
|
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
|
|
42
42
|
apt-get install -y nodejs
|
|
43
43
|
|
|
44
|
-
# Install CUDA 13.0 alongside
|
|
44
|
+
# Install CUDA 12.9, 13.0, 13.1, 13.2 alongside base CUDA 12.8
|
|
45
|
+
# Base image already has NVIDIA repo configured, no need for cuda-keyring
|
|
45
46
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
46
|
-
|
|
47
|
-
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
|
|
48
|
-
&& dpkg -i cuda-keyring_1.0-1_all.deb \
|
|
49
|
-
&& apt-get update \
|
|
50
|
-
&& apt-get install -y --no-install-recommends \
|
|
47
|
+
cuda-toolkit-12-9 \
|
|
51
48
|
cuda-toolkit-13-0 \
|
|
52
|
-
|
|
49
|
+
cuda-toolkit-13-1 \
|
|
50
|
+
cuda-toolkit-13-2 \
|
|
53
51
|
&& apt-get clean \
|
|
54
52
|
&& rm -rf /var/lib/apt/lists/*
|
|
55
53
|
|
|
56
|
-
#
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
ENV
|
|
60
|
-
ENV
|
|
54
|
+
# CUDA 12.8 is the default (PyTorch compiled against it)
|
|
55
|
+
# All versions available at /usr/local/cuda-{12.8,12.9,13.0,13.1,13.2}/
|
|
56
|
+
# Switch with: export CUDA_HOME=/usr/local/cuda-13.2
|
|
57
|
+
ENV CUDA_HOME=/usr/local/cuda-12.8
|
|
58
|
+
ENV PATH=/usr/local/cuda-12.8/bin:${PATH}
|
|
59
|
+
ENV LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:${LD_LIBRARY_PATH}
|
|
61
60
|
|
|
62
61
|
# Install EFA stack (prebuilt libfabric + OpenMPI + aws-ofi-nccl with GPU/RDMA support)
|
|
63
62
|
# Uses AWS EFA installer which bundles tested, compatible versions of all components
|
|
@@ -80,10 +79,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
|
80
79
|
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
|
81
80
|
|
|
82
81
|
# Clone and build NCCL tests with MPI support for multi-node benchmarking
|
|
82
|
+
RUN apt-get update && apt-get install -y --no-install-recommends libnccl-dev \
|
|
83
|
+
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
|
83
84
|
RUN cd /opt && \
|
|
84
85
|
git clone https://github.com/NVIDIA/nccl-tests.git && \
|
|
85
86
|
cd nccl-tests && \
|
|
86
|
-
make MPI=1 MPI_HOME=/opt/amazon/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/usr
|
|
87
|
+
make MPI=1 MPI_HOME=/opt/amazon/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/usr -j$(nproc)
|
|
87
88
|
|
|
88
89
|
# Set environment variables for EFA and NCCL
|
|
89
90
|
ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}
|
|
@@ -101,7 +102,7 @@ ENV NCCL_ASYNC_ERROR_HANDLING=1
|
|
|
101
102
|
ENV SUPPORTS_EFA=true
|
|
102
103
|
|
|
103
104
|
# Install Python packages (Jupyter and common ML packages)
|
|
104
|
-
RUN pip install --no-cache-dir \
|
|
105
|
+
RUN pip install --no-cache-dir --break-system-packages \
|
|
105
106
|
jupyterlab \
|
|
106
107
|
ipywidgets \
|
|
107
108
|
matplotlib \
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
# Clean PATH setup (no duplicates)
|
|
2
2
|
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
|
3
3
|
|
|
4
|
-
# CUDA environment
|
|
5
|
-
export CUDA_HOME=/usr/local/cuda
|
|
6
|
-
export
|
|
7
|
-
export
|
|
4
|
+
# CUDA environment (12.8 default, also available: 12.9, 13.0, 13.1, 13.2)
|
|
5
|
+
# Switch with: export CUDA_HOME=/usr/local/cuda-13.2 && export PATH="$CUDA_HOME/bin:$PATH"
|
|
6
|
+
export CUDA_HOME=/usr/local/cuda-12.8
|
|
7
|
+
export PATH="$CUDA_HOME/bin:$PATH"
|
|
8
|
+
export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
|
|
8
9
|
|
|
9
10
|
# EFA and OpenMPI environment for multi-node GPU communication
|
|
10
11
|
export PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:$PATH"
|
|
@@ -31,5 +32,4 @@ export CCACHE_DIR="/ccache_shared"
|
|
|
31
32
|
export CCACHE_MAXSIZE="10G"
|
|
32
33
|
|
|
33
34
|
# Claude Code configuration for Bedrock
|
|
34
|
-
export CLAUDE_CODE_USE_BEDROCK=1
|
|
35
|
-
export ANTHROPIC_MODEL="us.anthropic.claude-sonnet-4-20250514-v1:0"
|
|
35
|
+
export CLAUDE_CODE_USE_BEDROCK=1
|
|
@@ -83,13 +83,13 @@ resource "aws_iam_role_policy" "eks_node_bedrock_policy" {
|
|
|
83
83
|
Effect = "Allow"
|
|
84
84
|
Action = [
|
|
85
85
|
"bedrock:InvokeModel",
|
|
86
|
-
"bedrock:InvokeModelWithResponseStream"
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
"
|
|
90
|
-
"
|
|
91
|
-
"arn:aws:bedrock:*:*:inference-profile/global.anthropic.claude-*"
|
|
86
|
+
"bedrock:InvokeModelWithResponseStream",
|
|
87
|
+
"bedrock:ListInferenceProfiles",
|
|
88
|
+
"bedrock:GetInferenceProfile",
|
|
89
|
+
"bedrock:ListFoundationModels",
|
|
90
|
+
"bedrock-mantle:*"
|
|
92
91
|
]
|
|
92
|
+
Resource = "*"
|
|
93
93
|
},
|
|
94
94
|
{
|
|
95
95
|
Effect = "Allow"
|
|
@@ -185,6 +185,7 @@ locals {
|
|
|
185
185
|
"t4-az2" = "t4" # Both t4 and t4-az2 should be labeled as "t4" in Kubernetes
|
|
186
186
|
"l4" = "l4"
|
|
187
187
|
"a10g" = "a10g"
|
|
188
|
+
"g7e" = "g7e"
|
|
188
189
|
"h100" = "h100"
|
|
189
190
|
"h200" = "h200"
|
|
190
191
|
"b200" = "b200"
|
|
@@ -228,29 +228,29 @@ NGINXCONF
|
|
|
228
228
|
fi
|
|
229
229
|
done
|
|
230
230
|
|
|
231
|
-
# Create
|
|
232
|
-
|
|
231
|
+
# Create tarballs for main repo + ALL submodules
|
|
232
|
+
# Naming convention: org_repo-git.tar.gz (matches git-clone-cached client)
|
|
233
|
+
echo "[CACHE] Creating tarballs..."
|
|
233
234
|
cd /git-cache
|
|
234
|
-
rm -f pytorch-git.tar.gz.tmp
|
|
235
235
|
|
|
236
|
-
#
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
236
|
+
# Main pytorch repo — name must match org_repo convention
|
|
237
|
+
echo "[CACHE] Creating pytorch_pytorch-git.tar.gz..."
|
|
238
|
+
rm -f pytorch_pytorch-git.tar.gz.tmp
|
|
239
|
+
tar -czf pytorch_pytorch-git.tar.gz.tmp -C /git-cache pytorch.git
|
|
240
|
+
mv pytorch_pytorch-git.tar.gz.tmp pytorch_pytorch-git.tar.gz
|
|
241
|
+
SIZE=$(du -sh pytorch_pytorch-git.tar.gz | awk '{print $1}')
|
|
242
|
+
echo "[CACHE] pytorch_pytorch: $SIZE"
|
|
240
243
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
# Create tarballs for largest submodules (top 10 by size)
|
|
245
|
-
echo "[CACHE] Creating submodule tarballs..."
|
|
246
|
-
for repo in $(du -s /git-cache/*.git 2>/dev/null | sort -rn | head -11 | tail -10 | awk '{print $2}'); do
|
|
244
|
+
# All submodule repos (already named org_repo.git by init container)
|
|
245
|
+
for repo in /git-cache/*.git; do
|
|
247
246
|
name=$(basename "$repo")
|
|
247
|
+
[ "$name" = "pytorch.git" ] && continue
|
|
248
248
|
tarball="$${name%.git}-git.tar.gz"
|
|
249
249
|
echo "[CACHE] Creating $tarball..."
|
|
250
250
|
rm -f "$tarball.tmp" 2>/dev/null
|
|
251
251
|
tar -czf "$tarball.tmp" -C /git-cache "$name" 2>/dev/null && mv "$tarball.tmp" "$tarball" || echo "[CACHE] WARNING: Failed to create $tarball"
|
|
252
252
|
done
|
|
253
|
-
echo "[CACHE]
|
|
253
|
+
echo "[CACHE] All tarballs created"
|
|
254
254
|
fi
|
|
255
255
|
|
|
256
256
|
echo "[CACHE] Refresh complete at $(date). Next in 3600s (1 hour)..."
|
{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
@@ -150,7 +150,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
150
150
|
else:
|
|
151
151
|
available_gpus = total_gpus
|
|
152
152
|
else:
|
|
153
|
-
# GPU nodes - use
|
|
153
|
+
# GPU nodes - use K8s schedulable node count for total if available
|
|
154
154
|
total_gpus = running_instances * gpus_per_instance
|
|
155
155
|
logger.info(
|
|
156
156
|
f"ASG calculation: {running_instances} instances * {gpus_per_instance} GPUs = {total_gpus} total GPUs")
|
|
@@ -181,6 +181,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
181
181
|
nodes = v1.list_node(label_selector=f"GpuType={gpu_type}")
|
|
182
182
|
|
|
183
183
|
single_node_max = 0 # Max available on any single node
|
|
184
|
+
schedulable_total_gpus = 0 # Total GPUs on schedulable (non-cordoned) nodes
|
|
184
185
|
for node in nodes.items:
|
|
185
186
|
if is_node_ready_and_schedulable(node):
|
|
186
187
|
available_on_node = get_available_gpus_on_node(v1, node)
|
|
@@ -192,6 +193,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
192
193
|
except (ValueError, TypeError):
|
|
193
194
|
pass
|
|
194
195
|
|
|
196
|
+
schedulable_total_gpus += total_on_node
|
|
197
|
+
|
|
195
198
|
# Track max available on any single node
|
|
196
199
|
single_node_max = max(single_node_max, available_on_node)
|
|
197
200
|
|
|
@@ -199,6 +202,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
199
202
|
if total_on_node > 0 and available_on_node == total_on_node:
|
|
200
203
|
full_nodes_available += 1
|
|
201
204
|
|
|
205
|
+
total_gpus = schedulable_total_gpus
|
|
206
|
+
|
|
202
207
|
# Calculate max reservable considering multinode scenarios
|
|
203
208
|
# Only high-end GPU types support multinode (up to 4 nodes = 32 GPUs)
|
|
204
209
|
multinode_gpu_types = ['h100', 'h200', 'b200', 'a100']
|
{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -49,7 +49,7 @@ DEFAULT_TIMEOUT_HOURS = int(os.environ["DEFAULT_TIMEOUT_HOURS"])
|
|
|
49
49
|
QUEUE_URL = os.environ["QUEUE_URL"]
|
|
50
50
|
PRIMARY_AVAILABILITY_ZONE = os.environ["PRIMARY_AVAILABILITY_ZONE"]
|
|
51
51
|
GPU_DEV_CONTAINER_IMAGE = os.environ.get(
|
|
52
|
-
"GPU_DEV_CONTAINER_IMAGE", "pytorch/pytorch:2.
|
|
52
|
+
"GPU_DEV_CONTAINER_IMAGE", "pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel")
|
|
53
53
|
EFS_SECURITY_GROUP_ID = os.environ.get("EFS_SECURITY_GROUP_ID")
|
|
54
54
|
EFS_SUBNET_IDS = os.environ.get("EFS_SUBNET_IDS", "").split(
|
|
55
55
|
",") if os.environ.get("EFS_SUBNET_IDS") else []
|
|
@@ -66,6 +66,7 @@ GPU_CONFIG = {
|
|
|
66
66
|
"t4": {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0},
|
|
67
67
|
"l4": {"instance_type": "g6.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
|
|
68
68
|
"a10g": {"instance_type": "g5.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
|
|
69
|
+
"g7e": {"instance_type": "g7e.24xlarge", "max_gpus": 4, "cpus": 96, "memory_gb": 1024, "efa_count": 2},
|
|
69
70
|
"t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
70
71
|
"g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
71
72
|
"a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
|
|
@@ -2150,7 +2151,7 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
|
|
|
2150
2151
|
gpu_type = request.get("gpu_type", "")
|
|
2151
2152
|
|
|
2152
2153
|
# Validate GPU type
|
|
2153
|
-
valid_gpu_types = ["t4", "l4", "a10g", "t4-small", "a100",
|
|
2154
|
+
valid_gpu_types = ["t4", "l4", "a10g", "g7e", "t4-small", "a100",
|
|
2154
2155
|
"h100", "h200", "b200", "cpu-arm", "cpu-x86"]
|
|
2155
2156
|
if gpu_type not in valid_gpu_types:
|
|
2156
2157
|
error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
|
|
@@ -2381,6 +2382,7 @@ def update_gpu_availability_table(
|
|
|
2381
2382
|
"t4": {"gpus_per_instance": 4},
|
|
2382
2383
|
"l4": {"gpus_per_instance": 4},
|
|
2383
2384
|
"a10g": {"gpus_per_instance": 4},
|
|
2385
|
+
"g7e": {"gpus_per_instance": 4},
|
|
2384
2386
|
"a100": {"gpus_per_instance": 8},
|
|
2385
2387
|
"h100": {"gpus_per_instance": 8},
|
|
2386
2388
|
"h200": {"gpus_per_instance": 8},
|
|
@@ -4570,7 +4572,7 @@ EOFREADME
|
|
|
4570
4572
|
|
|
4571
4573
|
cat > /usr/local/bin/git-clone-cached << 'GITCACHESCRIPT'
|
|
4572
4574
|
#!/bin/bash
|
|
4573
|
-
# Clones
|
|
4575
|
+
# Clones repo + submodules from in-cluster cache (much faster than GitHub)
|
|
4574
4576
|
CACHE_URL="http://git-cache.management.svc.cluster.local:8080"
|
|
4575
4577
|
GIT="/usr/bin/git"
|
|
4576
4578
|
GITHUB_URL="${{1}}"
|
|
@@ -4582,59 +4584,95 @@ if [ -z "$GITHUB_URL" ]; then
|
|
|
4582
4584
|
DEST="${{DEST:-pytorch}}"
|
|
4583
4585
|
fi
|
|
4584
4586
|
|
|
4587
|
+
# Handle short names: "pytorch" -> "https://github.com/pytorch/pytorch.git"
|
|
4588
|
+
if [[ ! "$GITHUB_URL" =~ ^https?:// ]] && [[ ! "$GITHUB_URL" =~ ^git@ ]]; then
|
|
4589
|
+
GITHUB_URL="https://github.com/pytorch/$GITHUB_URL.git"
|
|
4590
|
+
DEST="${{DEST:-${{1}}}}"
|
|
4591
|
+
fi
|
|
4592
|
+
|
|
4585
4593
|
# Extract org/repo from GitHub URL and create cache tarball name
|
|
4586
|
-
# https://github.com/pytorch/pytorch.git -> pytorch_pytorch-git.tar.gz
|
|
4587
|
-
# https://github.com/ROCm/aiter.git -> ROCm_aiter-git.tar.gz
|
|
4588
4594
|
if [[ "$GITHUB_URL" =~ github\.com[/:]([^/]+)/([^/\.]+) ]]; then
|
|
4589
4595
|
ORG="${{BASH_REMATCH[1]}}"
|
|
4590
4596
|
REPO="${{BASH_REMATCH[2]}}"
|
|
4591
4597
|
TARBALL="${{ORG}}_${{REPO}}-git.tar.gz"
|
|
4592
4598
|
else
|
|
4593
|
-
|
|
4594
|
-
exec "$GIT" clone "$GITHUB_URL" "${{DEST:+"$DEST"}}"
|
|
4595
|
-
fi
|
|
4596
|
-
|
|
4597
|
-
# Default destination to repo name if not specified
|
|
4598
|
-
if [ -z "$DEST" ]; then
|
|
4599
|
-
DEST="$REPO"
|
|
4599
|
+
exec "$GIT" clone --recurse-submodules --jobs 8 "$GITHUB_URL" "${{DEST:+"$DEST"}}"
|
|
4600
4600
|
fi
|
|
4601
4601
|
|
|
4602
|
-
if [ -
|
|
4603
|
-
|
|
4604
|
-
exit 1
|
|
4605
|
-
fi
|
|
4602
|
+
if [ -z "$DEST" ]; then DEST="$REPO"; fi
|
|
4603
|
+
if [ -d "$DEST" ]; then echo "Error: $DEST already exists"; exit 1; fi
|
|
4606
4604
|
|
|
4607
|
-
|
|
4608
|
-
echo "[git-cache] Checking cache for $ORG/$REPO..."
|
|
4605
|
+
echo "[git-cache] Cloning $ORG/$REPO..."
|
|
4609
4606
|
TOTAL_START=$(date +%s)
|
|
4610
4607
|
|
|
4608
|
+
# --- Main repo ---
|
|
4611
4609
|
mkdir -p "$DEST/.git"
|
|
4612
4610
|
START=$(date +%s)
|
|
4613
4611
|
if curl -sf "$CACHE_URL/$TARBALL" | tar -xz -C "$DEST/.git" --strip-components=1 2>/dev/null; then
|
|
4614
4612
|
END=$(date +%s)
|
|
4615
|
-
echo "[git-cache]
|
|
4613
|
+
echo "[git-cache] Main repo .git: $((END - START))s"
|
|
4616
4614
|
|
|
4617
|
-
# Configure as non-bare repository and set origin
|
|
4618
4615
|
cd "$DEST"
|
|
4619
4616
|
"$GIT" config --file .git/config core.bare false
|
|
4620
4617
|
"$GIT" config --file .git/config remote.origin.url "$GITHUB_URL"
|
|
4621
4618
|
"$GIT" config --file .git/config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
|
|
4622
4619
|
|
|
4623
|
-
echo "[git-cache] Checking out working tree..."
|
|
4624
4620
|
START=$(date +%s)
|
|
4625
4621
|
"$GIT" checkout -f HEAD 2>/dev/null
|
|
4626
4622
|
END=$(date +%s)
|
|
4627
|
-
echo "[git-cache] Checkout
|
|
4623
|
+
echo "[git-cache] Checkout: $((END - START))s"
|
|
4624
|
+
|
|
4625
|
+
# --- Submodules from cache ---
|
|
4626
|
+
if [ -f .gitmodules ]; then
|
|
4627
|
+
echo "[git-cache] Setting up submodules..."
|
|
4628
|
+
SUB_START=$(date +%s)
|
|
4629
|
+
|
|
4630
|
+
"$GIT" submodule init
|
|
4631
|
+
ABS_ROOT="$(pwd)"
|
|
4632
|
+
|
|
4633
|
+
"$GIT" config --file .gitmodules --get-regexp 'submodule\..*\.url' | while read key url; do
|
|
4634
|
+
name=$(echo "$key" | sed 's/^submodule\.//;s/\.url$//')
|
|
4635
|
+
path=$("$GIT" config --file .gitmodules "submodule.$name.path")
|
|
4636
|
+
[ -z "$path" ] && continue
|
|
4637
|
+
|
|
4638
|
+
COMMIT=$("$GIT" ls-tree HEAD "$path" 2>/dev/null | awk '{{print $3}}')
|
|
4639
|
+
[ -z "$COMMIT" ] && continue
|
|
4640
|
+
|
|
4641
|
+
if [[ "$url" =~ github\.com[/:]([^/]+)/([^/.]+) ]]; then
|
|
4642
|
+
SUB_TARBALL="${{BASH_REMATCH[1]}}_${{BASH_REMATCH[2]}}-git.tar.gz"
|
|
4643
|
+
MODULES_DIR="$ABS_ROOT/.git/modules/$name"
|
|
4644
|
+
|
|
4645
|
+
mkdir -p "$MODULES_DIR"
|
|
4646
|
+
if curl -sf "$CACHE_URL/$SUB_TARBALL" | tar -xz -C "$MODULES_DIR" --strip-components=1 2>/dev/null; then
|
|
4647
|
+
"$GIT" -C "$MODULES_DIR" config core.bare false
|
|
4648
|
+
"$GIT" -C "$MODULES_DIR" config core.worktree "$ABS_ROOT/$path"
|
|
4649
|
+
"$GIT" -C "$MODULES_DIR" config remote.origin.url "$url"
|
|
4650
|
+
"$GIT" -C "$MODULES_DIR" config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
|
|
4651
|
+
mkdir -p "$ABS_ROOT/$path"
|
|
4652
|
+
echo "gitdir: $MODULES_DIR" > "$ABS_ROOT/$path/.git"
|
|
4653
|
+
"$GIT" -C "$ABS_ROOT/$path" checkout -f "$COMMIT" 2>/dev/null
|
|
4654
|
+
else
|
|
4655
|
+
rm -rf "$MODULES_DIR"
|
|
4656
|
+
fi
|
|
4657
|
+
fi
|
|
4658
|
+
done
|
|
4659
|
+
|
|
4660
|
+
# Fetch remaining/recursive submodules from GitHub
|
|
4661
|
+
"$GIT" -c protocol.file.allow=always submodule update --init --recursive --jobs 8 2>/dev/null
|
|
4662
|
+
|
|
4663
|
+
SUB_END=$(date +%s)
|
|
4664
|
+
echo "[git-cache] Submodules: $((SUB_END - SUB_START))s"
|
|
4665
|
+
fi
|
|
4628
4666
|
|
|
4629
4667
|
TOTAL_END=$(date +%s)
|
|
4630
|
-
echo "[git-cache] Total: $((TOTAL_END - TOTAL_START))s
|
|
4668
|
+
echo "[git-cache] Total: $((TOTAL_END - TOTAL_START))s"
|
|
4631
4669
|
exit 0
|
|
4632
4670
|
fi
|
|
4633
4671
|
|
|
4634
4672
|
# Fallback: clone from GitHub
|
|
4635
4673
|
echo "[git-cache] Cache miss, cloning from GitHub..."
|
|
4636
4674
|
rm -rf "$DEST"
|
|
4637
|
-
"$GIT" clone "$GITHUB_URL" "${{DEST:+"$DEST"}}"
|
|
4675
|
+
"$GIT" clone --recurse-submodules --jobs 8 "$GITHUB_URL" "${{DEST:+"$DEST"}}"
|
|
4638
4676
|
GITCACHESCRIPT
|
|
4639
4677
|
chmod +x /usr/local/bin/git-clone-cached
|
|
4640
4678
|
echo "[STARTUP] ✓ git-clone-cached available (opt-in: use 'git-clone-cached pytorch' for cache)"
|
|
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.
|
|
184
|
-
MIN_CLI_VERSION = "0.
|
|
183
|
+
LAMBDA_VERSION = "0.5.0"
|
|
184
|
+
MIN_CLI_VERSION = "0.5.0"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
187
|
}, local.alb_env_vars)
|
|
@@ -79,7 +79,7 @@ locals {
|
|
|
79
79
|
"cpu-arm" = {
|
|
80
80
|
instance_type = "c7g.4xlarge"
|
|
81
81
|
instance_types = null
|
|
82
|
-
instance_count =
|
|
82
|
+
instance_count = 1
|
|
83
83
|
gpus_per_instance = 0
|
|
84
84
|
use_placement_group = false
|
|
85
85
|
architecture = "arm64"
|
|
@@ -88,7 +88,7 @@ locals {
|
|
|
88
88
|
"cpu-x86" = {
|
|
89
89
|
instance_type = "c7i.4xlarge"
|
|
90
90
|
instance_types = null
|
|
91
|
-
instance_count =
|
|
91
|
+
instance_count = 1
|
|
92
92
|
gpus_per_instance = 0
|
|
93
93
|
use_placement_group = false
|
|
94
94
|
architecture = "x86_64"
|
|
@@ -97,7 +97,7 @@ locals {
|
|
|
97
97
|
"t4" = {
|
|
98
98
|
instance_type = "g4dn.12xlarge"
|
|
99
99
|
instance_types = null
|
|
100
|
-
instance_count =
|
|
100
|
+
instance_count = 1
|
|
101
101
|
gpus_per_instance = 4
|
|
102
102
|
use_placement_group = true
|
|
103
103
|
architecture = "x86_64"
|
|
@@ -106,7 +106,7 @@ locals {
|
|
|
106
106
|
"t4-az2" = {
|
|
107
107
|
instance_type = "g4dn.12xlarge"
|
|
108
108
|
instance_types = null
|
|
109
|
-
instance_count =
|
|
109
|
+
instance_count = 0 # Disabled - use primary AZ only for testing
|
|
110
110
|
gpus_per_instance = 4
|
|
111
111
|
use_placement_group = true
|
|
112
112
|
architecture = "x86_64"
|
|
@@ -115,7 +115,7 @@ locals {
|
|
|
115
115
|
"h100" = {
|
|
116
116
|
instance_type = "p5.48xlarge"
|
|
117
117
|
instance_types = null
|
|
118
|
-
instance_count =
|
|
118
|
+
instance_count = 0 # Disabled - only use via CR when needed
|
|
119
119
|
gpus_per_instance = 8
|
|
120
120
|
use_placement_group = false
|
|
121
121
|
architecture = "x86_64"
|
|
@@ -124,7 +124,7 @@ locals {
|
|
|
124
124
|
"t4-small" = {
|
|
125
125
|
instance_type = "g4dn.2xlarge"
|
|
126
126
|
instance_types = null
|
|
127
|
-
instance_count =
|
|
127
|
+
instance_count = 0 # Disabled
|
|
128
128
|
gpus_per_instance = 1
|
|
129
129
|
use_placement_group = false
|
|
130
130
|
architecture = "x86_64"
|
|
@@ -183,7 +183,7 @@ locals {
|
|
|
183
183
|
"t4" = {
|
|
184
184
|
instance_type = "g4dn.12xlarge"
|
|
185
185
|
instance_types = null
|
|
186
|
-
instance_count =
|
|
186
|
+
instance_count = 2
|
|
187
187
|
gpus_per_instance = 4
|
|
188
188
|
use_placement_group = true
|
|
189
189
|
architecture = "x86_64"
|
|
@@ -192,7 +192,7 @@ locals {
|
|
|
192
192
|
"l4" = {
|
|
193
193
|
instance_type = "g6.12xlarge"
|
|
194
194
|
instance_types = null
|
|
195
|
-
instance_count =
|
|
195
|
+
instance_count = 2
|
|
196
196
|
gpus_per_instance = 4 # 4x L4 GPUs
|
|
197
197
|
use_placement_group = false
|
|
198
198
|
architecture = "x86_64"
|
|
@@ -201,16 +201,25 @@ locals {
|
|
|
201
201
|
"a10g" = {
|
|
202
202
|
instance_type = "g5.12xlarge"
|
|
203
203
|
instance_types = null
|
|
204
|
-
instance_count =
|
|
204
|
+
instance_count = 1
|
|
205
205
|
gpus_per_instance = 4 # 4x A10G GPUs
|
|
206
206
|
use_placement_group = false
|
|
207
207
|
architecture = "x86_64"
|
|
208
208
|
efa_network_cards = 1
|
|
209
209
|
}
|
|
210
|
+
"g7e" = {
|
|
211
|
+
instance_type = "g7e.24xlarge"
|
|
212
|
+
instance_types = null
|
|
213
|
+
instance_count = 2
|
|
214
|
+
gpus_per_instance = 4 # 4x RTX PRO 6000 Blackwell GPUs
|
|
215
|
+
use_placement_group = false
|
|
216
|
+
architecture = "x86_64"
|
|
217
|
+
efa_network_cards = 2
|
|
218
|
+
}
|
|
210
219
|
"cpu-arm" = {
|
|
211
220
|
instance_type = "c7g.8xlarge"
|
|
212
221
|
instance_types = null
|
|
213
|
-
instance_count =
|
|
222
|
+
instance_count = 10
|
|
214
223
|
gpus_per_instance = 0
|
|
215
224
|
use_placement_group = false
|
|
216
225
|
architecture = "arm64"
|
|
@@ -219,7 +228,7 @@ locals {
|
|
|
219
228
|
"cpu-x86" = {
|
|
220
229
|
instance_type = "c7i.8xlarge"
|
|
221
230
|
instance_types = null
|
|
222
|
-
instance_count =
|
|
231
|
+
instance_count = 10
|
|
223
232
|
gpus_per_instance = 0
|
|
224
233
|
use_placement_group = false
|
|
225
234
|
architecture = "x86_64"
|
|
@@ -257,6 +266,7 @@ locals {
|
|
|
257
266
|
h100 = [
|
|
258
267
|
{ key = "cr0", id = "cr-0a3f49b96fe03ca04", instance_count = 4 }, # H100 reservation us-east-2c (p5.48xlarge)
|
|
259
268
|
{ key = "cr1", id = null, instance_count = 2 }, # H100 on-demand (2 instances)
|
|
269
|
+
{ key = "cr2", id = "cr-044bc72b0a6b56062", instance_count = 4 }, # H100 reservation us-east-2a (4 instances)
|
|
260
270
|
]
|
|
261
271
|
h200 = [
|
|
262
272
|
{ key = "cr0", id = "cr-0f6d0766f5d3339e6", instance_count = 2 }, # H200 capacity block (may be expired - keep to prevent ASG destroy)
|
|
@@ -264,9 +274,10 @@ locals {
|
|
|
264
274
|
{ key = "cr2", id = null, instance_count = 2 }, # H200 on-demand (2 instances)
|
|
265
275
|
]
|
|
266
276
|
b200 = [
|
|
267
|
-
{ key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 0 }, # B200 reservation (disabled - CR
|
|
277
|
+
{ key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 0 }, # B200 reservation us-east-2a (disabled - CR freed)
|
|
268
278
|
{ key = "cr1", id = "cr-08e7fee0b8dc3de5e", instance_count = 3 }, # B200 reservation (3 instances)
|
|
269
279
|
{ key = "cr2", id = null, instance_count = 2 }, # B200 on-demand (2 instances)
|
|
280
|
+
{ key = "cr3", id = "cr-0f5f6bb30a8fe3c68", instance_count = 2 }, # B200 reservation us-east-2b (2 instances)
|
|
270
281
|
]
|
|
271
282
|
# T4 and L4 don't have capacity reservations - managed via supported_gpu_types fallback
|
|
272
283
|
}
|
|
@@ -292,6 +303,7 @@ locals {
|
|
|
292
303
|
t4 = "primary"
|
|
293
304
|
l4 = "secondary"
|
|
294
305
|
a10g = "secondary"
|
|
306
|
+
g7e = "secondary"
|
|
295
307
|
"cpu-arm" = "primary"
|
|
296
308
|
"cpu-x86" = "primary"
|
|
297
309
|
}
|
|
@@ -307,11 +319,13 @@ locals {
|
|
|
307
319
|
"cr-0c366fb8339a10f69" = "primary" # us-east-2a
|
|
308
320
|
"cr-0122dff5e01d566dc" = "secondary" # us-east-2b
|
|
309
321
|
"cr-08e7fee0b8dc3de5e" = "secondary" # us-east-2b
|
|
322
|
+
"cr-0f5f6bb30a8fe3c68" = "secondary" # us-east-2b
|
|
310
323
|
# H200 capacity reservations
|
|
311
324
|
"cr-0f6d0766f5d3339e6" = "tertiary" # us-east-2c (may be expired - kept to prevent ASG destroy)
|
|
312
325
|
"cr-06c9c978dea756a26" = "tertiary" # us-east-2c
|
|
313
|
-
# H100 capacity
|
|
326
|
+
# H100 capacity reservations
|
|
314
327
|
"cr-0a3f49b96fe03ca04" = "tertiary" # us-east-2c (p5.48xlarge)
|
|
328
|
+
"cr-044bc72b0a6b56062" = "primary" # us-east-2a (p5.48xlarge)
|
|
315
329
|
# A100 capacity reservation
|
|
316
330
|
"cr-01cc0f00f28b095af" = "primary" # us-east-2a
|
|
317
331
|
}
|
|
@@ -11,7 +11,7 @@ systemctl disable nodeadm-run.service || true
|
|
|
11
11
|
systemctl stop nodeadm-config.service || true
|
|
12
12
|
systemctl stop nodeadm-run.service || true
|
|
13
13
|
|
|
14
|
-
# Install NVIDIA driver
|
|
14
|
+
# Install latest NVIDIA driver on host (595.x branch supports CUDA 13.2)
|
|
15
15
|
# GPU Operator will handle toolkit/device-plugin only
|
|
16
16
|
|
|
17
17
|
# Configure NVIDIA profiling BEFORE driver installation (driver install auto-loads modules)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.4.1 → gpu_dev-0.5.0}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|