gpu-dev 0.4.1__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/PKG-INFO +1 -1
  2. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +7 -2
  4. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +1 -1
  5. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +1 -0
  6. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/pyproject.toml +1 -1
  7. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/Dockerfile +17 -16
  8. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/shell_env +6 -6
  9. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/eks.tf +8 -7
  10. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/git-cache.tf +14 -14
  11. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/availability_updater/index.py +6 -1
  12. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/reservation_processor/index.py +62 -24
  13. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda.tf +2 -2
  14. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/main.tf +28 -14
  15. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/templates/al2023-user-data.sh +1 -1
  16. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/.github/workflows/no-gitlinks.yml +0 -0
  17. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/.github/workflows/publish.yml +0 -0
  18. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/.gitignore +0 -0
  19. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/CLAUDE.md +0 -0
  20. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/PROGRESS.md +0 -0
  21. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/PR_DESCRIPTION.md +0 -0
  22. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/TODO.md +0 -0
  23. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/admin/README.md +0 -0
  24. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/admin/generate_stats.py +0 -0
  25. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/admin/requirements.txt +0 -0
  26. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/README.md +0 -0
  27. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  28. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  29. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  30. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  31. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  32. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  33. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  34. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  35. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  36. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  37. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  38. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  39. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  40. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  41. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/docs/USER_GUIDE.md +0 -0
  42. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/docs/devgpu-features.html +0 -0
  43. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/docs/docker-mark-blue.svg +0 -0
  44. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/docs/icons8-cursor-ai.svg +0 -0
  45. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/post.md +0 -0
  46. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/setup.cfg +0 -0
  47. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  48. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  49. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/README.md +0 -0
  50. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/alb.tf +0 -0
  51. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/availability.tf +0 -0
  52. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/backend.tf +0 -0
  53. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  54. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  55. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/bash_profile +0 -0
  56. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/bashrc +0 -0
  57. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  58. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  59. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  60. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  61. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/motd_script +0 -0
  62. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  63. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/profile +0 -0
  64. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  65. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  66. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  67. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/ssh_config +0 -0
  68. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/zprofile +0 -0
  69. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/zshrc +0 -0
  70. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  71. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker-build.tf +0 -0
  72. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  73. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  74. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/ecr.tf +0 -0
  75. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/efs.tf +0 -0
  76. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/expiry.tf +0 -0
  77. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/kubernetes.tf +0 -0
  78. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  79. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  80. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  81. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  82. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  83. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  84. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  85. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  86. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  87. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  88. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  89. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  90. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  91. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  92. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  93. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  94. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  95. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  96. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/monitoring.tf +0 -0
  97. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/outputs.tf +0 -0
  98. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/pyproject.toml +0 -0
  99. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/queue.tf +0 -0
  100. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/route53.tf +0 -0
  101. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  102. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  103. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  104. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  105. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  106. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  107. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  108. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  109. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  110. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  111. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/switch-to.sh +0 -0
  112. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  113. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  114. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  115. {gpu_dev-0.4.1 → gpu_dev-0.5.1}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.4.1
3
+ Version: 0.5.1
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.4.1
3
+ Version: 0.5.1
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -498,9 +498,9 @@ def main(ctx: click.Context) -> None:
498
498
  "--gpu-type",
499
499
  "-t",
500
500
  type=click.Choice(
501
- ["b200", "h200", "h100", "a100", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
501
+ ["b200", "h200", "h100", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
502
502
  ),
503
- help="GPU type to reserve (b200/h200/h100/a100/a10g/t4/l4/t4-small/cpu-arm/cpu-x86)",
503
+ help="GPU type to reserve (b200/h200/h100/a100/rtxpro6000/a10g/t4/l4/t4-small/cpu-arm/cpu-x86)",
504
504
  )
505
505
  @click.option(
506
506
  "--hours",
@@ -652,6 +652,7 @@ def reserve(
652
652
  "t4": {"max_gpus": 4, "instance_type": "g4dn.12xlarge"},
653
653
  "l4": {"max_gpus": 4, "instance_type": "g6.12xlarge"},
654
654
  "a10g": {"max_gpus": 4, "instance_type": "g5.12xlarge"},
655
+ "rtxpro6000": {"max_gpus": 4, "instance_type": "g7e.24xlarge"},
655
656
  "t4-small": {"max_gpus": 1, "instance_type": "g4dn.xlarge"},
656
657
  "a100": {"max_gpus": 8, "instance_type": "p4d.24xlarge"},
657
658
  "h100": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
@@ -2397,6 +2398,7 @@ def _show_availability() -> None:
2397
2398
  "a100": "Ampere (sm80)",
2398
2399
  "a10g": "Ampere (sm80)",
2399
2400
  "l4": "Ada Lovelace (sm89)",
2401
+ "rtxpro6000": "Blackwell (sm120)",
2400
2402
  "t4": "Turing (sm75)",
2401
2403
  "cpu-x86": "CPU (x86_64)",
2402
2404
  "cpu-arm": "CPU (arm64)",
@@ -2405,6 +2407,7 @@ def _show_availability() -> None:
2405
2407
  # Sort order: newest GPU architectures first, then CPUs at the bottom
2406
2408
  arch_priority = {
2407
2409
  "Blackwell (sm100)": 0,
2410
+ "Blackwell (sm120)": 0,
2408
2411
  "Hopper (sm90)": 1,
2409
2412
  "Ada Lovelace (sm89)": 2,
2410
2413
  "Ampere (sm80)": 3,
@@ -2544,6 +2547,7 @@ def _show_availability_watch(interval: int) -> None:
2544
2547
  "a100": "Ampere (sm80)",
2545
2548
  "a10g": "Ampere (sm80)",
2546
2549
  "l4": "Ada Lovelace (sm89)",
2550
+ "rtxpro6000": "Blackwell (sm120)",
2547
2551
  "t4": "Turing (sm75)",
2548
2552
  "cpu-x86": "CPU (x86_64)",
2549
2553
  "cpu-arm": "CPU (arm64)",
@@ -2552,6 +2556,7 @@ def _show_availability_watch(interval: int) -> None:
2552
2556
  # Sort order: newest GPU architectures first, then CPUs at the bottom
2553
2557
  arch_priority = {
2554
2558
  "Blackwell (sm100)": 0,
2559
+ "Blackwell (sm120)": 0,
2555
2560
  "Hopper (sm90)": 1,
2556
2561
  "Ada Lovelace (sm89)": 2,
2557
2562
  "Ampere (sm80)": 3,
@@ -153,7 +153,7 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
153
153
  # CPU instances don't have GPUs, but we still need a "count" for nodes
154
154
  valid_counts = [0] # 0 GPUs for CPU-only instances
155
155
  multinode_counts = [] # No multinode for CPU instances
156
- elif gpu_type in ["t4", "l4", "a10g"]:
156
+ elif gpu_type in ["t4", "l4", "a10g", "rtxpro6000"]:
157
157
  valid_counts = [1, 2, 4]
158
158
  # Add multinode options
159
159
  multinode_counts = [8, 12, 16, 20, 24] # multiples of 4
@@ -535,6 +535,7 @@ class ReservationManager:
535
535
  "t4": {"max_gpus": 4},
536
536
  "l4": {"max_gpus": 4},
537
537
  "a10g": {"max_gpus": 4},
538
+ "rtxpro6000": {"max_gpus": 4},
538
539
  "t4-small": {"max_gpus": 1},
539
540
  "g5g": {"max_gpus": 2},
540
541
  "a100": {"max_gpus": 8},
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.4.1"
7
+ version = "0.5.1"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -1,6 +1,6 @@
1
1
  # Custom PyTorch GPU Development Server Image
2
- # Based on pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel
3
- FROM pytorch/pytorch:2.9.1-cuda12.8-cudnn9-devel
2
+ # Based on pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
3
+ FROM pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
4
4
 
5
5
  # Set environment variables for non-interactive installation
6
6
  ENV DEBIAN_FRONTEND=noninteractive
@@ -41,23 +41,22 @@ RUN apt-get install -y --no-install-recommends \
41
41
  RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
42
42
  apt-get install -y nodejs
43
43
 
44
- # Install CUDA 13.0 alongside existing CUDA 12.9
44
+ # Install CUDA 12.9, 13.0, 13.1, 13.2 alongside base CUDA 12.8
45
+ # Base image already has NVIDIA repo configured, no need for cuda-keyring
45
46
  RUN apt-get update && apt-get install -y --no-install-recommends \
46
- software-properties-common \
47
- && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb \
48
- && dpkg -i cuda-keyring_1.0-1_all.deb \
49
- && apt-get update \
50
- && apt-get install -y --no-install-recommends \
47
+ cuda-toolkit-12-9 \
51
48
  cuda-toolkit-13-0 \
52
- && rm cuda-keyring_1.0-1_all.deb \
49
+ cuda-toolkit-13-1 \
50
+ cuda-toolkit-13-2 \
53
51
  && apt-get clean \
54
52
  && rm -rf /var/lib/apt/lists/*
55
53
 
56
- # Set CUDA paths for both versions - 12.8 as default for PyTorch compatibility
57
- ENV CUDA_12_PATH=/usr/local/cuda-12.8
58
- ENV CUDA_13_PATH=/usr/local/cuda-13.0
59
- ENV PATH=/usr/local/cuda-12.8/bin:/usr/local/cuda-13.0/bin:${PATH}
60
- ENV LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:/usr/local/cuda-13.0/lib64:${LD_LIBRARY_PATH}
54
+ # CUDA 12.8 is the default (PyTorch compiled against it)
55
+ # All versions available at /usr/local/cuda-{12.8,12.9,13.0,13.1,13.2}/
56
+ # Switch with: export CUDA_HOME=/usr/local/cuda-13.2
57
+ ENV CUDA_HOME=/usr/local/cuda-12.8
58
+ ENV PATH=/usr/local/cuda-12.8/bin:${PATH}
59
+ ENV LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:${LD_LIBRARY_PATH}
61
60
 
62
61
  # Install EFA stack (prebuilt libfabric + OpenMPI + aws-ofi-nccl with GPU/RDMA support)
63
62
  # Uses AWS EFA installer which bundles tested, compatible versions of all components
@@ -80,10 +79,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
80
79
  && apt-get clean && rm -rf /var/lib/apt/lists/*
81
80
 
82
81
  # Clone and build NCCL tests with MPI support for multi-node benchmarking
82
+ RUN apt-get update && apt-get install -y --no-install-recommends libnccl-dev \
83
+ && apt-get clean && rm -rf /var/lib/apt/lists/*
83
84
  RUN cd /opt && \
84
85
  git clone https://github.com/NVIDIA/nccl-tests.git && \
85
86
  cd nccl-tests && \
86
- make MPI=1 MPI_HOME=/opt/amazon/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/usr/lib/x86_64-linux-gnu -j$(nproc)
87
+ make MPI=1 MPI_HOME=/opt/amazon/openmpi CUDA_HOME=/usr/local/cuda NCCL_HOME=/usr -j$(nproc)
87
88
 
88
89
  # Set environment variables for EFA and NCCL
89
90
  ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:${PATH}
@@ -101,7 +102,7 @@ ENV NCCL_ASYNC_ERROR_HANDLING=1
101
102
  ENV SUPPORTS_EFA=true
102
103
 
103
104
  # Install Python packages (Jupyter and common ML packages)
104
- RUN pip install --no-cache-dir \
105
+ RUN pip install --no-cache-dir --break-system-packages \
105
106
  jupyterlab \
106
107
  ipywidgets \
107
108
  matplotlib \
@@ -1,10 +1,11 @@
1
1
  # Clean PATH setup (no duplicates)
2
2
  export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
3
3
 
4
- # CUDA environment
5
- export CUDA_HOME=/usr/local/cuda
6
- export PATH="/usr/local/cuda/bin:$PATH"
7
- export LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}"
4
+ # CUDA environment (12.8 default, also available: 12.9, 13.0, 13.1, 13.2)
5
+ # Switch with: export CUDA_HOME=/usr/local/cuda-13.2 && export PATH="$CUDA_HOME/bin:$PATH"
6
+ export CUDA_HOME=/usr/local/cuda-12.8
7
+ export PATH="$CUDA_HOME/bin:$PATH"
8
+ export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
8
9
 
9
10
  # EFA and OpenMPI environment for multi-node GPU communication
10
11
  export PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:$PATH"
@@ -31,5 +32,4 @@ export CCACHE_DIR="/ccache_shared"
31
32
  export CCACHE_MAXSIZE="10G"
32
33
 
33
34
  # Claude Code configuration for Bedrock
34
- export CLAUDE_CODE_USE_BEDROCK=1
35
- export ANTHROPIC_MODEL="us.anthropic.claude-sonnet-4-20250514-v1:0"
35
+ export CLAUDE_CODE_USE_BEDROCK=1
@@ -83,13 +83,13 @@ resource "aws_iam_role_policy" "eks_node_bedrock_policy" {
83
83
  Effect = "Allow"
84
84
  Action = [
85
85
  "bedrock:InvokeModel",
86
- "bedrock:InvokeModelWithResponseStream"
87
- ]
88
- Resource = [
89
- "arn:aws:bedrock:*:*:foundation-model/anthropic.claude-*",
90
- "arn:aws:bedrock:*:*:inference-profile/us.anthropic.claude-*",
91
- "arn:aws:bedrock:*:*:inference-profile/global.anthropic.claude-*"
86
+ "bedrock:InvokeModelWithResponseStream",
87
+ "bedrock:ListInferenceProfiles",
88
+ "bedrock:GetInferenceProfile",
89
+ "bedrock:ListFoundationModels",
90
+ "bedrock-mantle:*"
92
91
  ]
92
+ Resource = "*"
93
93
  },
94
94
  {
95
95
  Effect = "Allow"
@@ -184,7 +184,8 @@ locals {
184
184
  "t4" = "t4"
185
185
  "t4-az2" = "t4" # Both t4 and t4-az2 should be labeled as "t4" in Kubernetes
186
186
  "l4" = "l4"
187
- "a10g" = "a10g"
187
+ "a10g" = "a10g"
188
+ "rtxpro6000" = "rtxpro6000"
188
189
  "h100" = "h100"
189
190
  "h200" = "h200"
190
191
  "b200" = "b200"
@@ -228,29 +228,29 @@ NGINXCONF
228
228
  fi
229
229
  done
230
230
 
231
- # Create bare .git tarball (much faster - no checkout needed!)
232
- echo "[CACHE] Creating pytorch .git tarball..."
231
+ # Create tarballs for main repo + ALL submodules
232
+ # Naming convention: org_repo-git.tar.gz (matches git-clone-cached client)
233
+ echo "[CACHE] Creating tarballs..."
233
234
  cd /git-cache
234
- rm -f pytorch-git.tar.gz.tmp
235
235
 
236
- # Just tar up the bare repo (pack files only, no working tree)
237
- # Client will do git checkout after download (unavoidable anyway)
238
- tar -czf pytorch-git.tar.gz.tmp -C /git-cache pytorch.git
239
- mv pytorch-git.tar.gz.tmp pytorch-git.tar.gz
236
+ # Main pytorch repo name must match org_repo convention
237
+ echo "[CACHE] Creating pytorch_pytorch-git.tar.gz..."
238
+ rm -f pytorch_pytorch-git.tar.gz.tmp
239
+ tar -czf pytorch_pytorch-git.tar.gz.tmp -C /git-cache pytorch.git
240
+ mv pytorch_pytorch-git.tar.gz.tmp pytorch_pytorch-git.tar.gz
241
+ SIZE=$(du -sh pytorch_pytorch-git.tar.gz | awk '{print $1}')
242
+ echo "[CACHE] pytorch_pytorch: $SIZE"
240
243
 
241
- SIZE=$(du -sh pytorch-git.tar.gz | awk '{print $1}')
242
- echo "[CACHE] Bare .git tarball created: $SIZE"
243
-
244
- # Create tarballs for largest submodules (top 10 by size)
245
- echo "[CACHE] Creating submodule tarballs..."
246
- for repo in $(du -s /git-cache/*.git 2>/dev/null | sort -rn | head -11 | tail -10 | awk '{print $2}'); do
244
+ # All submodule repos (already named org_repo.git by init container)
245
+ for repo in /git-cache/*.git; do
247
246
  name=$(basename "$repo")
247
+ [ "$name" = "pytorch.git" ] && continue
248
248
  tarball="$${name%.git}-git.tar.gz"
249
249
  echo "[CACHE] Creating $tarball..."
250
250
  rm -f "$tarball.tmp" 2>/dev/null
251
251
  tar -czf "$tarball.tmp" -C /git-cache "$name" 2>/dev/null && mv "$tarball.tmp" "$tarball" || echo "[CACHE] WARNING: Failed to create $tarball"
252
252
  done
253
- echo "[CACHE] Submodule tarballs created"
253
+ echo "[CACHE] All tarballs created"
254
254
  fi
255
255
 
256
256
  echo "[CACHE] Refresh complete at $(date). Next in 3600s (1 hour)..."
@@ -150,7 +150,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
150
150
  else:
151
151
  available_gpus = total_gpus
152
152
  else:
153
- # GPU nodes - use existing logic
153
+ # GPU nodes - use K8s schedulable node count for total if available
154
154
  total_gpus = running_instances * gpus_per_instance
155
155
  logger.info(
156
156
  f"ASG calculation: {running_instances} instances * {gpus_per_instance} GPUs = {total_gpus} total GPUs")
@@ -181,6 +181,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
181
181
  nodes = v1.list_node(label_selector=f"GpuType={gpu_type}")
182
182
 
183
183
  single_node_max = 0 # Max available on any single node
184
+ schedulable_total_gpus = 0 # Total GPUs on schedulable (non-cordoned) nodes
184
185
  for node in nodes.items:
185
186
  if is_node_ready_and_schedulable(node):
186
187
  available_on_node = get_available_gpus_on_node(v1, node)
@@ -192,6 +193,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
192
193
  except (ValueError, TypeError):
193
194
  pass
194
195
 
196
+ schedulable_total_gpus += total_on_node
197
+
195
198
  # Track max available on any single node
196
199
  single_node_max = max(single_node_max, available_on_node)
197
200
 
@@ -199,6 +202,8 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
199
202
  if total_on_node > 0 and available_on_node == total_on_node:
200
203
  full_nodes_available += 1
201
204
 
205
+ total_gpus = schedulable_total_gpus
206
+
202
207
  # Calculate max reservable considering multinode scenarios
203
208
  # Only high-end GPU types support multinode (up to 4 nodes = 32 GPUs)
204
209
  multinode_gpu_types = ['h100', 'h200', 'b200', 'a100']
@@ -49,7 +49,7 @@ DEFAULT_TIMEOUT_HOURS = int(os.environ["DEFAULT_TIMEOUT_HOURS"])
49
49
  QUEUE_URL = os.environ["QUEUE_URL"]
50
50
  PRIMARY_AVAILABILITY_ZONE = os.environ["PRIMARY_AVAILABILITY_ZONE"]
51
51
  GPU_DEV_CONTAINER_IMAGE = os.environ.get(
52
- "GPU_DEV_CONTAINER_IMAGE", "pytorch/pytorch:2.8.0-cuda12.9-cudnn9-devel")
52
+ "GPU_DEV_CONTAINER_IMAGE", "pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel")
53
53
  EFS_SECURITY_GROUP_ID = os.environ.get("EFS_SECURITY_GROUP_ID")
54
54
  EFS_SUBNET_IDS = os.environ.get("EFS_SUBNET_IDS", "").split(
55
55
  ",") if os.environ.get("EFS_SUBNET_IDS") else []
@@ -66,6 +66,7 @@ GPU_CONFIG = {
66
66
  "t4": {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0},
67
67
  "l4": {"instance_type": "g6.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
68
68
  "a10g": {"instance_type": "g5.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
69
+ "rtxpro6000": {"instance_type": "g7e.24xlarge", "max_gpus": 4, "cpus": 96, "memory_gb": 1024, "efa_count": 2},
69
70
  "t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
70
71
  "g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
71
72
  "a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
@@ -2150,7 +2151,7 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
2150
2151
  gpu_type = request.get("gpu_type", "")
2151
2152
 
2152
2153
  # Validate GPU type
2153
- valid_gpu_types = ["t4", "l4", "a10g", "t4-small", "a100",
2154
+ valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
2154
2155
  "h100", "h200", "b200", "cpu-arm", "cpu-x86"]
2155
2156
  if gpu_type not in valid_gpu_types:
2156
2157
  error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
@@ -2381,6 +2382,7 @@ def update_gpu_availability_table(
2381
2382
  "t4": {"gpus_per_instance": 4},
2382
2383
  "l4": {"gpus_per_instance": 4},
2383
2384
  "a10g": {"gpus_per_instance": 4},
2385
+ "rtxpro6000": {"gpus_per_instance": 4},
2384
2386
  "a100": {"gpus_per_instance": 8},
2385
2387
  "h100": {"gpus_per_instance": 8},
2386
2388
  "h200": {"gpus_per_instance": 8},
@@ -4570,7 +4572,7 @@ EOFREADME
4570
4572
 
4571
4573
  cat > /usr/local/bin/git-clone-cached << 'GITCACHESCRIPT'
4572
4574
  #!/bin/bash
4573
- # Clones from bare .git tarball if available in cache (10x faster than git protocol)
4575
+ # Clones repo + submodules from in-cluster cache (much faster than GitHub)
4574
4576
  CACHE_URL="http://git-cache.management.svc.cluster.local:8080"
4575
4577
  GIT="/usr/bin/git"
4576
4578
  GITHUB_URL="${{1}}"
@@ -4582,59 +4584,95 @@ if [ -z "$GITHUB_URL" ]; then
4582
4584
  DEST="${{DEST:-pytorch}}"
4583
4585
  fi
4584
4586
 
4587
+ # Handle short names: "pytorch" -> "https://github.com/pytorch/pytorch.git"
4588
+ if [[ ! "$GITHUB_URL" =~ ^https?:// ]] && [[ ! "$GITHUB_URL" =~ ^git@ ]]; then
4589
+ GITHUB_URL="https://github.com/pytorch/$GITHUB_URL.git"
4590
+ DEST="${{DEST:-${{1}}}}"
4591
+ fi
4592
+
4585
4593
  # Extract org/repo from GitHub URL and create cache tarball name
4586
- # https://github.com/pytorch/pytorch.git -> pytorch_pytorch-git.tar.gz
4587
- # https://github.com/ROCm/aiter.git -> ROCm_aiter-git.tar.gz
4588
4594
  if [[ "$GITHUB_URL" =~ github\.com[/:]([^/]+)/([^/\.]+) ]]; then
4589
4595
  ORG="${{BASH_REMATCH[1]}}"
4590
4596
  REPO="${{BASH_REMATCH[2]}}"
4591
4597
  TARBALL="${{ORG}}_${{REPO}}-git.tar.gz"
4592
4598
  else
4593
- # Not a GitHub URL, fall back to direct clone
4594
- exec "$GIT" clone "$GITHUB_URL" "${{DEST:+"$DEST"}}"
4595
- fi
4596
-
4597
- # Default destination to repo name if not specified
4598
- if [ -z "$DEST" ]; then
4599
- DEST="$REPO"
4599
+ exec "$GIT" clone --recurse-submodules --jobs 8 "$GITHUB_URL" "${{DEST:+"$DEST"}}"
4600
4600
  fi
4601
4601
 
4602
- if [ -d "$DEST" ]; then
4603
- echo "Error: $DEST already exists"
4604
- exit 1
4605
- fi
4602
+ if [ -z "$DEST" ]; then DEST="$REPO"; fi
4603
+ if [ -d "$DEST" ]; then echo "Error: $DEST already exists"; exit 1; fi
4606
4604
 
4607
- # Try to download from cache
4608
- echo "[git-cache] Checking cache for $ORG/$REPO..."
4605
+ echo "[git-cache] Cloning $ORG/$REPO..."
4609
4606
  TOTAL_START=$(date +%s)
4610
4607
 
4608
+ # --- Main repo ---
4611
4609
  mkdir -p "$DEST/.git"
4612
4610
  START=$(date +%s)
4613
4611
  if curl -sf "$CACHE_URL/$TARBALL" | tar -xz -C "$DEST/.git" --strip-components=1 2>/dev/null; then
4614
4612
  END=$(date +%s)
4615
- echo "[git-cache] Downloaded .git in $((END - START))s"
4613
+ echo "[git-cache] Main repo .git: $((END - START))s"
4616
4614
 
4617
- # Configure as non-bare repository and set origin
4618
4615
  cd "$DEST"
4619
4616
  "$GIT" config --file .git/config core.bare false
4620
4617
  "$GIT" config --file .git/config remote.origin.url "$GITHUB_URL"
4621
4618
  "$GIT" config --file .git/config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
4622
4619
 
4623
- echo "[git-cache] Checking out working tree..."
4624
4620
  START=$(date +%s)
4625
4621
  "$GIT" checkout -f HEAD 2>/dev/null
4626
4622
  END=$(date +%s)
4627
- echo "[git-cache] Checkout took $((END - START))s"
4623
+ echo "[git-cache] Checkout: $((END - START))s"
4624
+
4625
+ # --- Submodules from cache ---
4626
+ if [ -f .gitmodules ]; then
4627
+ echo "[git-cache] Setting up submodules..."
4628
+ SUB_START=$(date +%s)
4629
+
4630
+ "$GIT" submodule init
4631
+ ABS_ROOT="$(pwd)"
4632
+
4633
+ "$GIT" config --file .gitmodules --get-regexp 'submodule\..*\.url' | while read key url; do
4634
+ name=$(echo "$key" | sed 's/^submodule\.//;s/\.url$//')
4635
+ path=$("$GIT" config --file .gitmodules "submodule.$name.path")
4636
+ [ -z "$path" ] && continue
4637
+
4638
+ COMMIT=$("$GIT" ls-tree HEAD "$path" 2>/dev/null | awk '{{print $3}}')
4639
+ [ -z "$COMMIT" ] && continue
4640
+
4641
+ if [[ "$url" =~ github\.com[/:]([^/]+)/([^/.]+) ]]; then
4642
+ SUB_TARBALL="${{BASH_REMATCH[1]}}_${{BASH_REMATCH[2]}}-git.tar.gz"
4643
+ MODULES_DIR="$ABS_ROOT/.git/modules/$name"
4644
+
4645
+ mkdir -p "$MODULES_DIR"
4646
+ if curl -sf "$CACHE_URL/$SUB_TARBALL" | tar -xz -C "$MODULES_DIR" --strip-components=1 2>/dev/null; then
4647
+ "$GIT" -C "$MODULES_DIR" config core.bare false
4648
+ "$GIT" -C "$MODULES_DIR" config core.worktree "$ABS_ROOT/$path"
4649
+ "$GIT" -C "$MODULES_DIR" config remote.origin.url "$url"
4650
+ "$GIT" -C "$MODULES_DIR" config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*"
4651
+ mkdir -p "$ABS_ROOT/$path"
4652
+ echo "gitdir: $MODULES_DIR" > "$ABS_ROOT/$path/.git"
4653
+ "$GIT" -C "$ABS_ROOT/$path" checkout -f "$COMMIT" 2>/dev/null
4654
+ else
4655
+ rm -rf "$MODULES_DIR"
4656
+ fi
4657
+ fi
4658
+ done
4659
+
4660
+ # Fetch remaining/recursive submodules from GitHub
4661
+ "$GIT" -c protocol.file.allow=always submodule update --init --recursive --jobs 8 2>/dev/null
4662
+
4663
+ SUB_END=$(date +%s)
4664
+ echo "[git-cache] Submodules: $((SUB_END - SUB_START))s"
4665
+ fi
4628
4666
 
4629
4667
  TOTAL_END=$(date +%s)
4630
- echo "[git-cache] Total: $((TOTAL_END - TOTAL_START))s (from cache)"
4668
+ echo "[git-cache] Total: $((TOTAL_END - TOTAL_START))s"
4631
4669
  exit 0
4632
4670
  fi
4633
4671
 
4634
4672
  # Fallback: clone from GitHub
4635
4673
  echo "[git-cache] Cache miss, cloning from GitHub..."
4636
4674
  rm -rf "$DEST"
4637
- "$GIT" clone "$GITHUB_URL" "${{DEST:+"$DEST"}}"
4675
+ "$GIT" clone --recurse-submodules --jobs 8 "$GITHUB_URL" "${{DEST:+"$DEST"}}"
4638
4676
  GITCACHESCRIPT
4639
4677
  chmod +x /usr/local/bin/git-clone-cached
4640
4678
  echo "[STARTUP] ✓ git-clone-cached available (opt-in: use 'git-clone-cached pytorch' for cache)"
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.4.1"
184
- MIN_CLI_VERSION = "0.4.0"
183
+ LAMBDA_VERSION = "0.5.1"
184
+ MIN_CLI_VERSION = "0.5.1"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
187
187
  }, local.alb_env_vars)
@@ -79,7 +79,7 @@ locals {
79
79
  "cpu-arm" = {
80
80
  instance_type = "c7g.4xlarge"
81
81
  instance_types = null
82
- instance_count = 3
82
+ instance_count = 1
83
83
  gpus_per_instance = 0
84
84
  use_placement_group = false
85
85
  architecture = "arm64"
@@ -88,7 +88,7 @@ locals {
88
88
  "cpu-x86" = {
89
89
  instance_type = "c7i.4xlarge"
90
90
  instance_types = null
91
- instance_count = 3
91
+ instance_count = 1
92
92
  gpus_per_instance = 0
93
93
  use_placement_group = false
94
94
  architecture = "x86_64"
@@ -97,7 +97,7 @@ locals {
97
97
  "t4" = {
98
98
  instance_type = "g4dn.12xlarge"
99
99
  instance_types = null
100
- instance_count = 2 # 2 instances in primary AZ
100
+ instance_count = 1
101
101
  gpus_per_instance = 4
102
102
  use_placement_group = true
103
103
  architecture = "x86_64"
@@ -106,7 +106,7 @@ locals {
106
106
  "t4-az2" = {
107
107
  instance_type = "g4dn.12xlarge"
108
108
  instance_types = null
109
- instance_count = 2 # 2 instances in secondary AZ
109
+ instance_count = 0 # Disabled - use primary AZ only for testing
110
110
  gpus_per_instance = 4
111
111
  use_placement_group = true
112
112
  architecture = "x86_64"
@@ -115,7 +115,7 @@ locals {
115
115
  "h100" = {
116
116
  instance_type = "p5.48xlarge"
117
117
  instance_types = null
118
- instance_count = 2 # Fallback default (not used when capacity_reservations defined)
118
+ instance_count = 0 # Disabled - only use via CR when needed
119
119
  gpus_per_instance = 8
120
120
  use_placement_group = false
121
121
  architecture = "x86_64"
@@ -124,7 +124,7 @@ locals {
124
124
  "t4-small" = {
125
125
  instance_type = "g4dn.2xlarge"
126
126
  instance_types = null
127
- instance_count = 1
127
+ instance_count = 0 # Disabled
128
128
  gpus_per_instance = 1
129
129
  use_placement_group = false
130
130
  architecture = "x86_64"
@@ -183,7 +183,7 @@ locals {
183
183
  "t4" = {
184
184
  instance_type = "g4dn.12xlarge"
185
185
  instance_types = null
186
- instance_count = 5 # Fallback default (not used when capacity_reservations defined)
186
+ instance_count = 2
187
187
  gpus_per_instance = 4
188
188
  use_placement_group = true
189
189
  architecture = "x86_64"
@@ -192,7 +192,7 @@ locals {
192
192
  "l4" = {
193
193
  instance_type = "g6.12xlarge"
194
194
  instance_types = null
195
- instance_count = 5 # Fallback default (not used when capacity_reservations defined)
195
+ instance_count = 2
196
196
  gpus_per_instance = 4 # 4x L4 GPUs
197
197
  use_placement_group = false
198
198
  architecture = "x86_64"
@@ -201,16 +201,25 @@ locals {
201
201
  "a10g" = {
202
202
  instance_type = "g5.12xlarge"
203
203
  instance_types = null
204
- instance_count = 2
204
+ instance_count = 1
205
205
  gpus_per_instance = 4 # 4x A10G GPUs
206
206
  use_placement_group = false
207
207
  architecture = "x86_64"
208
208
  efa_network_cards = 1
209
209
  }
210
+ "rtxpro6000" = {
211
+ instance_type = "g7e.24xlarge"
212
+ instance_types = null
213
+ instance_count = 2
214
+ gpus_per_instance = 4 # 4x RTX PRO 6000 Blackwell GPUs
215
+ use_placement_group = false
216
+ architecture = "x86_64"
217
+ efa_network_cards = 2
218
+ }
210
219
  "cpu-arm" = {
211
220
  instance_type = "c7g.8xlarge"
212
221
  instance_types = null
213
- instance_count = 30
222
+ instance_count = 10
214
223
  gpus_per_instance = 0
215
224
  use_placement_group = false
216
225
  architecture = "arm64"
@@ -219,7 +228,7 @@ locals {
219
228
  "cpu-x86" = {
220
229
  instance_type = "c7i.8xlarge"
221
230
  instance_types = null
222
- instance_count = 30
231
+ instance_count = 10
223
232
  gpus_per_instance = 0
224
233
  use_placement_group = false
225
234
  architecture = "x86_64"
@@ -257,6 +266,7 @@ locals {
257
266
  h100 = [
258
267
  { key = "cr0", id = "cr-0a3f49b96fe03ca04", instance_count = 4 }, # H100 reservation us-east-2c (p5.48xlarge)
259
268
  { key = "cr1", id = null, instance_count = 2 }, # H100 on-demand (2 instances)
269
+ { key = "cr2", id = "cr-044bc72b0a6b56062", instance_count = 4 }, # H100 reservation us-east-2a (4 instances)
260
270
  ]
261
271
  h200 = [
262
272
  { key = "cr0", id = "cr-0f6d0766f5d3339e6", instance_count = 2 }, # H200 capacity block (may be expired - keep to prevent ASG destroy)
@@ -264,9 +274,10 @@ locals {
264
274
  { key = "cr2", id = null, instance_count = 2 }, # H200 on-demand (2 instances)
265
275
  ]
266
276
  b200 = [
267
- { key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 0 }, # B200 reservation (disabled - CR expired)
277
+ { key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 0 }, # B200 reservation us-east-2a (disabled - CR freed)
268
278
  { key = "cr1", id = "cr-08e7fee0b8dc3de5e", instance_count = 3 }, # B200 reservation (3 instances)
269
279
  { key = "cr2", id = null, instance_count = 2 }, # B200 on-demand (2 instances)
280
+ { key = "cr3", id = "cr-0f5f6bb30a8fe3c68", instance_count = 2 }, # B200 reservation us-east-2b (2 instances)
270
281
  ]
271
282
  # T4 and L4 don't have capacity reservations - managed via supported_gpu_types fallback
272
283
  }
@@ -291,7 +302,8 @@ locals {
291
302
  a100 = "primary"
292
303
  t4 = "primary"
293
304
  l4 = "secondary"
294
- a10g = "secondary"
305
+ a10g = "secondary"
306
+ rtxpro6000 = "secondary"
295
307
  "cpu-arm" = "primary"
296
308
  "cpu-x86" = "primary"
297
309
  }
@@ -307,11 +319,13 @@ locals {
307
319
  "cr-0c366fb8339a10f69" = "primary" # us-east-2a
308
320
  "cr-0122dff5e01d566dc" = "secondary" # us-east-2b
309
321
  "cr-08e7fee0b8dc3de5e" = "secondary" # us-east-2b
322
+ "cr-0f5f6bb30a8fe3c68" = "secondary" # us-east-2b
310
323
  # H200 capacity reservations
311
324
  "cr-0f6d0766f5d3339e6" = "tertiary" # us-east-2c (may be expired - kept to prevent ASG destroy)
312
325
  "cr-06c9c978dea756a26" = "tertiary" # us-east-2c
313
- # H100 capacity reservation
326
+ # H100 capacity reservations
314
327
  "cr-0a3f49b96fe03ca04" = "tertiary" # us-east-2c (p5.48xlarge)
328
+ "cr-044bc72b0a6b56062" = "primary" # us-east-2a (p5.48xlarge)
315
329
  # A100 capacity reservation
316
330
  "cr-01cc0f00f28b095af" = "primary" # us-east-2a
317
331
  }
@@ -11,7 +11,7 @@ systemctl disable nodeadm-run.service || true
11
11
  systemctl stop nodeadm-config.service || true
12
12
  systemctl stop nodeadm-run.service || true
13
13
 
14
- # Install NVIDIA driver 580.82.07 directly on host for CUDA 13 support
14
+ # Install latest NVIDIA driver on host (595.x branch supports CUDA 13.2)
15
15
  # GPU Operator will handle toolkit/device-plugin only
16
16
 
17
17
  # Configure NVIDIA profiling BEFORE driver installation (driver install auto-loads modules)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes