gpu-dev 0.5.31__tar.gz → 0.5.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/CLAUDE.md +52 -0
  2. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/PKG-INFO +1 -1
  3. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  4. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -4
  5. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +65 -30
  6. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +1 -1
  7. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +40 -23
  8. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +14 -2
  9. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/pyproject.toml +1 -1
  10. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ami-baker.tf +22 -3
  11. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/availability.tf +1 -1
  12. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/Dockerfile +9 -9
  13. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ecr.tf +73 -4
  14. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/eks.tf +45 -5
  15. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/expiry.tf +1 -1
  16. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/kubernetes.tf +13 -13
  17. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/availability_updater/index.py +7 -5
  18. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_processor/index.py +80 -21
  19. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda.tf +1 -1
  20. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/main.tf +26 -2
  21. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy-service.tf +8 -7
  22. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/al2023-user-data.sh +15 -6
  23. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +4 -3
  24. gpu_dev-0.5.31/PROGRESS.md +0 -288
  25. gpu_dev-0.5.31/PR_DESCRIPTION.md +0 -168
  26. gpu_dev-0.5.31/TODO.md +0 -64
  27. gpu_dev-0.5.31/post.md +0 -233
  28. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/.github/workflows/no-gitlinks.yml +0 -0
  29. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/.github/workflows/publish.yml +0 -0
  30. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/.gitignore +0 -0
  31. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/README.md +0 -0
  32. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/admin/README.md +0 -0
  33. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/admin/generate_stats.py +0 -0
  34. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/admin/requirements.txt +0 -0
  35. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/README.md +0 -0
  36. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  37. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  38. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  39. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  40. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  41. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  42. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  43. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  44. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  45. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  46. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  47. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  48. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/docs/USER_GUIDE.md +0 -0
  49. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/docs/devgpu-features.html +0 -0
  50. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/docs/docker-mark-blue.svg +0 -0
  51. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/docs/icons8-cursor-ai.svg +0 -0
  52. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/setup.cfg +0 -0
  53. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  54. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  55. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/README.md +0 -0
  56. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/alb.tf +0 -0
  57. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/backend.tf +0 -0
  58. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/check_b200.py +0 -0
  59. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  60. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  61. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  62. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  63. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/bash_profile +0 -0
  64. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/bashrc +0 -0
  65. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  66. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  67. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  68. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  69. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/motd_script +0 -0
  70. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  71. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/profile +0 -0
  72. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  73. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  74. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  75. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/shell_env +0 -0
  76. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/ssh_config +0 -0
  77. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/zprofile +0 -0
  78. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/zshrc +0 -0
  79. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  80. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker-build.tf +0 -0
  81. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  82. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  83. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/efs.tf +0 -0
  84. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/git-cache.tf +0 -0
  85. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  86. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  87. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  88. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  89. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  90. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  91. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  92. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  93. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  94. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  95. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  96. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  97. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  98. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  99. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/list_b200.py +0 -0
  100. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/mig-config.tf +0 -0
  101. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  102. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  103. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  104. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  105. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  106. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  107. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/monitoring.tf +0 -0
  108. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  109. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/outputs.tf +0 -0
  110. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/pyproject.toml +0 -0
  111. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/queue.tf +0 -0
  112. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/route53.tf +0 -0
  113. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  114. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  115. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  116. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  117. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  118. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  119. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  120. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  121. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  122. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  123. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/switch-to.sh +0 -0
  124. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  125. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  126. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  127. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/terraform-gpu-devservers/variables.tf +0 -0
  128. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/tests/submit/README.md +0 -0
  129. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/tests/submit/fail/run.sh +0 -0
  130. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/tests/submit/multinode/run.sh +0 -0
  131. {gpu_dev-0.5.31 → gpu_dev-0.5.32}/tests/submit/success/run.sh +0 -0
@@ -183,6 +183,55 @@ kubectl port-forward -n monitoring svc/kube-prometheus-stack-prometheus 9090:909
183
183
  kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
184
184
  ```
185
185
 
186
+ ## Multi-Region Single-State Refactor (Research Notes, May 2026)
187
+
188
+ **Goal:** One `tf apply` manages all regions. No more `tf-all`, no double Docker builds, no double AMI bakes.
189
+
190
+ **Approach:** Module-per-region pattern.
191
+ ```hcl
192
+ # root main.tf
193
+ module "us_east_2" {
194
+ source = "./modules/region"
195
+ region = "us-east-2"
196
+ gpu_types = { h100 = {...}, b200 = {...}, ... }
197
+ spot_types = []
198
+ providers = { aws = aws.us_east_2 }
199
+ }
200
+ module "us_east_1" {
201
+ source = "./modules/region"
202
+ region = "us-east-1"
203
+ gpu_types = { b300 = {...}, t4 = {...}, ... }
204
+ spot_types = ["b300", "b200", "h100", ...]
205
+ providers = { aws = aws.us_east_1 }
206
+ }
207
+ ```
208
+
209
+ **What goes in the module:** VPC, subnets, EKS cluster, ASGs, launch templates, Lambda functions, DDB tables, EFS, monitoring, DNS. Basically everything in the current root except provider config and shared resources.
210
+
211
+ **What stays at root:** Provider blocks with aliases, ECR replication config, AMI copy (`aws_ami_copy` from primary to secondary regions), global IAM roles if any, CLI config.
212
+
213
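+ **AMI sharing:** Build the baked AMI once in us-east-2 (primary), then `aws_ami_copy` it to the other regions. One build, replicated. The `ami_baker` stays in root and outputs the primary AMI ID; because a copied AMI gets a different ID in each region, each module receives its own region's AMI ID (the original in us-east-2, the `aws_ami_copy` result elsewhere) as a variable.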
214
+
215
+ **Docker sharing:** ECR replication already set up. Docker builds once in primary region, auto-replicates.
216
+
217
+ **Migration plan (since nobody uses east1 yet):**
218
+ 1. `tofu workspace select prod-east1 && tofu destroy` — clean slate
219
+ 2. Move all resources into `modules/region/`
220
+ 3. Create provider aliases in root
221
+ 4. Import prod (us-east-2) resources into new module state: `tofu import module.us_east_2.aws_vpc.gpu_dev_vpc vpc-xxx`
222
+ 5. Add us-east-1 module — fresh create, no import needed
223
+ 6. Delete workspace: `tofu workspace delete prod-east1`
224
+
225
+ **Risks:**
226
+ - Import step for prod is tedious (~50+ resources) but mechanical
227
+ - Lambda zip paths need to be relative to module, not root
228
+ - EKS auth (aws-auth ConfigMap) is per-cluster — each module manages its own
229
+ - CLI needs to know which region to query — already handled by config
230
+
231
+ **Estimated effort:** 1 dedicated session (~4-6 hours). Most of the time will go to the module extraction and the prod import.
232
+
233
+ **Prerequisite for:** Adding us-west-1, us-west-2, or any future region (becomes one module block each).
234
+
186
235
  ## Recent Fixes (Oct 27, 2025)
187
236
 
188
237
  **NVIDIA Profiling Bootstrap Configuration (Oct 27, 2025):**
@@ -232,6 +281,9 @@ kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
232
281
 
233
282
  ### 📋 Remaining Tasks
234
283
 
284
+ - **Merge multi-region into single tf state** - HIGH PRIORITY. Kill prod-east1 workspace, refactor into module-per-region in one state. See the "Multi-Region Single-State Refactor" research notes above. Enables: one `tf apply`, shared AMI (aws_ami_copy), shared Docker (ECR replication already set up), no double builds. Prerequisite for adding west regions.
285
+ - **Add us-west-1 and us-west-2 spot regions** - BLOCKED on single-state refactor. After refactor, adding a region = adding one module block.
286
+ - **Spot UX improvements** - Queue position should be #1 for each type (not cross-type FIFO). Status should show "queued (waiting for capacity)" not just "queued". Interactive picker should show spot GPU counts from east1 not prod.
235
287
  - **FQDN for devservers** - Set up proper domain names for development server access
236
288
  - **Automated SSH config per reservation** - ✅ DONE - Each reservation now gets `~/.devgpu/<reservation_id>-sshconfig` file, use with `ssh -F ~/.devgpu/<reservation_id>-sshconfig <pod_name>`
237
289
  - **Custom Docker image scaffold** - Create Dockerfile with pre-installed packages (Jupyter, etc.)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.31
3
+ Version: 0.5.32
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.31
3
+ Version: 0.5.32
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,10 +1,6 @@
1
1
  .gitignore
2
2
  CLAUDE.md
3
- PROGRESS.md
4
- PR_DESCRIPTION.md
5
3
  README.md
6
- TODO.md
7
- post.md
8
4
  pyproject.toml
9
5
  .github/workflows/no-gitlinks.yml
10
6
  .github/workflows/publish.yml
@@ -526,7 +526,7 @@ def main(ctx: click.Context) -> None:
526
526
  "--gpu-type",
527
527
  "-t",
528
528
  type=click.Choice(
529
- ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
529
+ ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"], case_sensitive=False
530
530
  ),
531
531
  help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
532
532
  )
@@ -698,6 +698,7 @@ def reserve(
698
698
  "b300": {"max_gpus": 8, "instance_type": "p6-b300.48xlarge"},
699
699
  "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
700
700
  "cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
701
+ "cpu-spot": {"max_gpus": 0, "instance_type": "c7i.2xlarge"},
701
702
  }
702
703
 
703
704
  # Early validation of GPU type to extract max_gpus (needed for disk selection)
@@ -1418,7 +1419,7 @@ def reserve(
1418
1419
 
1419
1420
  _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
1420
1421
  "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
1421
- "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
1422
+ "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"]
1422
1423
 
1423
1424
 
1424
1425
  @main.command(context_settings={"ignore_unknown_options": True})
@@ -1837,7 +1838,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1837
1838
  ended = item.get("reservation_ended") or item.get("expired_at") or item.get("created_at", "")
1838
1839
  if ended and ended < one_hour_ago:
1839
1840
  continue
1840
- item["_region"] = "us-east-1"
1841
+ item["_region"] = "east1"
1841
1842
  results.append(item)
1842
1843
  return results
1843
1844
  except Exception:
@@ -1847,11 +1848,45 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1847
1848
  active_future = executor.submit(fetch_active)
1848
1849
  failures_future = executor.submit(fetch_recent_failures)
1849
1850
  east1_future = executor.submit(fetch_east1)
1850
- reservations = active_future.result() + failures_future.result() + east1_future.result()
1851
+ prod_results = active_future.result() + failures_future.result()
1852
+ for r in prod_results:
1853
+ if "_region" not in r:
1854
+ r["_region"] = "prod"
1855
+ east1_results = east1_future.result()
1856
+ for r in east1_results:
1857
+ if "_region" not in r:
1858
+ r["_region"] = "east1"
1859
+ reservations = prod_results + east1_results
1851
1860
  else:
1852
- reservations = reservation_mgr.list_reservations(
1861
+ prod_res = reservation_mgr.list_reservations(
1853
1862
  user_filter=user_filter, statuses_to_include=statuses_to_include
1854
1863
  )
1864
+ for r in prod_res:
1865
+ if "_region" not in r:
1866
+ r["_region"] = "prod"
1867
+ east1_res = fetch_east1() if not status else []
1868
+ if not east1_res:
1869
+ try:
1870
+ east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
1871
+ if east1_env and config.user_config.get("environment") == "prod":
1872
+ import boto3 as _b3
1873
+ east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
1874
+ east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
1875
+ for s in (statuses_to_include or ["active", "preparing", "queued", "pending"]):
1876
+ resp = east1_table.query(
1877
+ IndexName="StatusIndex",
1878
+ KeyConditionExpression="#s = :status",
1879
+ ExpressionAttributeNames={"#s": "status"},
1880
+ ExpressionAttributeValues={":status": s},
1881
+ )
1882
+ for item in resp.get("Items", []):
1883
+ if user_filter and item.get("user_id") != user_filter:
1884
+ continue
1885
+ item["_region"] = "east1"
1886
+ east1_res.append(item)
1887
+ except Exception:
1888
+ pass
1889
+ reservations = prod_res + east1_res
1855
1890
  except RuntimeError as e:
1856
1891
  rprint(f"[red]❌ {str(e)}[/red]")
1857
1892
  return False
@@ -1883,7 +1918,8 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1883
1918
 
1884
1919
  # Create table with enhanced columns for queue info
1885
1920
  # Check if we have cross-region reservations
1886
- _has_east1 = any(r.get("_region") == "us-east-1" for r in reservations)
1921
+ _regions = frozenset(r.get("_region", "") for r in reservations if r.get("_region"))
1922
+ _has_multi_region = len(_regions) > 1 or "east1" in _regions
1887
1923
 
1888
1924
  table = Table(title="GPU Reservations")
1889
1925
  table.add_column("ID", style="cyan", no_wrap=True)
@@ -1894,7 +1930,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1894
1930
  table.add_column("Queue Info", style="cyan")
1895
1931
  table.add_column("Created", style="blue")
1896
1932
  table.add_column("Expires/ETA", style="red")
1897
- if _has_east1:
1933
+ if _has_multi_region:
1898
1934
  table.add_column("Region", style="dim")
1899
1935
  if details:
1900
1936
  table.add_column("CLI Ver", style="dim", no_wrap=True)
@@ -1935,13 +1971,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1935
1971
  # Use the new helper that shows time + remaining
1936
1972
  expires_formatted = _format_expires_with_remaining(expires_at)
1937
1973
  elif res_status in ["queued", "pending"]:
1938
- # Show estimated wait time if available
1939
1974
  estimated_wait = reservation.get(
1940
1975
  "estimated_wait_minutes", "?")
1941
- if estimated_wait != "?" and estimated_wait is not None:
1976
+ if estimated_wait and estimated_wait not in ("?", "None", None):
1942
1977
  expires_formatted = f"~{estimated_wait}min"
1943
1978
  else:
1944
- expires_formatted = "Calculating..."
1979
+ expires_formatted = "Waiting..."
1945
1980
  elif res_status in ("expired", "failed", "cancelled"):
1946
1981
  reason = reservation.get("failure_reason", "")
1947
1982
  ended = reservation.get("reservation_ended") or reservation.get("expired_at", "")
@@ -1968,15 +2003,11 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1968
2003
  # Format queue info for queued reservations
1969
2004
  queue_info = ""
1970
2005
  if res_status in ["queued", "pending"]:
1971
- queue_position = reservation.get("queue_position", "?")
1972
- estimated_wait = reservation.get(
1973
- "estimated_wait_minutes", "?")
1974
- if queue_position != "?" and queue_position is not None:
1975
- queue_info = f"#{queue_position}"
1976
- if estimated_wait != "?" and estimated_wait is not None:
1977
- queue_info += f" (~{estimated_wait}min)"
2006
+ detail = reservation.get("current_detailed_status") or reservation.get("detailed_status") or ""
2007
+ if "capacity" in detail.lower() or "spot" in detail.lower():
2008
+ queue_info = "Waiting for spot"
1978
2009
  else:
1979
- queue_info = "Calculating..."
2010
+ queue_info = "Spot pending"
1980
2011
  elif res_status == "active":
1981
2012
  # Show pod IP for multinode, SSH hint for single-node
1982
2013
  pod_ip = reservation.get("pod_ip", "")
@@ -2099,9 +2130,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
2099
2130
  row_data.append(
2100
2131
  f"[dim]{lambda_version_display}[/dim]" if dim_row else lambda_version_display)
2101
2132
 
2102
- if _has_east1:
2103
- region = reservation.get("_region", "us-east-2")
2104
- row_data.append("[yellow]east1[/yellow]" if region == "us-east-1" else "prod")
2133
+ if _has_multi_region:
2134
+ region = reservation.get("_region", "prod")
2135
+ if region in ("us-east-1", "east1"):
2136
+ row_data.append("[yellow]east1[/yellow]")
2137
+ else:
2138
+ row_data.append("prod")
2105
2139
 
2106
2140
  table.add_row(*row_data)
2107
2141
 
@@ -2279,8 +2313,11 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
2279
2313
 
2280
2314
  queue_info = ""
2281
2315
  if res_status in ["queued", "pending"]:
2282
- queue_position = reservation.get("queue_position", "?")
2283
- queue_info = f"#{queue_position}" if queue_position != "?" else "Calculating..."
2316
+ detail = reservation.get("current_detailed_status") or reservation.get("detailed_status") or ""
2317
+ if "capacity" in detail.lower() or "spot" in detail.lower():
2318
+ queue_info = "Waiting for spot"
2319
+ else:
2320
+ queue_info = "Spot pending"
2284
2321
  elif res_status == "active":
2285
2322
  queue_info = "Ready"
2286
2323
 
@@ -2313,10 +2350,10 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
2313
2350
  expires_formatted = _format_expires_with_remaining(expires_at)
2314
2351
  elif res_status in ["queued", "pending"]:
2315
2352
  estimated_wait = reservation.get("estimated_wait_minutes", "?")
2316
- if estimated_wait != "?" and estimated_wait is not None:
2353
+ if estimated_wait and estimated_wait not in ("?", "None", None):
2317
2354
  expires_formatted = f"~{estimated_wait}min"
2318
2355
  else:
2319
- expires_formatted = "Calculating..."
2356
+ expires_formatted = "Waiting..."
2320
2357
  else:
2321
2358
  expires_formatted = "N/A"
2322
2359
 
@@ -2971,7 +3008,7 @@ def _show_availability() -> None:
2971
3008
  spot_table.add_column("Avail\nNow", style="green")
2972
3009
  spot_table.add_column("Per\nNode", style="bright_green")
2973
3010
  spot_table.add_column("Status", style="magenta")
2974
- spot_table.add_column("Availability", style="dim")
3011
+ spot_table.add_column("Spot Discount", style="dim")
2975
3012
  _on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
2976
3013
  for gt, info in sorted(spot_region_info.items()):
2977
3014
  avail = info.get("available", 0)
@@ -2981,14 +3018,12 @@ def _show_availability() -> None:
2981
3018
  si = info.get("spot_info", {}) or {}
2982
3019
  sp = si.get("spot_price", "") if isinstance(si, dict) else ""
2983
3020
  if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
2984
- avail_signal = "[red]Not offered[/red]"
3021
+ avail_signal = "[green]Available[/green]" if avail > 0 else "[dim]No price data[/dim]"
2985
3022
  else:
2986
3023
  try:
2987
3024
  ratio = float(sp) / _on_demand.get(gt, 50)
2988
3025
  pct = int((1 - ratio) * 100)
2989
- if ratio < 0.4: avail_signal = f"[green]High ({pct}% off)[/green]"
2990
- elif ratio < 0.7: avail_signal = f"[yellow]Medium ({pct}% off)[/yellow]"
2991
- else: avail_signal = f"[red]Low ({pct}% off)[/red]"
3026
+ avail_signal = f"[green]{pct}% off on-demand[/green]" if pct > 0 else "[dim]At on-demand price[/dim]"
2992
3027
  except (ValueError, TypeError):
2993
3028
  avail_signal = "[yellow]Unknown[/yellow]"
2994
3029
  spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
@@ -26,7 +26,7 @@ class Config:
26
26
  "region": "us-east-1",
27
27
  "workspace": "prod-east1",
28
28
  "description": "Spot-only us-east-1 environment (T4/L4/CPU)",
29
- "spot_types": ["b300", "b200", "h200", "h100", "a100"],
29
+ "spot_types": ["b300", "b200", "h200", "h100", "a100", "t4", "l4", "rtxpro6000"],
30
30
  },
31
31
  }
32
32
  DEFAULT_ENVIRONMENT = "prod"
@@ -52,11 +52,19 @@ def check_interactive_support() -> bool:
52
52
 
53
53
  def select_gpu_type_interactive(
54
54
  availability_info: Dict[str, Dict[str, Any]],
55
+ _refresh: bool = False,
55
56
  ) -> Optional[str]:
56
57
  """Interactive GPU type selection with availability table"""
57
58
  if not check_interactive_support():
58
59
  return None
59
60
 
61
+ if _refresh:
62
+ from .reservations import ReservationManager
63
+ from .config import load_config
64
+ _cfg = load_config()
65
+ _mgr = ReservationManager(_cfg)
66
+ availability_info = _mgr.get_gpu_availability_by_type() or availability_info
67
+
60
68
  # Hide MIG slice SKUs from the top-level selector — reached via the h100 submenu.
61
69
  # Direct `--gpu-type h100-mig-1g` still works for non-interactive scripts.
62
70
  visible_info = {
@@ -194,7 +202,7 @@ def select_gpu_type_interactive(
194
202
  st.add_column("Avail\nNow", style="green")
195
203
  st.add_column("Per\nNode", style="bright_green")
196
204
  st.add_column("Status", style="magenta")
197
- st.add_column("Availability", style="dim")
205
+ st.add_column("Spot Discount", style="dim")
198
206
  _on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
199
207
  for gt, info in spot_gpus.items():
200
208
  avail = info.get("available", 0)
@@ -205,7 +213,7 @@ def select_gpu_type_interactive(
205
213
  # Availability signal from spot price vs on-demand
206
214
  sp = si.get("spot_price", "") if isinstance(si, dict) else ""
207
215
  if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
208
- avail_signal = "[red]Not offered[/red]"
216
+ avail_signal = "[green]Available[/green]" if avail > 0 else "[dim]No price data[/dim]"
209
217
  else:
210
218
  try:
211
219
  ratio = float(sp) / _on_demand.get(gt, 50)
@@ -266,37 +274,46 @@ def select_gpu_type_interactive(
266
274
  si_data = info.get("spot_info", {}) or {}
267
275
  sp = si_data.get("spot_price", "") if isinstance(si_data, dict) else ""
268
276
  # Derive availability signal
277
+ avail_now = int(info.get("available", 0))
269
278
  if not sp or "No spot data" in str(si_data.get("spot_signal", "")):
270
- # Not offered — skip from choices
271
- continue
272
- try:
273
- ratio = float(sp) / _on_demand.get(gt, 50)
274
- pct = int((1 - ratio) * 100)
275
- if ratio < 0.4: signal = f"🟢 High avail ({pct}% off)"
276
- elif ratio < 0.7: signal = f"🟡 Medium ({pct}% off)"
277
- else: signal = f"🔴 Low ({pct}% off)"
278
- except (ValueError, TypeError):
279
- signal = "availability unknown"
279
+ if avail_now > 0:
280
+ signal = f"🟢 {avail_now} available now"
281
+ else:
282
+ continue
283
+ else:
284
+ try:
285
+ ratio = float(sp) / _on_demand.get(gt, 50)
286
+ pct = int((1 - ratio) * 100)
287
+ if ratio < 0.4: signal = f"🟢 High avail ({pct}% off)"
288
+ elif ratio < 0.7: signal = f"🟡 Medium ({pct}% off)"
289
+ else: signal = f"🔴 Low ({pct}% off)"
290
+ except (ValueError, TypeError):
291
+ signal = "availability unknown"
280
292
  if avail > 0:
281
293
  label = f"✅ {gt.upper()} * ({avail} free, {pn}/node, {signal})"
282
294
  else:
283
295
  label = f"⚡ {gt.upper()} * ({pn} GPUs/node, {signal})"
284
296
  choices.append(questionary.Choice(title=label, value=f"spot:{gt}"))
285
297
 
286
- console.print()
298
+ choices.append(questionary.Separator("───"))
299
+ choices.append(questionary.Choice(title="🔄 Refresh availability", value="_refresh"))
287
300
 
288
- # Interactive selection console.print()
301
+ console.print()
289
302
 
290
- # Interactive selection
291
- try:
292
- answer = questionary.select(
293
- "Select GPU type:", choices=choices, style=custom_style
294
- ).ask()
303
+ # Interactive selection — loop on refresh
304
+ while True:
305
+ try:
306
+ answer = questionary.select(
307
+ "Select GPU type:", choices=choices, style=custom_style
308
+ ).ask()
295
309
 
296
- return answer
297
- except (KeyboardInterrupt, EOFError):
298
- console.print("\n[yellow]Selection cancelled.[/yellow]")
299
- return None
310
+ if answer == "_refresh":
311
+ console.print("[dim]Refreshing...[/dim]")
312
+ return select_gpu_type_interactive(availability_info, _refresh=True)
313
+ return answer
314
+ except (KeyboardInterrupt, EOFError):
315
+ console.print("\n[yellow]Selection cancelled.[/yellow]")
316
+ return None
300
317
 
301
318
 
302
319
  def _format_eta_seconds(delta_seconds: int) -> str:
@@ -826,8 +826,20 @@ class ReservationManager:
826
826
  ]
827
827
 
828
828
  if len(matching_reservations) == 0:
829
- return None
830
- elif len(matching_reservations) > 1:
829
+ # Not found by user_id — try direct lookup (for added users viewing other's reservations)
830
+ try:
831
+ from boto3.dynamodb.conditions import Key
832
+ scan_resp = self.reservations_table.scan(
833
+ FilterExpression="begins_with(reservation_id, :rid)",
834
+ ExpressionAttributeValues={":rid": reservation_id},
835
+ Limit=10,
836
+ )
837
+ matching_reservations = scan_resp.get("Items", [])
838
+ except Exception:
839
+ pass
840
+ if not matching_reservations:
841
+ return None
842
+ if len(matching_reservations) > 1:
831
843
  return None # Ambiguous - need longer prefix
832
844
 
833
845
  reservation = matching_reservations[0]
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.31"
7
+ version = "0.5.32"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -11,6 +11,7 @@ locals {
11
11
  ami_baker_trigger = sha256(join("\n", [
12
12
  data.aws_ami.eks_gpu_ami_x86_64.id,
13
13
  filesha256("${path.module}/templates/al2023-user-data.sh"),
14
+ filesha256("${path.module}/templates/ami-baker-user-data.sh"),
14
15
  local.latest_image_uri,
15
16
  ]))
16
17
  ami_baker_name = "gpu-dev-baked-${substr(local.ami_baker_trigger, 0, 8)}"
@@ -19,11 +20,11 @@ locals {
19
20
  image_uri = local.latest_image_uri
20
21
  }))
21
22
 
22
- # Use baked AMI when available, fall back to standard.
23
- gpu_ami_id = length(data.aws_ami_ids.gpu_baked.ids) > 0 ? data.aws_ami_ids.gpu_baked.ids[0] : data.aws_ami.eks_gpu_ami_x86_64.id
23
+ # Use baked AMI when available (checked AFTER baker runs), fall back to standard.
24
+ gpu_ami_id = length(data.aws_ami_ids.gpu_baked_resolved.ids) > 0 ? data.aws_ami_ids.gpu_baked_resolved.ids[0] : data.aws_ami.eks_gpu_ami_x86_64.id
24
25
  }
25
26
 
26
- # Look up existing baked AMI uses aws_ami_ids which returns [] instead of erroring
27
+ # Pre-build check: does the baked AMI already exist? Controls whether baker runs.
27
28
  data "aws_ami_ids" "gpu_baked" {
28
29
  owners = ["self"]
29
30
 
@@ -39,6 +40,24 @@ data "aws_ami_ids" "gpu_baked" {
39
40
  sort_ascending = false
40
41
  }
41
42
 
43
+ # Post-build lookup: re-reads AFTER the baker finishes, so a freshly built AMI
44
+ # is picked up in the same apply (no second apply needed).
45
+ data "aws_ami_ids" "gpu_baked_resolved" {
46
+ depends_on = [null_resource.ami_baker]
47
+ owners = ["self"]
48
+
49
+ filter {
50
+ name = "name"
51
+ values = [local.ami_baker_name]
52
+ }
53
+ filter {
54
+ name = "state"
55
+ values = ["available"]
56
+ }
57
+
58
+ sort_ascending = false
59
+ }
60
+
42
61
  # Build the baked AMI when inputs change
43
62
  resource "null_resource" "ami_baker" {
44
63
  # Only run when the target AMI doesn't exist yet
@@ -48,7 +48,7 @@ resource "aws_lambda_function" "availability_updater" {
48
48
  EKS_CLUSTER_NAME = aws_eks_cluster.gpu_dev_cluster.name
49
49
  REGION = local.current_config.aws_region
50
50
  SPOT_GPU_TYPES = lookup({
51
- "prod-east1" = "b300,b200,h200,h100,a100"
51
+ "prod-east1" = "b300,b200,h200,h100,a100,t4,l4,rtxpro6000,cpu-spot"
52
52
  }, terraform.workspace, "")
53
53
  ASG_NAME_PREFIX = "${var.prefix}-gpu-nodes"
54
54
  }
@@ -1,6 +1,6 @@
1
1
  # Custom PyTorch GPU Development Server Image
2
- # Based on pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
3
- FROM pytorch/pytorch:2.11.0-cuda12.8-cudnn9-devel
2
+ # Based on pytorch/pytorch:2.12.0-cuda13.2-cudnn9-devel
3
+ FROM pytorch/pytorch:2.12.0-cuda13.2-cudnn9-devel
4
4
 
5
5
  # Set environment variables for non-interactive installation
6
6
  ENV DEBIAN_FRONTEND=noninteractive
@@ -42,22 +42,22 @@ RUN for attempt in 1 2 3; do \
42
42
  RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
43
43
  apt-get install -y nodejs
44
44
 
45
- # Install CUDA 12.9, 13.0, 13.1, 13.2 alongside base CUDA 12.8
45
+ # Install older CUDA toolkits alongside base CUDA 13.2
46
46
  # Base image already has NVIDIA repo configured, no need for cuda-keyring
47
47
  RUN apt-get update && apt-get install -y --no-install-recommends \
48
+ cuda-toolkit-12-8 \
48
49
  cuda-toolkit-12-9 \
49
50
  cuda-toolkit-13-0 \
50
51
  cuda-toolkit-13-1 \
51
- cuda-toolkit-13-2 \
52
52
  && apt-get clean \
53
53
  && rm -rf /var/lib/apt/lists/*
54
54
 
55
- # CUDA 12.8 is the default (PyTorch compiled against it)
55
+ # CUDA 13.2 is the default (PyTorch 2.12 compiled against it)
56
56
  # All versions available at /usr/local/cuda-{12.8,12.9,13.0,13.1,13.2}/
57
- # Switch with: export CUDA_HOME=/usr/local/cuda-13.2
58
- ENV CUDA_HOME=/usr/local/cuda-12.8
59
- ENV PATH=/usr/local/cuda-12.8/bin:${PATH}
60
- ENV LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:${LD_LIBRARY_PATH}
57
+ # Switch with: export CUDA_HOME=/usr/local/cuda-12.8
58
+ ENV CUDA_HOME=/usr/local/cuda-13.2
59
+ ENV PATH=/usr/local/cuda-13.2/bin:${PATH}
60
+ ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/lib64:${LD_LIBRARY_PATH}
61
61
 
62
62
  # Install EFA stack (prebuilt libfabric + OpenMPI + aws-ofi-nccl with GPU/RDMA support)
63
63
  # Uses AWS EFA installer which bundles tested, compatible versions of all components