gpu-dev 0.5.25__tar.gz → 0.5.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
  4. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +19 -2
  5. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +5 -0
  6. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +1 -0
  7. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/pyproject.toml +1 -1
  8. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/eks.tf +7 -3
  9. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/index.py +4 -1
  10. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda.tf +1 -1
  11. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/main.tf +141 -0
  12. gpu_dev-0.5.27/terraform-gpu-devservers/node-termination-handler.tf +37 -0
  13. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/route53.tf +13 -0
  14. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/.github/workflows/no-gitlinks.yml +0 -0
  15. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/.github/workflows/publish.yml +0 -0
  16. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/.gitignore +0 -0
  17. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/CLAUDE.md +0 -0
  18. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/PROGRESS.md +0 -0
  19. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/PR_DESCRIPTION.md +0 -0
  20. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/README.md +0 -0
  21. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/TODO.md +0 -0
  22. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/admin/README.md +0 -0
  23. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/admin/generate_stats.py +0 -0
  24. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/admin/requirements.txt +0 -0
  25. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/README.md +0 -0
  26. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  27. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  28. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  29. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  30. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  31. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  32. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  33. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  34. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  35. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  36. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  37. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  38. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  39. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/docs/USER_GUIDE.md +0 -0
  40. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/docs/devgpu-features.html +0 -0
  41. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/docs/docker-mark-blue.svg +0 -0
  42. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/docs/icons8-cursor-ai.svg +0 -0
  43. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/post.md +0 -0
  44. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/setup.cfg +0 -0
  45. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  46. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  47. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/README.md +0 -0
  48. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/alb.tf +0 -0
  49. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/availability.tf +0 -0
  50. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/backend.tf +0 -0
  51. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  52. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  53. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  54. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bash_profile +0 -0
  55. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bashrc +0 -0
  56. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  57. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  58. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  59. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  60. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/motd_script +0 -0
  61. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  62. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/profile +0 -0
  63. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  64. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  65. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  66. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/shell_env +0 -0
  67. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/ssh_config +0 -0
  68. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zprofile +0 -0
  69. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zshrc +0 -0
  70. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  71. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-build.tf +0 -0
  72. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  73. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  74. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ecr.tf +0 -0
  75. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/efs.tf +0 -0
  76. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/expiry.tf +0 -0
  77. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/git-cache.tf +0 -0
  78. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  79. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/kubernetes.tf +0 -0
  80. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  81. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  82. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  83. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  84. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  85. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  86. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  87. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  88. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  89. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  90. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  91. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  92. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  93. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  94. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/mig-config.tf +0 -0
  95. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  96. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  97. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  98. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  99. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  100. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  101. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/monitoring.tf +0 -0
  102. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/outputs.tf +0 -0
  103. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/pyproject.toml +0 -0
  104. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/queue.tf +0 -0
  105. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  106. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  107. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  108. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  109. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  110. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  111. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  112. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  113. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  114. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  115. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/switch-to.sh +0 -0
  116. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  117. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  118. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  119. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  120. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/terraform-gpu-devservers/variables.tf +0 -0
  121. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/tests/submit/README.md +0 -0
  122. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/tests/submit/fail/run.sh +0 -0
  123. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/tests/submit/multinode/run.sh +0 -0
  124. {gpu_dev-0.5.25 → gpu_dev-0.5.27}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.25
3
+ Version: 0.5.27
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.25
3
+ Version: 0.5.27
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -52,6 +52,7 @@ terraform-gpu-devservers/main.tf
52
52
  terraform-gpu-devservers/mig-config.tf
53
53
  terraform-gpu-devservers/mig-parted-config.yaml
54
54
  terraform-gpu-devservers/monitoring.tf
55
+ terraform-gpu-devservers/node-termination-handler.tf
55
56
  terraform-gpu-devservers/outputs.tf
56
57
  terraform-gpu-devservers/pyproject.toml
57
58
  terraform-gpu-devservers/queue.tf
@@ -496,7 +496,7 @@ def main(ctx: click.Context) -> None:
496
496
  "--gpu-type",
497
497
  "-t",
498
498
  type=click.Choice(
499
- ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
499
+ ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
500
500
  ),
501
501
  help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
502
502
  )
@@ -662,6 +662,7 @@ def reserve(
662
662
  "b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
663
663
  "h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
664
664
  "b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
665
+ "b300": {"max_gpus": 8, "instance_type": "p6e-b300.48xlarge"},
665
666
  "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
666
667
  "cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
667
668
  }
@@ -1350,7 +1351,7 @@ def reserve(
1350
1351
  rprint(f"[red]❌ Error: {str(e)}[/red]")
1351
1352
 
1352
1353
 
1353
- _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
1354
+ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
1354
1355
  "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
1355
1356
  "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
1356
1357
 
@@ -2719,6 +2720,7 @@ def _show_availability() -> None:
2719
2720
  # GPU architecture mapping (for display)
2720
2721
  gpu_architectures = {
2721
2722
  "b200": "Blackwell (sm100)",
2723
+ "b300": "Blackwell (sm100)",
2722
2724
  "h200": "Hopper (sm90)",
2723
2725
  "h100": "Hopper (sm90)",
2724
2726
  "a100": "Ampere (sm80)",
@@ -2880,6 +2882,7 @@ def _show_availability_watch(interval: int) -> None:
2880
2882
  # GPU architecture mapping (for display)
2881
2883
  gpu_architectures = {
2882
2884
  "b200": "Blackwell (sm100)",
2885
+ "b300": "Blackwell (sm100)",
2883
2886
  "h200": "Hopper (sm90)",
2884
2887
  "h100": "Hopper (sm90)",
2885
2888
  "a100": "Ampere (sm80)",
@@ -3219,6 +3222,20 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3219
3222
  if "-A" not in ssh_command and "-o ForwardAgent=yes" not in ssh_command:
3220
3223
  ssh_command = ssh_command.replace("ssh ", "ssh -A ", 1)
3221
3224
 
3225
+ # Inject AddKeysToAgent so the first connect from this laptop loads the user's
3226
+ # IdentityFile into ssh-agent — without this the forwarded agent is empty on
3227
+ # subsequent pod→pod hops. UseKeychain persists the passphrase across reboots on
3228
+ # macOS; IgnoreUnknown lets Linux SSH ignore the macOS-only option cleanly.
3229
+ # The same options live in ~/.gpu-dev/<id>-sshconfig but ssh only honours them
3230
+ # when the command-line target matches a Host block, which this connect command
3231
+ # bypasses by passing the FQDN directly.
3232
+ if "AddKeysToAgent" not in ssh_command:
3233
+ ssh_command = ssh_command.replace(
3234
+ "ssh ",
3235
+ "ssh -o AddKeysToAgent=yes -o IgnoreUnknown=UseKeychain -o UseKeychain=yes ",
3236
+ 1,
3237
+ )
3238
+
3222
3239
  # When running from inside a gpu-dev pod (=GPU_DEV_USER_ID env var set) and the
3223
3240
  # forwarded SSH agent is reachable but empty, the next hop is going to fail with
3224
3241
  # 'Permission denied (publickey)'. Warn upfront so the user knows to ssh-add on
@@ -22,6 +22,11 @@ class Config:
22
22
  "workspace": "prod",
23
23
  "description": "Production environment",
24
24
  },
25
+ "prod-east1": {
26
+ "region": "us-east-1",
27
+ "workspace": "prod-east1",
28
+ "description": "Spot-only us-east-1 environment (T4/L4/CPU)",
29
+ },
25
30
  }
26
31
  DEFAULT_ENVIRONMENT = "prod"
27
32
 
@@ -557,6 +557,7 @@ class ReservationManager:
557
557
  "b200-mig-3g": {"max_gpus": 2},
558
558
  "h200": {"max_gpus": 8},
559
559
  "b200": {"max_gpus": 8},
560
+ "b300": {"max_gpus": 8},
560
561
  }
561
562
 
562
563
  max_gpus_per_node = gpu_configs[gpu_type]["max_gpus"]
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.25"
7
+ version = "0.5.27"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -189,6 +189,7 @@ locals {
189
189
  "h100" = "h100"
190
190
  "h200" = "h200"
191
191
  "b200" = "b200"
192
+ "b300" = "b300"
192
193
  "a100" = "a100"
193
194
  "cpu-arm" = "cpu-arm"
194
195
  "cpu-x86" = "cpu-x86"
@@ -401,11 +402,14 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
401
402
  }
402
403
  }
403
404
 
404
- # Conditionally add instance_market_options for capacity block instances (only when capacity reservation exists)
405
+ # instance_market_options: capacity-block when bound to a reservation, spot when
406
+ # the workspace's gpu_config has use_spot=true, otherwise on-demand (no block).
407
+ # Spot is mutually exclusive with capacity reservations — AWS rejects launch templates
408
+ # carrying both, so the precedence here is CR > spot > on-demand.
405
409
  dynamic "instance_market_options" {
406
- for_each = each.value.capacity_reservation_id != null ? [1] : []
410
+ for_each = (each.value.capacity_reservation_id != null || try(each.value.gpu_config.use_spot, false)) ? [1] : []
407
411
  content {
408
- market_type = "capacity-block"
412
+ market_type = each.value.capacity_reservation_id != null ? "capacity-block" : "spot"
409
413
  }
410
414
  }
411
415
 
@@ -81,6 +81,7 @@ GPU_CONFIG = {
81
81
  "h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
82
82
  "h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
83
83
  "b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
84
+ "b300": {"instance_type": "p6e-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
84
85
  "cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
85
86
  "cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
86
87
  }
@@ -2188,7 +2189,7 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
2188
2189
  # Validate GPU type
2189
2190
  valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
2190
2191
  "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
2191
- "h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
2192
+ "h200", "b200", "b300", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
2192
2193
  "cpu-arm", "cpu-x86"]
2193
2194
  if gpu_type not in valid_gpu_types:
2194
2195
  error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
@@ -2435,6 +2436,7 @@ def update_gpu_availability_table(
2435
2436
  "b200-mig-3g": {"gpus_per_instance": 2},
2436
2437
  "h200": {"gpus_per_instance": 8},
2437
2438
  "b200": {"gpus_per_instance": 8},
2439
+ "b300": {"gpus_per_instance": 8},
2438
2440
  }
2439
2441
 
2440
2442
  gpu_config = gpu_type_configs.get(gpu_type, {"gpus_per_instance": 8})
@@ -6529,6 +6531,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
6529
6531
  "p5e.48xlarge": "H200",
6530
6532
  "p5en.48xlarge": "H200",
6531
6533
  "p6-b200.48xlarge": "B200",
6534
+ "p6e-b300.48xlarge": "B300",
6532
6535
  }
6533
6536
 
6534
6537
  gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.25"
183
+ LAMBDA_VERSION = "0.5.27"
184
184
  MIN_CLI_VERSION = "0.5.16"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
@@ -58,6 +58,13 @@ provider "helm" {
58
58
  # Data sources
59
59
  data "aws_availability_zones" "available" {
60
60
  state = "available"
61
+ # Exclude Local Zones (e.g. us-east-1-dfw-2a) and Wavelength Zones — EKS control
62
+ # plane only supports standard AZs. us-east-2 doesn't have Local Zones so the
63
+ # existing prod workspace was unaffected; us-east-1 has several (dfw, bos, …).
64
+ filter {
65
+ name = "opt-in-status"
66
+ values = ["opt-in-not-required"]
67
+ }
61
68
  }
62
69
 
63
70
  data "aws_caller_identity" "current" {}
@@ -315,6 +322,104 @@ locals {
315
322
  }
316
323
  }
317
324
  }
325
+ # us-east-1 spot-only experimental cluster.
326
+ # Same provisioning shape as prod (managed via the terraform.workspace switch) but
327
+ # backed entirely by EC2 Spot — first cheap-and-cheerful environment we can deploy
328
+ # new instance types into (B300 lands here once on-demand quota arrives).
329
+ "prod-east1" = {
330
+ aws_region = "us-east-1"
331
+ environment = "prod-east1"
332
+ domain_name = "east1.devservers.io"
333
+ gpu_instance_count = 1
334
+ use_self_managed_nodes = true
335
+ instance_type = "g4dn.12xlarge"
336
+ supported_gpu_types = {
337
+ # 8-GPU spot instances. instance_count=1 means the ASG tries to maintain 1
338
+ # spot instance per type — if AWS can't grant it (capacity / quota), the ASG
339
+ # sits at 0 and gpu-dev reservations queue. Bump counts once we see what
340
+ # actually gets fulfilled in us-east-1.
341
+ "b300" = {
342
+ instance_type = "p6e-b300.48xlarge"
343
+ instance_types = null
344
+ instance_count = 1
345
+ gpus_per_instance = 8
346
+ use_placement_group = false
347
+ architecture = "x86_64"
348
+ efa_network_cards = 8
349
+ use_spot = true
350
+ }
351
+ "b200" = {
352
+ instance_type = "p6-b200.48xlarge"
353
+ instance_types = null
354
+ instance_count = 1
355
+ gpus_per_instance = 8
356
+ use_placement_group = false
357
+ architecture = "x86_64"
358
+ efa_network_cards = 8
359
+ use_spot = true
360
+ }
361
+ "h200" = {
362
+ instance_type = "p5e.48xlarge"
363
+ instance_types = null
364
+ instance_count = 1
365
+ gpus_per_instance = 8
366
+ use_placement_group = false
367
+ architecture = "x86_64"
368
+ efa_network_cards = 16
369
+ use_spot = true
370
+ }
371
+ "h100" = {
372
+ instance_type = "p5.48xlarge"
373
+ instance_types = null
374
+ instance_count = 1
375
+ gpus_per_instance = 8
376
+ use_placement_group = false
377
+ architecture = "x86_64"
378
+ efa_network_cards = 32
379
+ use_spot = true
380
+ }
381
+ "a100" = {
382
+ instance_type = "p4d.24xlarge"
383
+ instance_types = null
384
+ instance_count = 1
385
+ gpus_per_instance = 8
386
+ use_placement_group = false
387
+ architecture = "x86_64"
388
+ efa_network_cards = 4
389
+ use_spot = true
390
+ }
391
+ "t4" = {
392
+ instance_type = "g4dn.12xlarge"
393
+ instance_types = null
394
+ instance_count = 1
395
+ gpus_per_instance = 4
396
+ use_placement_group = false
397
+ architecture = "x86_64"
398
+ efa_network_cards = 0
399
+ use_spot = true
400
+ }
401
+ "l4" = {
402
+ instance_type = "g6.12xlarge"
403
+ instance_types = null
404
+ instance_count = 1
405
+ gpus_per_instance = 4
406
+ use_placement_group = false
407
+ architecture = "x86_64"
408
+ efa_network_cards = 1
409
+ use_spot = true
410
+ }
411
+ "cpu-x86" = {
412
+ instance_type = "c7i.8xlarge"
413
+ instance_types = null
414
+ instance_count = 5
415
+ gpus_per_instance = 0
416
+ use_placement_group = false
417
+ architecture = "x86_64"
418
+ efa_network_cards = 0
419
+ use_spot = true
420
+ }
421
+ }
422
+ }
318
423
  }
319
424
 
320
425
  # Current workspace configuration
@@ -322,6 +427,9 @@ locals {
322
427
 
323
428
  # Workspace-specific capacity reservations (with manual instance counts)
324
429
  capacity_reservations = {
430
+ "prod-east1" = {
431
+ # No capacity reservations — this workspace is spot-only.
432
+ }
325
433
  default = {
326
434
  # Test environment capacity reservations
327
435
  # h100 = [
@@ -366,6 +474,20 @@ locals {
366
474
 
367
475
  # Workspace-specific GPU type to subnet mappings
368
476
  gpu_subnet_assignments = {
477
+ "prod-east1" = {
478
+ # All node types land in the primary subnet (us-east-1a). Multi-EFA types
479
+ # (efa_network_cards > 1) automatically use the private subnet in the same AZ.
480
+ # Specific instance types may not have capacity in us-east-1a — those ASGs will
481
+ # sit at 0 until we widen to other AZs; that's expected for beta.
482
+ b300 = "primary"
483
+ b200 = "primary"
484
+ h200 = "primary"
485
+ h100 = "primary"
486
+ a100 = "primary"
487
+ t4 = "primary"
488
+ l4 = "primary"
489
+ "cpu-x86" = "primary"
490
+ }
369
491
  default = {
370
492
  # Test environment - T4 nodes in multiple AZs for testing
371
493
  t4 = "primary" # T4 in us-west-1a (primary AZ)
@@ -390,8 +512,27 @@ locals {
390
512
  }
391
513
  }
392
514
 
515
+ # Subdomain NS delegations to create in *this* workspace's parent zone. Lets
516
+ # prod (which owns devservers.io) auto-publish NS records pointing at child zones
517
+ # in other workspaces (prod-east1, future regions) without manual -var flags.
518
+ # The NS values come from `tofu output devservers_name_servers` in the child
519
+ # workspace once its hosted zone has been created.
520
+ prod_subdomain_delegations = {
521
+ prod = {
522
+ "east1.devservers.io" = [
523
+ "ns-1079.awsdns-06.org",
524
+ "ns-1999.awsdns-57.co.uk",
525
+ "ns-341.awsdns-42.com",
526
+ "ns-624.awsdns-14.net",
527
+ ]
528
+ }
529
+ }
530
+
393
531
  # Per-capacity-reservation AZ mappings (overrides gpu_subnet_assignments when CR is used)
394
532
  capacity_reservation_azs = {
533
+ "prod-east1" = {
534
+ # Empty — no CRs in this workspace.
535
+ }
395
536
  default = {
396
537
  "cr-04d3d1d84e127a562" = "secondary" # us-west-1c
397
538
  }
@@ -0,0 +1,37 @@
1
+ # AWS Node Termination Handler — graceful drain on spot-interrupt + ASG lifecycle events.
2
+ #
3
+ # IMDS mode (one DaemonSet per node, no SQS / no IAM role) is plenty for our use case:
4
+ # we don't care about queue-processor features (rebalance recommendations, scheduled
5
+ # events). We just want pods to get a clean SIGTERM when AWS sends the 2-minute spot
6
+ # notice via instance metadata, instead of being killed cold.
7
+ #
8
+ # Tolerates everything so it runs on the GPU nodes that have nvidia.com/gpu:NoSchedule.
9
+
10
+ resource "helm_release" "aws_node_termination_handler" {
11
+ name = "aws-node-termination-handler"
12
+ repository = "https://aws.github.io/eks-charts"
13
+ chart = "aws-node-termination-handler"
14
+ namespace = "kube-system"
15
+ # No version pin — chart versions advance frequently and my first guess (0.27.1)
16
+ # didn't exist. helm picks current latest stable. Add a pin once we hit a regression.
17
+ cleanup_on_fail = true
18
+
19
+ values = [yamlencode({
20
+ enableSpotInterruptionDraining = true
21
+ enableScheduledEventDraining = true
22
+ enableRebalanceMonitoring = true
23
+ enableRebalanceDraining = false # warning only; rebalance recommendations are too noisy
24
+ nodeSelector = {
25
+ "kubernetes.io/os" = "linux"
26
+ }
27
+ tolerations = [
28
+ { operator = "Exists" }, # tolerate every taint; we want NTH on every node, including GPU nodes
29
+ ]
30
+ resources = {
31
+ requests = { cpu = "50m", memory = "64Mi" }
32
+ limits = { cpu = "100m", memory = "128Mi" }
33
+ }
34
+ })]
35
+
36
+ depends_on = [aws_eks_cluster.gpu_dev_cluster]
37
+ }
@@ -51,6 +51,19 @@ resource "aws_route53_record" "manual_subdomain_delegation" {
51
51
  records = var.subdomain_ns_records
52
52
  }
53
53
 
54
+ # Auto-published NS delegations for child workspaces. Iterates prod_subdomain_delegations
55
+ # (defined in main.tf) for the current workspace and creates an NS record per entry in
56
+ # the parent zone — so `tofu apply` in prod automatically wires up east1.devservers.io
57
+ # (and any future region) without -var flags.
58
+ resource "aws_route53_record" "workspace_subdomain_delegations" {
59
+ for_each = local.effective_domain_name != "" && !local.is_subdomain ? try(local.prod_subdomain_delegations[terraform.workspace], {}) : {}
60
+ zone_id = data.aws_route53_zone.parent[0].zone_id
61
+ name = each.key
62
+ type = "NS"
63
+ ttl = 300
64
+ records = each.value
65
+ }
66
+
54
67
  # Use appropriate hosted zone (subdomain if created, otherwise parent)
55
68
  locals {
56
69
  hosted_zone_id = local.is_subdomain ? aws_route53_zone.subdomain[0].zone_id : (local.effective_domain_name != "" ? data.aws_route53_zone.parent[0].zone_id : "")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes