gpu-dev 0.5.26__tar.gz → 0.5.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +5 -2
  4. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +1 -0
  5. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/pyproject.toml +1 -1
  6. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/eks.tf +1 -0
  7. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/index.py +4 -1
  8. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda.tf +1 -1
  9. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/main.tf +79 -2
  10. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/node-termination-handler.tf +2 -1
  11. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/route53.tf +13 -0
  12. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/.github/workflows/no-gitlinks.yml +0 -0
  13. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/.github/workflows/publish.yml +0 -0
  14. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/.gitignore +0 -0
  15. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/CLAUDE.md +0 -0
  16. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/PROGRESS.md +0 -0
  17. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/PR_DESCRIPTION.md +0 -0
  18. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/README.md +0 -0
  19. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/TODO.md +0 -0
  20. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/admin/README.md +0 -0
  21. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/admin/generate_stats.py +0 -0
  22. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/admin/requirements.txt +0 -0
  23. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/README.md +0 -0
  24. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  25. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  26. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  27. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  28. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  29. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  30. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  31. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  32. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  33. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  34. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  35. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  36. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  37. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  38. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  39. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/docs/USER_GUIDE.md +0 -0
  40. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/docs/devgpu-features.html +0 -0
  41. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/docs/docker-mark-blue.svg +0 -0
  42. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/docs/icons8-cursor-ai.svg +0 -0
  43. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/post.md +0 -0
  44. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/setup.cfg +0 -0
  45. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  46. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  47. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/README.md +0 -0
  48. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/alb.tf +0 -0
  49. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/availability.tf +0 -0
  50. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/backend.tf +0 -0
  51. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  52. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  53. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  54. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bash_profile +0 -0
  55. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bashrc +0 -0
  56. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  57. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  58. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  59. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  60. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/motd_script +0 -0
  61. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  62. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/profile +0 -0
  63. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  64. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  65. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  66. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/shell_env +0 -0
  67. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/ssh_config +0 -0
  68. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zprofile +0 -0
  69. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zshrc +0 -0
  70. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  71. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-build.tf +0 -0
  72. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  73. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  74. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ecr.tf +0 -0
  75. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/efs.tf +0 -0
  76. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/expiry.tf +0 -0
  77. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/git-cache.tf +0 -0
  78. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  79. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/kubernetes.tf +0 -0
  80. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  81. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  82. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  83. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  84. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  85. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  86. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  87. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  88. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  89. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  90. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  91. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  92. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  93. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  94. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/mig-config.tf +0 -0
  95. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  96. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  97. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  98. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  99. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  100. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  101. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/monitoring.tf +0 -0
  102. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/outputs.tf +0 -0
  103. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/pyproject.toml +0 -0
  104. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/queue.tf +0 -0
  105. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  106. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  107. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  108. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  109. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  110. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  111. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  112. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  113. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  114. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  115. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/switch-to.sh +0 -0
  116. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  117. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  118. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  119. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  120. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/terraform-gpu-devservers/variables.tf +0 -0
  121. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/tests/submit/README.md +0 -0
  122. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/tests/submit/fail/run.sh +0 -0
  123. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/tests/submit/multinode/run.sh +0 -0
  124. {gpu_dev-0.5.26 → gpu_dev-0.5.27}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.26
3
+ Version: 0.5.27
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.26
3
+ Version: 0.5.27
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -496,7 +496,7 @@ def main(ctx: click.Context) -> None:
496
496
  "--gpu-type",
497
497
  "-t",
498
498
  type=click.Choice(
499
- ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
499
+ ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
500
500
  ),
501
501
  help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
502
502
  )
@@ -662,6 +662,7 @@ def reserve(
662
662
  "b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
663
663
  "h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
664
664
  "b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
665
+ "b300": {"max_gpus": 8, "instance_type": "p6e-b300.48xlarge"},
665
666
  "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
666
667
  "cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
667
668
  }
@@ -1350,7 +1351,7 @@ def reserve(
1350
1351
  rprint(f"[red]❌ Error: {str(e)}[/red]")
1351
1352
 
1352
1353
 
1353
- _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
1354
+ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
1354
1355
  "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
1355
1356
  "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
1356
1357
 
@@ -2719,6 +2720,7 @@ def _show_availability() -> None:
2719
2720
  # GPU architecture mapping (for display)
2720
2721
  gpu_architectures = {
2721
2722
  "b200": "Blackwell (sm100)",
2723
+ "b300": "Blackwell (sm100)",
2722
2724
  "h200": "Hopper (sm90)",
2723
2725
  "h100": "Hopper (sm90)",
2724
2726
  "a100": "Ampere (sm80)",
@@ -2880,6 +2882,7 @@ def _show_availability_watch(interval: int) -> None:
2880
2882
  # GPU architecture mapping (for display)
2881
2883
  gpu_architectures = {
2882
2884
  "b200": "Blackwell (sm100)",
2885
+ "b300": "Blackwell (sm100)",
2883
2886
  "h200": "Hopper (sm90)",
2884
2887
  "h100": "Hopper (sm90)",
2885
2888
  "a100": "Ampere (sm80)",
@@ -557,6 +557,7 @@ class ReservationManager:
557
557
  "b200-mig-3g": {"max_gpus": 2},
558
558
  "h200": {"max_gpus": 8},
559
559
  "b200": {"max_gpus": 8},
560
+ "b300": {"max_gpus": 8},
560
561
  }
561
562
 
562
563
  max_gpus_per_node = gpu_configs[gpu_type]["max_gpus"]
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.26"
7
+ version = "0.5.27"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -189,6 +189,7 @@ locals {
189
189
  "h100" = "h100"
190
190
  "h200" = "h200"
191
191
  "b200" = "b200"
192
+ "b300" = "b300"
192
193
  "a100" = "a100"
193
194
  "cpu-arm" = "cpu-arm"
194
195
  "cpu-x86" = "cpu-x86"
@@ -81,6 +81,7 @@ GPU_CONFIG = {
81
81
  "h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
82
82
  "h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
83
83
  "b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
84
+ "b300": {"instance_type": "p6e-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
84
85
  "cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
85
86
  "cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
86
87
  }
@@ -2188,7 +2189,7 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
2188
2189
  # Validate GPU type
2189
2190
  valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
2190
2191
  "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
2191
- "h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
2192
+ "h200", "b200", "b300", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
2192
2193
  "cpu-arm", "cpu-x86"]
2193
2194
  if gpu_type not in valid_gpu_types:
2194
2195
  error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
@@ -2435,6 +2436,7 @@ def update_gpu_availability_table(
2435
2436
  "b200-mig-3g": {"gpus_per_instance": 2},
2436
2437
  "h200": {"gpus_per_instance": 8},
2437
2438
  "b200": {"gpus_per_instance": 8},
2439
+ "b300": {"gpus_per_instance": 8},
2438
2440
  }
2439
2441
 
2440
2442
  gpu_config = gpu_type_configs.get(gpu_type, {"gpus_per_instance": 8})
@@ -6529,6 +6531,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
6529
6531
  "p5e.48xlarge": "H200",
6530
6532
  "p5en.48xlarge": "H200",
6531
6533
  "p6-b200.48xlarge": "B200",
6534
+ "p6e-b300.48xlarge": "B300",
6532
6535
  }
6533
6536
 
6534
6537
  gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.25"
183
+ LAMBDA_VERSION = "0.5.27"
184
184
  MIN_CLI_VERSION = "0.5.16"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
@@ -334,6 +334,60 @@ locals {
334
334
  use_self_managed_nodes = true
335
335
  instance_type = "g4dn.12xlarge"
336
336
  supported_gpu_types = {
337
+ # 8-GPU spot instances. instance_count=1 means the ASG tries to maintain 1
338
+ # spot instance per type — if AWS can't grant it (capacity / quota), the ASG
339
+ # sits at 0 and gpu-dev reservations queue. Bump counts once we see what
340
+ # actually gets fulfilled in us-east-1.
341
+ "b300" = {
342
+ instance_type = "p6e-b300.48xlarge"
343
+ instance_types = null
344
+ instance_count = 1
345
+ gpus_per_instance = 8
346
+ use_placement_group = false
347
+ architecture = "x86_64"
348
+ efa_network_cards = 8
349
+ use_spot = true
350
+ }
351
+ "b200" = {
352
+ instance_type = "p6-b200.48xlarge"
353
+ instance_types = null
354
+ instance_count = 1
355
+ gpus_per_instance = 8
356
+ use_placement_group = false
357
+ architecture = "x86_64"
358
+ efa_network_cards = 8
359
+ use_spot = true
360
+ }
361
+ "h200" = {
362
+ instance_type = "p5e.48xlarge"
363
+ instance_types = null
364
+ instance_count = 1
365
+ gpus_per_instance = 8
366
+ use_placement_group = false
367
+ architecture = "x86_64"
368
+ efa_network_cards = 16
369
+ use_spot = true
370
+ }
371
+ "h100" = {
372
+ instance_type = "p5.48xlarge"
373
+ instance_types = null
374
+ instance_count = 1
375
+ gpus_per_instance = 8
376
+ use_placement_group = false
377
+ architecture = "x86_64"
378
+ efa_network_cards = 32
379
+ use_spot = true
380
+ }
381
+ "a100" = {
382
+ instance_type = "p4d.24xlarge"
383
+ instance_types = null
384
+ instance_count = 1
385
+ gpus_per_instance = 8
386
+ use_placement_group = false
387
+ architecture = "x86_64"
388
+ efa_network_cards = 4
389
+ use_spot = true
390
+ }
337
391
  "t4" = {
338
392
  instance_type = "g4dn.12xlarge"
339
393
  instance_types = null
@@ -421,8 +475,15 @@ locals {
421
475
  # Workspace-specific GPU type to subnet mappings
422
476
  gpu_subnet_assignments = {
423
477
  "prod-east1" = {
424
- # All node types land in the primary subnet (us-east-1a). Spot availability is
425
- # better than placement-group-strictness on these small ASGs.
478
+ # All node types land in the primary subnet (us-east-1a). Multi-EFA types
479
+ # (efa_network_cards > 1) automatically use the private subnet in the same AZ.
480
+ # Specific instance types may not have capacity in us-east-1a — those ASGs will
481
+ # sit at 0 until we widen to other AZs; that's expected for beta.
482
+ b300 = "primary"
483
+ b200 = "primary"
484
+ h200 = "primary"
485
+ h100 = "primary"
486
+ a100 = "primary"
426
487
  t4 = "primary"
427
488
  l4 = "primary"
428
489
  "cpu-x86" = "primary"
@@ -451,6 +512,22 @@ locals {
451
512
  }
452
513
  }
453
514
 
515
+ # Subdomain NS delegations to create in *this* workspace's parent zone. Lets
516
+ # prod (which owns devservers.io) auto-publish NS records pointing at child zones
517
+ # in other workspaces (prod-east1, future regions) without manual -var flags.
518
+ # The NS values come from `tofu output devservers_name_servers` in the child
519
+ # workspace once its hosted zone has been created.
520
+ prod_subdomain_delegations = {
521
+ prod = {
522
+ "east1.devservers.io" = [
523
+ "ns-1079.awsdns-06.org",
524
+ "ns-1999.awsdns-57.co.uk",
525
+ "ns-341.awsdns-42.com",
526
+ "ns-624.awsdns-14.net",
527
+ ]
528
+ }
529
+ }
530
+
454
531
  # Per-capacity-reservation AZ mappings (overrides gpu_subnet_assignments when CR is used)
455
532
  capacity_reservation_azs = {
456
533
  "prod-east1" = {
@@ -12,7 +12,8 @@ resource "helm_release" "aws_node_termination_handler" {
12
12
  repository = "https://aws.github.io/eks-charts"
13
13
  chart = "aws-node-termination-handler"
14
14
  namespace = "kube-system"
15
- version = "0.27.1"
15
+ # No version pin: the previously pinned chart version (0.27.1) did not exist in the
16
+ # chart repository. Helm installs the latest stable release; add a pin once we hit a regression.
16
17
  cleanup_on_fail = true
17
18
 
18
19
  values = [yamlencode({
@@ -51,6 +51,19 @@ resource "aws_route53_record" "manual_subdomain_delegation" {
51
51
  records = var.subdomain_ns_records
52
52
  }
53
53
 
54
+ # Auto-published NS delegations for child workspaces. Iterates prod_subdomain_delegations
55
+ # (defined in main.tf) for the current workspace and creates an NS record per entry in
56
+ # the parent zone — so `tofu apply` in prod automatically wires up east1.devservers.io
57
+ # (and any future region) without -var flags.
58
+ resource "aws_route53_record" "workspace_subdomain_delegations" {
59
+ for_each = local.effective_domain_name != "" && !local.is_subdomain ? try(local.prod_subdomain_delegations[terraform.workspace], {}) : {}
60
+ zone_id = data.aws_route53_zone.parent[0].zone_id
61
+ name = each.key
62
+ type = "NS"
63
+ ttl = 300
64
+ records = each.value
65
+ }
66
+
54
67
  # Use appropriate hosted zone (subdomain if created, otherwise parent)
55
68
  locals {
56
69
  hosted_zone_id = local.is_subdomain ? aws_route53_zone.subdomain[0].zone_id : (local.effective_domain_name != "" ? data.aws_route53_zone.parent[0].zone_id : "")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes