gpu-dev 0.5.26__tar.gz → 0.5.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +14 -5
  4. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +7 -0
  5. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/pyproject.toml +1 -1
  6. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/eks.tf +1 -0
  7. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_processor/index.py +18 -1
  8. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda.tf +6 -1
  9. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/main.tf +79 -2
  10. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/node-termination-handler.tf +2 -1
  11. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/route53.tf +13 -0
  12. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/.github/workflows/no-gitlinks.yml +0 -0
  13. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/.github/workflows/publish.yml +0 -0
  14. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/.gitignore +0 -0
  15. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/CLAUDE.md +0 -0
  16. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/PROGRESS.md +0 -0
  17. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/PR_DESCRIPTION.md +0 -0
  18. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/README.md +0 -0
  19. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/TODO.md +0 -0
  20. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/admin/README.md +0 -0
  21. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/admin/generate_stats.py +0 -0
  22. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/admin/requirements.txt +0 -0
  23. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/README.md +0 -0
  24. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  25. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  26. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  27. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  28. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  29. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  30. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  31. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  32. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  33. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  34. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  35. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  36. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  37. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  38. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  39. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/docs/USER_GUIDE.md +0 -0
  40. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/docs/devgpu-features.html +0 -0
  41. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/docs/docker-mark-blue.svg +0 -0
  42. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/docs/icons8-cursor-ai.svg +0 -0
  43. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/post.md +0 -0
  44. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/setup.cfg +0 -0
  45. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  46. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  47. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/README.md +0 -0
  48. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/alb.tf +0 -0
  49. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/availability.tf +0 -0
  50. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/backend.tf +0 -0
  51. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  52. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  53. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  54. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/bash_profile +0 -0
  55. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/bashrc +0 -0
  56. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  57. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  58. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  59. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  60. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/motd_script +0 -0
  61. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  62. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/profile +0 -0
  63. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  64. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  65. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  66. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/shell_env +0 -0
  67. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/ssh_config +0 -0
  68. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/zprofile +0 -0
  69. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/zshrc +0 -0
  70. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  71. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker-build.tf +0 -0
  72. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  73. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  74. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ecr.tf +0 -0
  75. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/efs.tf +0 -0
  76. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/expiry.tf +0 -0
  77. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/git-cache.tf +0 -0
  78. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  79. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/kubernetes.tf +0 -0
  80. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  81. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  82. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  83. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  84. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  85. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  86. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  87. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  88. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  89. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  90. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  91. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  92. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  93. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  94. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/mig-config.tf +0 -0
  95. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  96. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  97. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  98. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  99. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  100. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  101. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/monitoring.tf +0 -0
  102. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/outputs.tf +0 -0
  103. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/pyproject.toml +0 -0
  104. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/queue.tf +0 -0
  105. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  106. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  107. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  108. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  109. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  110. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  111. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  112. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  113. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  114. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  115. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/switch-to.sh +0 -0
  116. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  117. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  118. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  119. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  120. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/terraform-gpu-devservers/variables.tf +0 -0
  121. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/tests/submit/README.md +0 -0
  122. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/tests/submit/fail/run.sh +0 -0
  123. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/tests/submit/multinode/run.sh +0 -0
  124. {gpu_dev-0.5.26 → gpu_dev-0.5.28}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.26
3
+ Version: 0.5.28
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.26
3
+ Version: 0.5.28
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -496,7 +496,7 @@ def main(ctx: click.Context) -> None:
496
496
  "--gpu-type",
497
497
  "-t",
498
498
  type=click.Choice(
499
- ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
499
+ ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
500
500
  ),
501
501
  help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
502
502
  )
@@ -576,6 +576,8 @@ def main(ctx: click.Context) -> None:
576
576
  multiple=True,
577
577
  help="Request nodes with specific label (format: key=value). Example: --node-label nsight=true for Nsight profiling nodes",
578
578
  )
579
+ @click.option("--spot", is_flag=True, default=False,
580
+ help="Acknowledge spot instance (~1/3 cost, may be preempted with 2-min notice). Required for spot-only types.")
579
581
  @click.pass_context
580
582
  def reserve(
581
583
  ctx: click.Context,
@@ -662,6 +664,7 @@ def reserve(
662
664
  "b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
663
665
  "h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
664
666
  "b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
667
+ "b300": {"max_gpus": 8, "instance_type": "p6-b300.48xlarge"},
665
668
  "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
666
669
  "cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
667
670
  }
@@ -1270,6 +1273,7 @@ def reserve(
1270
1273
  no_persistent_disk=no_persistent_disk,
1271
1274
  preserve_entrypoint=preserve_entrypoint,
1272
1275
  disk_name=disk,
1276
+ spot=spot,
1273
1277
  node_labels=node_labels if node_labels else None,
1274
1278
  )
1275
1279
  else:
@@ -1288,6 +1292,7 @@ def reserve(
1288
1292
  no_persistent_disk=no_persistent_disk,
1289
1293
  preserve_entrypoint=preserve_entrypoint,
1290
1294
  disk_name=disk,
1295
+ spot=spot,
1291
1296
  node_labels=node_labels if node_labels else None,
1292
1297
  trace=trace,
1293
1298
  )
@@ -1350,7 +1355,7 @@ def reserve(
1350
1355
  rprint(f"[red]❌ Error: {str(e)}[/red]")
1351
1356
 
1352
1357
 
1353
- _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
1358
+ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
1354
1359
  "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
1355
1360
  "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
1356
1361
 
@@ -1361,6 +1366,8 @@ _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200"
1361
1366
  @click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling — job auto-cancels well before this if it finishes.")
1362
1367
  @click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
1363
1368
  @click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
1369
+ @click.option("--spot", is_flag=True, default=False,
1370
+ help="Acknowledge spot instance (~1/3 cost, may be preempted). Required for spot-only types.")
1364
1371
  @click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
1365
1372
  help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
1366
1373
  @click.option("--dockerimage", type=str, default=None,
@@ -1376,7 +1383,7 @@ _SUBMIT_GPU_TYPES = ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200"
1376
1383
  help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
1377
1384
  @click.argument("command", nargs=-1, required=True)
1378
1385
  @click.pass_context
1379
- def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
1386
+ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, spot, dockerfile, dockerimage, preserve_entrypoint,
1380
1387
  runtime, no_pull, keep_alive, name, timeout, command):
1381
1388
  """Submit a job: reserve, sync code, run, sync results back, auto-cancel.
1382
1389
 
@@ -1490,7 +1497,7 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
1490
1497
  user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
1491
1498
  duration_hours=hours, name=name, github_user=user_info["github_user"],
1492
1499
  no_persistent_disk=no_persistent_disk, disk_name=disk_name,
1493
- dockerfile=dockerfile_payload, dockerimage=dockerimage,
1500
+ spot=spot, dockerfile=dockerfile_payload, dockerimage=dockerimage,
1494
1501
  preserve_entrypoint=preserve_entrypoint)
1495
1502
  if not reservation_ids:
1496
1503
  rprint("[red]❌ Failed to create multinode reservation[/red]")
@@ -1501,7 +1508,7 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
1501
1508
  user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
1502
1509
  duration_hours=hours, name=name, github_user=user_info["github_user"],
1503
1510
  no_persistent_disk=no_persistent_disk, disk_name=disk_name,
1504
- dockerfile=dockerfile_payload, dockerimage=dockerimage,
1511
+ spot=spot, dockerfile=dockerfile_payload, dockerimage=dockerimage,
1505
1512
  preserve_entrypoint=preserve_entrypoint)
1506
1513
  if not primary_id:
1507
1514
  rprint("[red]❌ Failed to create reservation[/red]")
@@ -2719,6 +2726,7 @@ def _show_availability() -> None:
2719
2726
  # GPU architecture mapping (for display)
2720
2727
  gpu_architectures = {
2721
2728
  "b200": "Blackwell (sm100)",
2729
+ "b300": "Blackwell (sm100)",
2722
2730
  "h200": "Hopper (sm90)",
2723
2731
  "h100": "Hopper (sm90)",
2724
2732
  "a100": "Ampere (sm80)",
@@ -2880,6 +2888,7 @@ def _show_availability_watch(interval: int) -> None:
2880
2888
  # GPU architecture mapping (for display)
2881
2889
  gpu_architectures = {
2882
2890
  "b200": "Blackwell (sm100)",
2891
+ "b300": "Blackwell (sm100)",
2883
2892
  "h200": "Hopper (sm90)",
2884
2893
  "h100": "Hopper (sm90)",
2885
2894
  "a100": "Ampere (sm80)",
@@ -421,6 +421,7 @@ class ReservationManager:
421
421
  disk_name: Optional[str] = None,
422
422
  node_labels: Optional[Dict[str, str]] = None,
423
423
  trace: bool = False,
424
+ spot: bool = False,
424
425
  ) -> Optional[str]:
425
426
  """Create a new GPU reservation"""
426
427
  try:
@@ -500,6 +501,9 @@ class ReservationManager:
500
501
  if node_labels:
501
502
  message["node_labels"] = node_labels
502
503
 
504
+ if spot:
505
+ message["spot"] = True
506
+
503
507
  # Add trace flag and CLI start timestamp
504
508
  if trace:
505
509
  message["trace"] = True
@@ -536,6 +540,7 @@ class ReservationManager:
536
540
  preserve_entrypoint: bool = False,
537
541
  disk_name: Optional[str] = None,
538
542
  node_labels: Optional[Dict[str, str]] = None,
543
+ spot: bool = False,
539
544
  ) -> Optional[List[str]]:
540
545
  """Create multiple GPU reservations for multinode setup"""
541
546
  try:
@@ -557,6 +562,7 @@ class ReservationManager:
557
562
  "b200-mig-3g": {"max_gpus": 2},
558
563
  "h200": {"max_gpus": 8},
559
564
  "b200": {"max_gpus": 8},
565
+ "b300": {"max_gpus": 8},
560
566
  }
561
567
 
562
568
  max_gpus_per_node = gpu_configs[gpu_type]["max_gpus"]
@@ -601,6 +607,7 @@ class ReservationManager:
601
607
  "recreate_env": recreate_env,
602
608
  "is_multinode": True,
603
609
  "no_persistent_disk": no_persistent_disk,
610
+ "spot": spot,
604
611
  }
605
612
 
606
613
  if github_user:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.26"
7
+ version = "0.5.28"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -189,6 +189,7 @@ locals {
189
189
  "h100" = "h100"
190
190
  "h200" = "h200"
191
191
  "b200" = "b200"
192
+ "b300" = "b300"
192
193
  "a100" = "a100"
193
194
  "cpu-arm" = "cpu-arm"
194
195
  "cpu-x86" = "cpu-x86"
@@ -59,6 +59,7 @@ ECR_REPOSITORY_URL = os.environ.get("ECR_REPOSITORY_URL")
59
59
  # Version validation - injected via Terraform
60
60
  LAMBDA_VERSION = os.environ.get("LAMBDA_VERSION", "0.3.9")
61
61
  MIN_CLI_VERSION = os.environ.get("MIN_CLI_VERSION", "0.3.9")
62
+ SPOT_GPU_TYPES = os.environ.get("SPOT_GPU_TYPES", "")
62
63
  OPERATIONS_TABLE = os.environ.get("OPERATIONS_TABLE", "pytorch-gpu-dev-operations")
63
64
 
64
65
  # GPU Configuration - single source of truth for all GPU type mappings
@@ -81,6 +82,7 @@ GPU_CONFIG = {
81
82
  "h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
82
83
  "h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
83
84
  "b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
85
+ "b300": {"instance_type": "p6-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
84
86
  "cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
85
87
  "cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
86
88
  }
@@ -2188,7 +2190,7 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
2188
2190
  # Validate GPU type
2189
2191
  valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
2190
2192
  "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
2191
- "h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
2193
+ "h200", "b200", "b300", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
2192
2194
  "cpu-arm", "cpu-x86"]
2193
2195
  if gpu_type not in valid_gpu_types:
2194
2196
  error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
@@ -2205,6 +2207,19 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
2205
2207
  logger.warning(f"User {user_id} blocked from {gpu_type}: maintenance mode")
2206
2208
  return False, error_msg
2207
2209
 
2210
+ # Spot acknowledgment: if this workspace marks the GPU type as spot-only and
2211
+ # the user didn't pass --spot, reject with a clear message.
2212
+ if SPOT_GPU_TYPES and not request.get("spot", False):
2213
+ is_spot = SPOT_GPU_TYPES.strip() == "all" or gpu_type in [t.strip() for t in SPOT_GPU_TYPES.split(",")]
2214
+ if is_spot:
2215
+ error_msg = (
2216
+ f"{gpu_type.upper()} is only available as a spot instance in this environment. "
2217
+ f"Spot instances are ~1/3 the cost but can be reclaimed by AWS with 2-min notice. "
2218
+ f"Pass --spot to confirm: gpu-dev reserve --gpu-type {gpu_type} --spot"
2219
+ )
2220
+ logger.warning(f"Reservation: spot acknowledgment missing for {gpu_type}")
2221
+ return False, error_msg
2222
+
2208
2223
  # Validate GPU count based on type
2209
2224
  if gpu_type.startswith("cpu-") and gpu_count == 0:
2210
2225
  pass # Valid CPU-only instance
@@ -2435,6 +2450,7 @@ def update_gpu_availability_table(
2435
2450
  "b200-mig-3g": {"gpus_per_instance": 2},
2436
2451
  "h200": {"gpus_per_instance": 8},
2437
2452
  "b200": {"gpus_per_instance": 8},
2453
+ "b300": {"gpus_per_instance": 8},
2438
2454
  }
2439
2455
 
2440
2456
  gpu_config = gpu_type_configs.get(gpu_type, {"gpus_per_instance": 8})
@@ -6529,6 +6545,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
6529
6545
  "p5e.48xlarge": "H200",
6530
6546
  "p5en.48xlarge": "H200",
6531
6547
  "p6-b200.48xlarge": "B200",
6548
+ "p6-b300.48xlarge": "B300",
6532
6549
  }
6533
6550
 
6534
6551
  gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
@@ -180,8 +180,13 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.25"
183
+ LAMBDA_VERSION = "0.5.28"
184
184
  MIN_CLI_VERSION = "0.5.16"
185
+ # Comma-separated GPU types that require --spot flag, or "all" for every type.
186
+ # Empty = no spot types (on-demand / reserved). Set per-workspace.
187
+ SPOT_GPU_TYPES = lookup({
188
+ "prod-east1" = "all"
189
+ }, terraform.workspace, "")
185
190
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
191
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
187
192
  }, local.alb_env_vars)
@@ -334,6 +334,60 @@ locals {
334
334
  use_self_managed_nodes = true
335
335
  instance_type = "g4dn.12xlarge"
336
336
  supported_gpu_types = {
337
+ # 8-GPU spot instances. instance_count=1 means the ASG tries to maintain 1
338
+ # spot instance per type — if AWS can't grant it (capacity / quota), the ASG
339
+ # sits at 0 and gpu-dev reservations queue. Bump counts once we see what
340
+ # actually gets fulfilled in us-east-1.
341
+ "b300" = {
342
+ instance_type = "p6-b300.48xlarge"
343
+ instance_types = null
344
+ instance_count = 1
345
+ gpus_per_instance = 8
346
+ use_placement_group = false
347
+ architecture = "x86_64"
348
+ efa_network_cards = 8
349
+ use_spot = true
350
+ }
351
+ "b200" = {
352
+ instance_type = "p6-b200.48xlarge"
353
+ instance_types = null
354
+ instance_count = 1
355
+ gpus_per_instance = 8
356
+ use_placement_group = false
357
+ architecture = "x86_64"
358
+ efa_network_cards = 8
359
+ use_spot = true
360
+ }
361
+ "h200" = {
362
+ instance_type = "p5e.48xlarge"
363
+ instance_types = null
364
+ instance_count = 1
365
+ gpus_per_instance = 8
366
+ use_placement_group = false
367
+ architecture = "x86_64"
368
+ efa_network_cards = 16
369
+ use_spot = true
370
+ }
371
+ "h100" = {
372
+ instance_type = "p5.48xlarge"
373
+ instance_types = null
374
+ instance_count = 1
375
+ gpus_per_instance = 8
376
+ use_placement_group = false
377
+ architecture = "x86_64"
378
+ efa_network_cards = 32
379
+ use_spot = true
380
+ }
381
+ "a100" = {
382
+ instance_type = "p4d.24xlarge"
383
+ instance_types = null
384
+ instance_count = 1
385
+ gpus_per_instance = 8
386
+ use_placement_group = false
387
+ architecture = "x86_64"
388
+ efa_network_cards = 4
389
+ use_spot = true
390
+ }
337
391
  "t4" = {
338
392
  instance_type = "g4dn.12xlarge"
339
393
  instance_types = null
@@ -421,8 +475,15 @@ locals {
421
475
  # Workspace-specific GPU type to subnet mappings
422
476
  gpu_subnet_assignments = {
423
477
  "prod-east1" = {
424
- # All node types land in the primary subnet (us-east-1a). Spot availability is
425
- # better than placement-group-strictness on these small ASGs.
478
+ # All node types land in the primary subnet (us-east-1a). Multi-EFA types
479
+ # (efa_network_cards > 1) automatically use the private subnet in the same AZ.
480
+ # Specific instance types may not have capacity in us-east-1a — those ASGs will
481
+ # sit at 0 until we widen to other AZs, that's expected for beta.
482
+ b300 = "primary"
483
+ b200 = "primary"
484
+ h200 = "primary"
485
+ h100 = "primary"
486
+ a100 = "primary"
426
487
  t4 = "primary"
427
488
  l4 = "primary"
428
489
  "cpu-x86" = "primary"
@@ -451,6 +512,22 @@ locals {
451
512
  }
452
513
  }
453
514
 
515
+ # Subdomain NS delegations to create in *this* workspace's parent zone. Lets
516
+ # prod (which owns devservers.io) auto-publish NS records pointing at child zones
517
+ # in other workspaces (prod-east1, future regions) without manual -var flags.
518
+ # The NS values come from `tofu output devservers_name_servers` in the child
519
+ # workspace once its hosted zone has been created.
520
+ prod_subdomain_delegations = {
521
+ prod = {
522
+ "east1.devservers.io" = [
523
+ "ns-1079.awsdns-06.org",
524
+ "ns-1999.awsdns-57.co.uk",
525
+ "ns-341.awsdns-42.com",
526
+ "ns-624.awsdns-14.net",
527
+ ]
528
+ }
529
+ }
530
+
454
531
  # Per-capacity-reservation AZ mappings (overrides gpu_subnet_assignments when CR is used)
455
532
  capacity_reservation_azs = {
456
533
  "prod-east1" = {
@@ -12,7 +12,8 @@ resource "helm_release" "aws_node_termination_handler" {
12
12
  repository = "https://aws.github.io/eks-charts"
13
13
  chart = "aws-node-termination-handler"
14
14
  namespace = "kube-system"
15
- version = "0.27.1"
15
+ # No version pin — chart versions advance frequently and my first guess (0.27.1)
16
+ # didn't exist. helm picks current latest stable. Add a pin once we hit a regression.
16
17
  cleanup_on_fail = true
17
18
 
18
19
  values = [yamlencode({
@@ -51,6 +51,19 @@ resource "aws_route53_record" "manual_subdomain_delegation" {
51
51
  records = var.subdomain_ns_records
52
52
  }
53
53
 
54
+ # Auto-published NS delegations for child workspaces. Iterates prod_subdomain_delegations
55
+ # (defined in main.tf) for the current workspace and creates an NS record per entry in
56
+ # the parent zone — so `tofu apply` in prod automatically wires up east1.devservers.io
57
+ # (and any future region) without -var flags.
58
+ resource "aws_route53_record" "workspace_subdomain_delegations" {
59
+ for_each = local.effective_domain_name != "" && !local.is_subdomain ? try(local.prod_subdomain_delegations[terraform.workspace], {}) : {}
60
+ zone_id = data.aws_route53_zone.parent[0].zone_id
61
+ name = each.key
62
+ type = "NS"
63
+ ttl = 300
64
+ records = each.value
65
+ }
66
+
54
67
  # Use appropriate hosted zone (subdomain if created, otherwise parent)
55
68
  locals {
56
69
  hosted_zone_id = local.is_subdomain ? aws_route53_zone.subdomain[0].zone_id : (local.effective_domain_name != "" ? data.aws_route53_zone.parent[0].zone_id : "")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes