gpu-dev 0.5.27__tar.gz → 0.5.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +19 -12
  4. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +6 -0
  5. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/pyproject.toml +1 -1
  6. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_processor/index.py +16 -2
  7. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda.tf +6 -1
  8. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/main.tf +1 -1
  9. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/.github/workflows/no-gitlinks.yml +0 -0
  10. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/.github/workflows/publish.yml +0 -0
  11. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/.gitignore +0 -0
  12. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/CLAUDE.md +0 -0
  13. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/PROGRESS.md +0 -0
  14. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/PR_DESCRIPTION.md +0 -0
  15. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/README.md +0 -0
  16. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/TODO.md +0 -0
  17. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/admin/README.md +0 -0
  18. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/admin/generate_stats.py +0 -0
  19. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/admin/requirements.txt +0 -0
  20. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/README.md +0 -0
  21. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  22. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  23. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  24. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  25. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  26. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  27. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  28. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  29. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  30. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  31. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  32. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  33. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  34. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  35. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  36. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/docs/USER_GUIDE.md +0 -0
  37. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/docs/devgpu-features.html +0 -0
  38. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/docs/docker-mark-blue.svg +0 -0
  39. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/docs/icons8-cursor-ai.svg +0 -0
  40. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/post.md +0 -0
  41. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/setup.cfg +0 -0
  42. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  43. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  44. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/README.md +0 -0
  45. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/alb.tf +0 -0
  46. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/availability.tf +0 -0
  47. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/backend.tf +0 -0
  48. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  49. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  50. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  51. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/bash_profile +0 -0
  52. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/bashrc +0 -0
  53. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  54. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  55. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  56. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  57. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/motd_script +0 -0
  58. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  59. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/profile +0 -0
  60. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  61. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  62. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  63. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/shell_env +0 -0
  64. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/ssh_config +0 -0
  65. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/zprofile +0 -0
  66. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/zshrc +0 -0
  67. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  68. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker-build.tf +0 -0
  69. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  70. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  71. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ecr.tf +0 -0
  72. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/efs.tf +0 -0
  73. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/eks.tf +0 -0
  74. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/expiry.tf +0 -0
  75. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/git-cache.tf +0 -0
  76. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  77. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/kubernetes.tf +0 -0
  78. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  79. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  80. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  81. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  82. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  83. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  84. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  85. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  86. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  87. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  88. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  89. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  90. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  91. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  92. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/mig-config.tf +0 -0
  93. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  94. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  95. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  96. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  97. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  98. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  99. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/monitoring.tf +0 -0
  100. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  101. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/outputs.tf +0 -0
  102. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/pyproject.toml +0 -0
  103. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/queue.tf +0 -0
  104. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/route53.tf +0 -0
  105. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  106. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  107. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  108. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  109. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  110. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  111. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  112. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  113. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  114. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  115. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/switch-to.sh +0 -0
  116. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  117. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  118. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  119. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  120. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/terraform-gpu-devservers/variables.tf +0 -0
  121. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/tests/submit/README.md +0 -0
  122. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/tests/submit/fail/run.sh +0 -0
  123. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/tests/submit/multinode/run.sh +0 -0
  124. {gpu_dev-0.5.27 → gpu_dev-0.5.29}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.27
3
+ Version: 0.5.29
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.27
3
+ Version: 0.5.29
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -576,6 +576,8 @@ def main(ctx: click.Context) -> None:
576
576
  multiple=True,
577
577
  help="Request nodes with specific label (format: key=value). Example: --node-label nsight=true for Nsight profiling nodes",
578
578
  )
579
+ @click.option("--spot", is_flag=True, default=False,
580
+ help="Acknowledge spot instance (~1/3 cost, may be preempted with 2-min notice). Required for spot-only types.")
579
581
  @click.pass_context
580
582
  def reserve(
581
583
  ctx: click.Context,
@@ -662,7 +664,7 @@ def reserve(
662
664
  "b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
663
665
  "h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
664
666
  "b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
665
- "b300": {"max_gpus": 8, "instance_type": "p6e-b300.48xlarge"},
667
+ "b300": {"max_gpus": 8, "instance_type": "p6-b300.48xlarge"},
666
668
  "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
667
669
  "cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
668
670
  }
@@ -1271,6 +1273,7 @@ def reserve(
1271
1273
  no_persistent_disk=no_persistent_disk,
1272
1274
  preserve_entrypoint=preserve_entrypoint,
1273
1275
  disk_name=disk,
1276
+ spot=spot,
1274
1277
  node_labels=node_labels if node_labels else None,
1275
1278
  )
1276
1279
  else:
@@ -1289,6 +1292,7 @@ def reserve(
1289
1292
  no_persistent_disk=no_persistent_disk,
1290
1293
  preserve_entrypoint=preserve_entrypoint,
1291
1294
  disk_name=disk,
1295
+ spot=spot,
1292
1296
  node_labels=node_labels if node_labels else None,
1293
1297
  trace=trace,
1294
1298
  )
@@ -1362,6 +1366,8 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
1362
1366
  @click.option("--hours", type=float, default=1.0, show_default=True, help="Reservation lifetime ceiling — job auto-cancels well before this if it finishes.")
1363
1367
  @click.option("--disk", type=str, default=None, help="Persistent disk name (master node only). Omit for ephemeral storage.")
1364
1368
  @click.option("--no-persistent-disk", is_flag=True, help="Skip persistent disk entirely.")
1369
+ @click.option("--spot", is_flag=True, default=False,
1370
+ help="Acknowledge spot instance (~1/3 cost, may be preempted). Required for spot-only types.")
1365
1371
  @click.option("--dockerfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), default=None,
1366
1372
  help="Local Dockerfile to build into the pod image (build context = the Dockerfile's directory).")
1367
1373
  @click.option("--dockerimage", type=str, default=None,
@@ -1377,7 +1383,7 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
1377
1383
  help="Minutes to wait for the reservation to become active. Defaults to 24h since GPU reservations may queue when the cluster is full.")
1378
1384
  @click.argument("command", nargs=-1, required=True)
1379
1385
  @click.pass_context
1380
- def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, dockerimage, preserve_entrypoint,
1386
+ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, spot, dockerfile, dockerimage, preserve_entrypoint,
1381
1387
  runtime, no_pull, keep_alive, name, timeout, command):
1382
1388
  """Submit a job: reserve, sync code, run, sync results back, auto-cancel.
1383
1389
 
@@ -1491,7 +1497,7 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
1491
1497
  user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
1492
1498
  duration_hours=hours, name=name, github_user=user_info["github_user"],
1493
1499
  no_persistent_disk=no_persistent_disk, disk_name=disk_name,
1494
- dockerfile=dockerfile_payload, dockerimage=dockerimage,
1500
+ spot=spot, dockerfile=dockerfile_payload, dockerimage=dockerimage,
1495
1501
  preserve_entrypoint=preserve_entrypoint)
1496
1502
  if not reservation_ids:
1497
1503
  rprint("[red]❌ Failed to create multinode reservation[/red]")
@@ -1502,7 +1508,7 @@ def submit(ctx, gpu_type, gpus, hours, disk, no_persistent_disk, dockerfile, doc
1502
1508
  user_id=user_info["user_id"], gpu_count=gpus, gpu_type=gt,
1503
1509
  duration_hours=hours, name=name, github_user=user_info["github_user"],
1504
1510
  no_persistent_disk=no_persistent_disk, disk_name=disk_name,
1505
- dockerfile=dockerfile_payload, dockerimage=dockerimage,
1511
+ spot=spot, dockerfile=dockerfile_payload, dockerimage=dockerimage,
1506
1512
  preserve_entrypoint=preserve_entrypoint)
1507
1513
  if not primary_id:
1508
1514
  rprint("[red]❌ Failed to create reservation[/red]")
@@ -3658,24 +3664,25 @@ def set(key: str, value: str) -> None:
3658
3664
 
3659
3665
 
3660
3666
  @config.command()
3661
- @click.argument("env_name", type=click.Choice(["test", "prod"]))
3667
+ @click.argument("env_name", type=click.Choice(list(Config.ENVIRONMENTS.keys())))
3662
3668
  def environment(env_name: str) -> None:
3663
- """Set the environment (test or prod)
3669
+ """Set the environment
3664
3670
 
3665
3671
  Sets the AWS region and Terraform workspace for the specified environment.
3666
- This configuration is used by the switch-to.sh script.
3667
3672
 
3668
3673
  Arguments:
3669
- ENV_NAME: Environment name (test or prod)
3674
+ ENV_NAME: Environment name
3670
3675
 
3671
3676
  \b
3672
3677
  Examples:
3673
- gpu-dev config environment test # Set to test environment (us-west-1)
3674
- gpu-dev config environment prod # Set to prod environment (us-east-2)
3678
+ gpu-dev config environment prod # Production (us-east-2)
3679
+ gpu-dev config environment prod-east1 # Spot-only us-east-1
3680
+ gpu-dev config environment test # Test (us-west-1)
3675
3681
 
3676
3682
  Environment configurations:
3677
- test: us-west-1, Terraform workspace 'default'
3678
- prod: us-east-2, Terraform workspace 'prod'
3683
+ test: us-west-1, Terraform workspace 'default'
3684
+ prod: us-east-2, Terraform workspace 'prod'
3685
+ prod-east1: us-east-1, Terraform workspace 'prod-east1' (spot-only)
3679
3686
  """
3680
3687
  from .config import Config
3681
3688
 
@@ -421,6 +421,7 @@ class ReservationManager:
421
421
  disk_name: Optional[str] = None,
422
422
  node_labels: Optional[Dict[str, str]] = None,
423
423
  trace: bool = False,
424
+ spot: bool = False,
424
425
  ) -> Optional[str]:
425
426
  """Create a new GPU reservation"""
426
427
  try:
@@ -500,6 +501,9 @@ class ReservationManager:
500
501
  if node_labels:
501
502
  message["node_labels"] = node_labels
502
503
 
504
+ if spot:
505
+ message["spot"] = True
506
+
503
507
  # Add trace flag and CLI start timestamp
504
508
  if trace:
505
509
  message["trace"] = True
@@ -536,6 +540,7 @@ class ReservationManager:
536
540
  preserve_entrypoint: bool = False,
537
541
  disk_name: Optional[str] = None,
538
542
  node_labels: Optional[Dict[str, str]] = None,
543
+ spot: bool = False,
539
544
  ) -> Optional[List[str]]:
540
545
  """Create multiple GPU reservations for multinode setup"""
541
546
  try:
@@ -602,6 +607,7 @@ class ReservationManager:
602
607
  "recreate_env": recreate_env,
603
608
  "is_multinode": True,
604
609
  "no_persistent_disk": no_persistent_disk,
610
+ "spot": spot,
605
611
  }
606
612
 
607
613
  if github_user:
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.27"
7
+ version = "0.5.29"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -59,6 +59,7 @@ ECR_REPOSITORY_URL = os.environ.get("ECR_REPOSITORY_URL")
59
59
  # Version validation - injected via Terraform
60
60
  LAMBDA_VERSION = os.environ.get("LAMBDA_VERSION", "0.3.9")
61
61
  MIN_CLI_VERSION = os.environ.get("MIN_CLI_VERSION", "0.3.9")
62
+ SPOT_GPU_TYPES = os.environ.get("SPOT_GPU_TYPES", "")
62
63
  OPERATIONS_TABLE = os.environ.get("OPERATIONS_TABLE", "pytorch-gpu-dev-operations")
63
64
 
64
65
  # GPU Configuration - single source of truth for all GPU type mappings
@@ -81,7 +82,7 @@ GPU_CONFIG = {
81
82
  "h100": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
82
83
  "h200": {"instance_type": "p5e.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
83
84
  "b200": {"instance_type": "p6-b200.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 32},
84
- "b300": {"instance_type": "p6e-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
85
+ "b300": {"instance_type": "p6-b300.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 8},
85
86
  "cpu-arm": {"instance_type": "c7g.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
86
87
  "cpu-x86": {"instance_type": "c7i.8xlarge", "max_gpus": 0, "cpus": 32, "memory_gb": 64, "efa_count": 0},
87
88
  }
@@ -2206,6 +2207,19 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
2206
2207
  logger.warning(f"User {user_id} blocked from {gpu_type}: maintenance mode")
2207
2208
  return False, error_msg
2208
2209
 
2210
+ # Spot acknowledgment: if this workspace marks the GPU type as spot-only and
2211
+ # the user didn't pass --spot, reject with a clear message.
2212
+ if SPOT_GPU_TYPES and not request.get("spot", False):
2213
+ is_spot = SPOT_GPU_TYPES.strip() == "all" or gpu_type in [t.strip() for t in SPOT_GPU_TYPES.split(",")]
2214
+ if is_spot:
2215
+ error_msg = (
2216
+ f"{gpu_type.upper()} is only available as a spot instance in this environment. "
2217
+ f"Spot instances are ~1/3 the cost but can be reclaimed by AWS with 2-min notice. "
2218
+ f"Pass --spot to confirm: gpu-dev reserve --gpu-type {gpu_type} --spot"
2219
+ )
2220
+ logger.warning(f"Reservation: spot acknowledgment missing for {gpu_type}")
2221
+ return False, error_msg
2222
+
2209
2223
  # Validate GPU count based on type
2210
2224
  if gpu_type.startswith("cpu-") and gpu_count == 0:
2211
2225
  pass # Valid CPU-only instance
@@ -6531,7 +6545,7 @@ def get_instance_type_and_gpu_info(k8s_client, pod_name: str) -> tuple[str, str]
6531
6545
  "p5e.48xlarge": "H200",
6532
6546
  "p5en.48xlarge": "H200",
6533
6547
  "p6-b200.48xlarge": "B200",
6534
- "p6e-b300.48xlarge": "B300",
6548
+ "p6-b300.48xlarge": "B300",
6535
6549
  }
6536
6550
 
6537
6551
  gpu_type = gpu_type_mapping.get(instance_type, "Unknown")
@@ -180,8 +180,13 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.27"
183
+ LAMBDA_VERSION = "0.5.28"
184
184
  MIN_CLI_VERSION = "0.5.16"
185
+ # Comma-separated GPU types that require --spot flag, or "all" for every type.
186
+ # Empty = no spot types (on-demand / reserved). Set per-workspace.
187
+ SPOT_GPU_TYPES = lookup({
188
+ "prod-east1" = "all"
189
+ }, terraform.workspace, "")
185
190
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
191
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
187
192
  }, local.alb_env_vars)
@@ -339,7 +339,7 @@ locals {
339
339
  # sits at 0 and gpu-dev reservations queue. Bump counts once we see what
340
340
  # actually gets fulfilled in us-east-1.
341
341
  "b300" = {
342
- instance_type = "p6e-b300.48xlarge"
342
+ instance_type = "p6-b300.48xlarge"
343
343
  instance_types = null
344
344
  instance_count = 1
345
345
  gpus_per_instance = 8
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes