gpu-dev 0.5.31__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/CLAUDE.md +52 -0
  2. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/PKG-INFO +1 -1
  3. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  4. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -4
  5. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +106 -44
  6. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +7 -6
  7. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +15 -2
  8. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +40 -23
  9. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +19 -5
  10. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/pyproject.toml +1 -1
  11. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ami-baker.tf +22 -3
  12. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/availability.tf +1 -1
  13. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/Dockerfile +9 -9
  14. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ecr.tf +73 -4
  15. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/eks.tf +45 -5
  16. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/expiry.tf +1 -1
  17. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/kubernetes.tf +13 -13
  18. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/availability_updater/index.py +7 -5
  19. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/index.py +258 -170
  20. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda.tf +29 -5
  21. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/main.tf +31 -5
  22. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy-service.tf +8 -7
  23. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/al2023-user-data.sh +102 -10
  24. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +7 -3
  25. gpu_dev-0.5.31/PROGRESS.md +0 -288
  26. gpu_dev-0.5.31/PR_DESCRIPTION.md +0 -168
  27. gpu_dev-0.5.31/TODO.md +0 -64
  28. gpu_dev-0.5.31/post.md +0 -233
  29. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/.github/workflows/no-gitlinks.yml +0 -0
  30. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/.github/workflows/publish.yml +0 -0
  31. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/.gitignore +0 -0
  32. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/README.md +0 -0
  33. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/admin/README.md +0 -0
  34. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/admin/generate_stats.py +0 -0
  35. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/admin/requirements.txt +0 -0
  36. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/README.md +0 -0
  37. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  38. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  39. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  40. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  41. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  42. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  43. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  44. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  45. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  46. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  47. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  48. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/docs/USER_GUIDE.md +0 -0
  49. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/docs/devgpu-features.html +0 -0
  50. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/docs/docker-mark-blue.svg +0 -0
  51. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/docs/icons8-cursor-ai.svg +0 -0
  52. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/setup.cfg +0 -0
  53. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  54. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  55. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/README.md +0 -0
  56. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/alb.tf +0 -0
  57. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/backend.tf +0 -0
  58. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/check_b200.py +0 -0
  59. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  60. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  61. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  62. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  63. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bash_profile +0 -0
  64. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bashrc +0 -0
  65. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  66. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  67. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  68. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  69. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/motd_script +0 -0
  70. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  71. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/profile +0 -0
  72. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  73. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  74. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  75. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/shell_env +0 -0
  76. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/ssh_config +0 -0
  77. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zprofile +0 -0
  78. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zshrc +0 -0
  79. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  80. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-build.tf +0 -0
  81. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  82. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  83. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/efs.tf +0 -0
  84. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/git-cache.tf +0 -0
  85. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  86. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  87. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  88. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  89. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  90. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  91. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  92. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  93. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  94. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  95. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  96. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  97. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  98. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  99. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/list_b200.py +0 -0
  100. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/mig-config.tf +0 -0
  101. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  102. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  103. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  104. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  105. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  106. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  107. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/monitoring.tf +0 -0
  108. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  109. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/outputs.tf +0 -0
  110. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/pyproject.toml +0 -0
  111. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/queue.tf +0 -0
  112. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/route53.tf +0 -0
  113. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  114. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  115. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  116. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  117. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  118. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  119. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  120. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  121. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  122. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  123. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/switch-to.sh +0 -0
  124. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  125. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  126. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  127. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/terraform-gpu-devservers/variables.tf +0 -0
  128. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/tests/submit/README.md +0 -0
  129. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/tests/submit/fail/run.sh +0 -0
  130. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/tests/submit/multinode/run.sh +0 -0
  131. {gpu_dev-0.5.31 → gpu_dev-0.6.0}/tests/submit/success/run.sh +0 -0
@@ -183,6 +183,55 @@ kubectl port-forward -n monitoring svc/kube-prometheus-stack-prometheus 9090:909
183
183
  kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
184
184
  ```
185
185
 
186
+ ## Multi-Region Single-State Refactor (Research Notes, May 2026)
187
+
188
+ **Goal:** One `tf apply` manages all regions. No more `tf-all`, no double Docker builds, no double AMI bakes.
189
+
190
+ **Approach:** Module-per-region pattern.
191
+ ```hcl
192
+ # root main.tf
193
+ module "us_east_2" {
194
+ source = "./modules/region"
195
+ region = "us-east-2"
196
+ gpu_types = { h100 = {...}, b200 = {...}, ... }
197
+ spot_types = []
198
+ providers = { aws = aws.us_east_2 }
199
+ }
200
+ module "us_east_1" {
201
+ source = "./modules/region"
202
+ region = "us-east-1"
203
+ gpu_types = { b300 = {...}, t4 = {...}, ... }
204
+ spot_types = ["b300", "b200", "h100", ...]
205
+ providers = { aws = aws.us_east_1 }
206
+ }
207
+ ```
208
+
209
+ **What goes in the module:** VPC, subnets, EKS cluster, ASGs, launch templates, Lambda functions, DDB tables, EFS, monitoring, DNS. Basically everything in the current root except provider config and shared resources.
210
+
211
+ **What stays at root:** Provider blocks with aliases, ECR replication config, AMI copy (`aws_ami_copy` from primary to secondary regions), global IAM roles if any, CLI config.
212
+
213
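+ **AMI sharing:** Build the baked AMI once in us-east-2 (primary), then `aws_ami_copy` it to the other regions. One build, replicated. The `ami_baker` stays in root and outputs the AMI ID; each module receives the AMI ID for its own region as a variable (a copied AMI gets a new, region-specific ID).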
214
+
215
+ **Docker sharing:** ECR replication already set up. Docker builds once in primary region, auto-replicates.
216
+
217
+ **Migration plan (since nobody uses east1 yet):**
218
+ 1. `tofu workspace select prod-east1 && tofu destroy` — clean slate
219
+ 2. Move all resources into `modules/region/`
220
+ 3. Create provider aliases in root
221
+ 4. Import prod (us-east-2) resources into new module state: `tofu import module.us_east_2.aws_vpc.gpu_dev_vpc vpc-xxx`
222
+ 5. Add us-east-1 module — fresh create, no import needed
223
+ 6. Delete workspace: `tofu workspace delete prod-east1`
224
+
225
+ **Risks:**
226
+ - Import step for prod is tedious (~50+ resources) but mechanical
227
+ - Lambda zip paths need to be relative to module, not root
228
+ - EKS auth (aws-auth ConfigMap) is per-cluster — each module manages its own
229
+ - CLI needs to know which region to query — already handled by config
230
+
231
+ **Estimated effort:** 1 dedicated session (~4-6 hours). Most of the time will go to the module extraction and the prod import.
232
+
233
+ **Prerequisite for:** Adding us-west-1, us-west-2, or any future region (becomes one module block each).
234
+
186
235
  ## Recent Fixes (Oct 27, 2025)
187
236
 
188
237
  **NVIDIA Profiling Bootstrap Configuration (Oct 27, 2025):**
@@ -232,6 +281,9 @@ kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
232
281
 
233
282
  ### 📋 Remaining Tasks
234
283
 
284
+ - **Merge multi-region into single tf state** - HIGH PRIORITY. Kill prod-east1 workspace, refactor into module-per-region in one state. See the "Multi-Region Single-State Refactor" research notes above. Enables: one `tf apply`, shared AMI (aws_ami_copy), shared Docker (ECR replication already set up), no double builds. Prerequisite for adding west regions.
285
+ - **Add us-west-1 and us-west-2 spot regions** - BLOCKED on single-state refactor. After refactor, adding a region = adding one module block.
286
+ - **Spot UX improvements** - Queue position should be numbered per GPU type, so the first request waiting for each type shows #1 (not a cross-type FIFO position). Status should show "queued (waiting for capacity)", not just "queued". Interactive picker should show spot GPU counts from east1, not prod.
235
287
  - **FQDN for devservers** - Set up proper domain names for development server access
236
288
  - **Automated SSH config per reservation** - ✅ DONE - Each reservation now gets `~/.devgpu/<reservation_id>-sshconfig` file, use with `ssh -F ~/.devgpu/<reservation_id>-sshconfig <pod_name>`
237
289
  - **Custom Docker image scaffold** - Create Dockerfile with pre-installed packages (Jupyter, etc.)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.31
3
+ Version: 0.6.0
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.31
3
+ Version: 0.6.0
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,10 +1,6 @@
1
1
  .gitignore
2
2
  CLAUDE.md
3
- PROGRESS.md
4
- PR_DESCRIPTION.md
5
3
  README.md
6
- TODO.md
7
- post.md
8
4
  pyproject.toml
9
5
  .github/workflows/no-gitlinks.yml
10
6
  .github/workflows/publish.yml
@@ -526,7 +526,7 @@ def main(ctx: click.Context) -> None:
526
526
  "--gpu-type",
527
527
  "-t",
528
528
  type=click.Choice(
529
- ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
529
+ ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"], case_sensitive=False
530
530
  ),
531
531
  help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
532
532
  )
@@ -698,6 +698,7 @@ def reserve(
698
698
  "b300": {"max_gpus": 8, "instance_type": "p6-b300.48xlarge"},
699
699
  "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
700
700
  "cpu-x86": {"max_gpus": 0, "instance_type": "c7i.4xlarge"},
701
+ "cpu-spot": {"max_gpus": 0, "instance_type": "c7i.2xlarge"},
701
702
  }
702
703
 
703
704
  # Early validation of GPU type to extract max_gpus (needed for disk selection)
@@ -896,6 +897,13 @@ def reserve(
896
897
 
897
898
  else:
898
899
  # Non-interactive mode - use defaults and validate
900
+ # Route --spot to east1 when on prod (env vars override config region)
901
+ if spot and load_config().user_config.get("environment") == "prod":
902
+ east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
903
+ if east1_cfg:
904
+ import os as _os
905
+ _os.environ["AWS_REGION"] = east1_cfg["region"]
906
+
899
907
  if gpu_type is None:
900
908
  gpu_type = "a100"
901
909
  if hours is None:
@@ -1418,7 +1426,7 @@ def reserve(
1418
1426
 
1419
1427
  _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
1420
1428
  "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000",
1421
- "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"]
1429
+ "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"]
1422
1430
 
1423
1431
 
1424
1432
  @main.command(context_settings={"ignore_unknown_options": True})
@@ -1837,7 +1845,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1837
1845
  ended = item.get("reservation_ended") or item.get("expired_at") or item.get("created_at", "")
1838
1846
  if ended and ended < one_hour_ago:
1839
1847
  continue
1840
- item["_region"] = "us-east-1"
1848
+ item["_region"] = "east1"
1841
1849
  results.append(item)
1842
1850
  return results
1843
1851
  except Exception:
@@ -1847,11 +1855,45 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1847
1855
  active_future = executor.submit(fetch_active)
1848
1856
  failures_future = executor.submit(fetch_recent_failures)
1849
1857
  east1_future = executor.submit(fetch_east1)
1850
- reservations = active_future.result() + failures_future.result() + east1_future.result()
1858
+ prod_results = active_future.result() + failures_future.result()
1859
+ for r in prod_results:
1860
+ if "_region" not in r:
1861
+ r["_region"] = "prod"
1862
+ east1_results = east1_future.result()
1863
+ for r in east1_results:
1864
+ if "_region" not in r:
1865
+ r["_region"] = "east1"
1866
+ reservations = prod_results + east1_results
1851
1867
  else:
1852
- reservations = reservation_mgr.list_reservations(
1868
+ prod_res = reservation_mgr.list_reservations(
1853
1869
  user_filter=user_filter, statuses_to_include=statuses_to_include
1854
1870
  )
1871
+ for r in prod_res:
1872
+ if "_region" not in r:
1873
+ r["_region"] = "prod"
1874
+ east1_res = fetch_east1() if not status else []
1875
+ if not east1_res:
1876
+ try:
1877
+ east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
1878
+ if east1_env and config.user_config.get("environment") == "prod":
1879
+ import boto3 as _b3
1880
+ east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
1881
+ east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
1882
+ for s in (statuses_to_include or ["active", "preparing", "queued", "pending"]):
1883
+ resp = east1_table.query(
1884
+ IndexName="StatusIndex",
1885
+ KeyConditionExpression="#s = :status",
1886
+ ExpressionAttributeNames={"#s": "status"},
1887
+ ExpressionAttributeValues={":status": s},
1888
+ )
1889
+ for item in resp.get("Items", []):
1890
+ if user_filter and item.get("user_id") != user_filter:
1891
+ continue
1892
+ item["_region"] = "east1"
1893
+ east1_res.append(item)
1894
+ except Exception:
1895
+ pass
1896
+ reservations = prod_res + east1_res
1855
1897
  except RuntimeError as e:
1856
1898
  rprint(f"[red]❌ {str(e)}[/red]")
1857
1899
  return False
@@ -1883,7 +1925,8 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1883
1925
 
1884
1926
  # Create table with enhanced columns for queue info
1885
1927
  # Check if we have cross-region reservations
1886
- _has_east1 = any(r.get("_region") == "us-east-1" for r in reservations)
1928
+ _regions = frozenset(r.get("_region", "") for r in reservations if r.get("_region"))
1929
+ _has_multi_region = len(_regions) > 1 or "east1" in _regions
1887
1930
 
1888
1931
  table = Table(title="GPU Reservations")
1889
1932
  table.add_column("ID", style="cyan", no_wrap=True)
@@ -1894,7 +1937,7 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1894
1937
  table.add_column("Queue Info", style="cyan")
1895
1938
  table.add_column("Created", style="blue")
1896
1939
  table.add_column("Expires/ETA", style="red")
1897
- if _has_east1:
1940
+ if _has_multi_region:
1898
1941
  table.add_column("Region", style="dim")
1899
1942
  if details:
1900
1943
  table.add_column("CLI Ver", style="dim", no_wrap=True)
@@ -1935,13 +1978,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1935
1978
  # Use the new helper that shows time + remaining
1936
1979
  expires_formatted = _format_expires_with_remaining(expires_at)
1937
1980
  elif res_status in ["queued", "pending"]:
1938
- # Show estimated wait time if available
1939
1981
  estimated_wait = reservation.get(
1940
1982
  "estimated_wait_minutes", "?")
1941
- if estimated_wait != "?" and estimated_wait is not None:
1983
+ if estimated_wait and estimated_wait not in ("?", "None", None):
1942
1984
  expires_formatted = f"~{estimated_wait}min"
1943
1985
  else:
1944
- expires_formatted = "Calculating..."
1986
+ expires_formatted = "Waiting..."
1945
1987
  elif res_status in ("expired", "failed", "cancelled"):
1946
1988
  reason = reservation.get("failure_reason", "")
1947
1989
  ended = reservation.get("reservation_ended") or reservation.get("expired_at", "")
@@ -1968,15 +2010,11 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
1968
2010
  # Format queue info for queued reservations
1969
2011
  queue_info = ""
1970
2012
  if res_status in ["queued", "pending"]:
1971
- queue_position = reservation.get("queue_position", "?")
1972
- estimated_wait = reservation.get(
1973
- "estimated_wait_minutes", "?")
1974
- if queue_position != "?" and queue_position is not None:
1975
- queue_info = f"#{queue_position}"
1976
- if estimated_wait != "?" and estimated_wait is not None:
1977
- queue_info += f" (~{estimated_wait}min)"
2013
+ detail = reservation.get("current_detailed_status") or reservation.get("detailed_status") or ""
2014
+ if "capacity" in detail.lower() or "spot" in detail.lower():
2015
+ queue_info = "Waiting for spot"
1978
2016
  else:
1979
- queue_info = "Calculating..."
2017
+ queue_info = "Spot pending"
1980
2018
  elif res_status == "active":
1981
2019
  # Show pod IP for multinode, SSH hint for single-node
1982
2020
  pod_ip = reservation.get("pod_ip", "")
@@ -2099,9 +2137,12 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
2099
2137
  row_data.append(
2100
2138
  f"[dim]{lambda_version_display}[/dim]" if dim_row else lambda_version_display)
2101
2139
 
2102
- if _has_east1:
2103
- region = reservation.get("_region", "us-east-2")
2104
- row_data.append("[yellow]east1[/yellow]" if region == "us-east-1" else "prod")
2140
+ if _has_multi_region:
2141
+ region = reservation.get("_region", "prod")
2142
+ if region in ("us-east-1", "east1"):
2143
+ row_data.append("[yellow]east1[/yellow]")
2144
+ else:
2145
+ row_data.append("prod")
2105
2146
 
2106
2147
  table.add_row(*row_data)
2107
2148
 
@@ -2279,8 +2320,11 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
2279
2320
 
2280
2321
  queue_info = ""
2281
2322
  if res_status in ["queued", "pending"]:
2282
- queue_position = reservation.get("queue_position", "?")
2283
- queue_info = f"#{queue_position}" if queue_position != "?" else "Calculating..."
2323
+ detail = reservation.get("current_detailed_status") or reservation.get("detailed_status") or ""
2324
+ if "capacity" in detail.lower() or "spot" in detail.lower():
2325
+ queue_info = "Waiting for spot"
2326
+ else:
2327
+ queue_info = "Spot pending"
2284
2328
  elif res_status == "active":
2285
2329
  queue_info = "Ready"
2286
2330
 
@@ -2313,10 +2357,10 @@ def list(ctx: click.Context, user: Optional[str], status: Optional[str], details
2313
2357
  expires_formatted = _format_expires_with_remaining(expires_at)
2314
2358
  elif res_status in ["queued", "pending"]:
2315
2359
  estimated_wait = reservation.get("estimated_wait_minutes", "?")
2316
- if estimated_wait != "?" and estimated_wait is not None:
2360
+ if estimated_wait and estimated_wait not in ("?", "None", None):
2317
2361
  expires_formatted = f"~{estimated_wait}min"
2318
2362
  else:
2319
- expires_formatted = "Calculating..."
2363
+ expires_formatted = "Waiting..."
2320
2364
  else:
2321
2365
  expires_formatted = "N/A"
2322
2366
 
@@ -2531,10 +2575,21 @@ def cancel(
2531
2575
  with Live(
2532
2576
  Spinner("dots", text="📡 Cancelling reservations..."), console=console
2533
2577
  ) as live:
2578
+ # Build east1 reservation manager for cross-region cancellations
2579
+ east1_mgr = None
2580
+ east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
2581
+ if east1_env:
2582
+ import os as _os
2583
+ _east1_config = Config()
2584
+ _east1_config.aws_region = east1_env["region"]
2585
+ east1_mgr = ReservationManager(_east1_config)
2586
+
2534
2587
  for reservation in reservations:
2535
2588
  res_id = reservation.get("reservation_id", "")
2536
2589
  if res_id:
2537
- success = reservation_mgr.cancel_reservation(
2590
+ # Use east1 manager for east1 reservations
2591
+ mgr = east1_mgr if reservation.get("_region") in ("east1", "us-east-1") and east1_mgr else reservation_mgr
2592
+ success = mgr.cancel_reservation(
2538
2593
  res_id, user_info["user_id"]
2539
2594
  )
2540
2595
  if success:
@@ -2971,7 +3026,7 @@ def _show_availability() -> None:
2971
3026
  spot_table.add_column("Avail\nNow", style="green")
2972
3027
  spot_table.add_column("Per\nNode", style="bright_green")
2973
3028
  spot_table.add_column("Status", style="magenta")
2974
- spot_table.add_column("Availability", style="dim")
3029
+ spot_table.add_column("Spot Discount", style="dim")
2975
3030
  _on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
2976
3031
  for gt, info in sorted(spot_region_info.items()):
2977
3032
  avail = info.get("available", 0)
@@ -2981,14 +3036,12 @@ def _show_availability() -> None:
2981
3036
  si = info.get("spot_info", {}) or {}
2982
3037
  sp = si.get("spot_price", "") if isinstance(si, dict) else ""
2983
3038
  if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
2984
- avail_signal = "[red]Not offered[/red]"
3039
+ avail_signal = "[green]Available[/green]" if avail > 0 else "[dim]No price data[/dim]"
2985
3040
  else:
2986
3041
  try:
2987
3042
  ratio = float(sp) / _on_demand.get(gt, 50)
2988
3043
  pct = int((1 - ratio) * 100)
2989
- if ratio < 0.4: avail_signal = f"[green]High ({pct}% off)[/green]"
2990
- elif ratio < 0.7: avail_signal = f"[yellow]Medium ({pct}% off)[/yellow]"
2991
- else: avail_signal = f"[red]Low ({pct}% off)[/red]"
3044
+ avail_signal = f"[green]{pct}% off on-demand[/green]" if pct > 0 else "[dim]At on-demand price[/dim]"
2992
3045
  except (ValueError, TypeError):
2993
3046
  avail_signal = "[yellow]Unknown[/yellow]"
2994
3047
  spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
@@ -3266,21 +3319,30 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3266
3319
 
3267
3320
  live.start()
3268
3321
 
3269
- # If the selected reservation is from east1, switch to east1 reservation_mgr
3270
- _sel = next((r for r in (locals().get("reservations") or []) if r.get("reservation_id", "").startswith(reservation_id)), None)
3271
- if _sel and _sel.get("_region") == "us-east-1":
3272
- import os as _os
3273
- east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
3274
- _os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
3275
- _east1_config = Config()
3276
- _east1_config.aws_region = east1_cfg["region"]
3277
- reservation_mgr = ReservationManager(_east1_config)
3278
-
3279
- # Get connection info
3322
+ # Try current region first, then cross-region if not found
3280
3323
  connection_info = reservation_mgr.get_connection_info(
3281
3324
  reservation_id, user_info["user_id"]
3282
3325
  )
3283
3326
 
3327
+ # If not found, try the other region
3328
+ if not connection_info:
3329
+ import os as _os
3330
+ current_env = config.user_config.get("environment", "prod")
3331
+ other_envs = {"prod": "prod-east1", "prod-east1": "prod"}
3332
+ other_env_name = other_envs.get(current_env)
3333
+ if other_env_name:
3334
+ other_env = Config.ENVIRONMENTS.get(other_env_name, {})
3335
+ if other_env:
3336
+ _os.environ["AWS_DEFAULT_REGION"] = other_env["region"]
3337
+ _other_config = Config()
3338
+ _other_config.aws_region = other_env["region"]
3339
+ other_mgr = ReservationManager(_other_config)
3340
+ connection_info = other_mgr.get_connection_info(
3341
+ reservation_id, user_info["user_id"]
3342
+ )
3343
+ if connection_info:
3344
+ reservation_mgr = other_mgr
3345
+
3284
3346
  live.stop()
3285
3347
 
3286
3348
  if not connection_info:
@@ -3829,7 +3891,7 @@ def set(key: str, value: str) -> None:
3829
3891
 
3830
3892
 
3831
3893
  @config.command()
3832
- @click.argument("env_name", type=click.Choice(["test", "prod", "prod-east1"]))
3894
+ @click.argument("env_name", type=click.Choice(["test", "prod"]))
3833
3895
  def environment(env_name: str) -> None:
3834
3896
  """Set the environment
3835
3897
 
@@ -3841,7 +3903,7 @@ def environment(env_name: str) -> None:
3841
3903
  \b
3842
3904
  Examples:
3843
3905
  gpu-dev config environment prod # Production (us-east-2)
3844
- gpu-dev config environment prod-east1 # Spot-only us-east-1
3906
+ gpu-dev config environment prod # Production (spot accessible via interactive picker)
3845
3907
  gpu-dev config environment test # Test (us-west-1)
3846
3908
 
3847
3909
  Environment configurations:
@@ -26,7 +26,7 @@ class Config:
26
26
  "region": "us-east-1",
27
27
  "workspace": "prod-east1",
28
28
  "description": "Spot-only us-east-1 environment (T4/L4/CPU)",
29
- "spot_types": ["b300", "b200", "h200", "h100", "a100"],
29
+ "spot_types": ["b300", "b200", "h200", "h100", "a100", "t4", "l4", "rtxpro6000"],
30
30
  },
31
31
  }
32
32
  DEFAULT_ENVIRONMENT = "prod"
@@ -42,13 +42,14 @@ class Config:
42
42
  # Load unified config (handles migration from legacy files)
43
43
  self.user_config = self._load_config()
44
44
 
45
- # Get region from config, then AWS env vars, or default
46
- if self.user_config.get("region"):
45
+ # Get region: env vars take priority (for spot routing), then config, then default
46
+ env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
47
+ if env_region and env_region != self.user_config.get("region"):
48
+ self.aws_region = env_region
49
+ elif self.user_config.get("region"):
47
50
  self.aws_region = self.user_config["region"]
48
51
  else:
49
- self.aws_region = os.getenv(
50
- "AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-2")
51
- )
52
+ self.aws_region = "us-east-2"
52
53
 
53
54
  os.environ["AWS_DEFAULT_REGION"] = self.aws_region
54
55
 
@@ -355,8 +355,21 @@ def unlock_disk(disk_name: str, user_id: str, config: Config) -> bool:
355
355
  return False
356
356
 
357
357
  if not disk['in_use']:
358
- print(f"Disk '{disk_name}' is not locked")
359
- return False
358
+ # DDB says not locked — but check if EBS volume is still physically attached
359
+ try:
360
+ ec2 = config.session.client('ec2', region_name=config.aws_region)
361
+ vols = ec2.describe_volumes(Filters=[
362
+ {"Name": "tag:gpu-dev-user", "Values": [user_id]},
363
+ {"Name": "tag:disk_name", "Values": [disk_name]},
364
+ {"Name": "status", "Values": ["in-use"]},
365
+ ]).get("Volumes", [])
366
+ if not vols:
367
+ print(f"Disk '{disk_name}' is not locked")
368
+ return False
369
+ print(f"Disk '{disk_name}' DDB lock is clear but EBS volume is still attached — sending force-detach request")
370
+ except Exception:
371
+ print(f"Disk '{disk_name}' is not locked")
372
+ return False
360
373
 
361
374
  operation_id = str(uuid.uuid4())
362
375
 
@@ -52,11 +52,19 @@ def check_interactive_support() -> bool:
52
52
 
53
53
  def select_gpu_type_interactive(
54
54
  availability_info: Dict[str, Dict[str, Any]],
55
+ _refresh: bool = False,
55
56
  ) -> Optional[str]:
56
57
  """Interactive GPU type selection with availability table"""
57
58
  if not check_interactive_support():
58
59
  return None
59
60
 
61
+ if _refresh:
62
+ from .reservations import ReservationManager
63
+ from .config import load_config
64
+ _cfg = load_config()
65
+ _mgr = ReservationManager(_cfg)
66
+ availability_info = _mgr.get_gpu_availability_by_type() or availability_info
67
+
60
68
  # Hide MIG slice SKUs from the top-level selector — reached via the h100 submenu.
61
69
  # Direct `--gpu-type h100-mig-1g` still works for non-interactive scripts.
62
70
  visible_info = {
@@ -194,7 +202,7 @@ def select_gpu_type_interactive(
194
202
  st.add_column("Avail\nNow", style="green")
195
203
  st.add_column("Per\nNode", style="bright_green")
196
204
  st.add_column("Status", style="magenta")
197
- st.add_column("Availability", style="dim")
205
+ st.add_column("Spot Discount", style="dim")
198
206
  _on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
199
207
  for gt, info in spot_gpus.items():
200
208
  avail = info.get("available", 0)
@@ -205,7 +213,7 @@ def select_gpu_type_interactive(
205
213
  # Availability signal from spot price vs on-demand
206
214
  sp = si.get("spot_price", "") if isinstance(si, dict) else ""
207
215
  if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
208
- avail_signal = "[red]Not offered[/red]"
216
+ avail_signal = "[green]Available[/green]" if avail > 0 else "[dim]No price data[/dim]"
209
217
  else:
210
218
  try:
211
219
  ratio = float(sp) / _on_demand.get(gt, 50)
@@ -266,37 +274,46 @@ def select_gpu_type_interactive(
266
274
  si_data = info.get("spot_info", {}) or {}
267
275
  sp = si_data.get("spot_price", "") if isinstance(si_data, dict) else ""
268
276
  # Derive availability signal
277
+ avail_now = int(info.get("available", 0))
269
278
  if not sp or "No spot data" in str(si_data.get("spot_signal", "")):
270
- # Not offered — skip from choices
271
- continue
272
- try:
273
- ratio = float(sp) / _on_demand.get(gt, 50)
274
- pct = int((1 - ratio) * 100)
275
- if ratio < 0.4: signal = f"🟢 High avail ({pct}% off)"
276
- elif ratio < 0.7: signal = f"🟡 Medium ({pct}% off)"
277
- else: signal = f"🔴 Low ({pct}% off)"
278
- except (ValueError, TypeError):
279
- signal = "availability unknown"
279
+ if avail_now > 0:
280
+ signal = f"🟢 {avail_now} available now"
281
+ else:
282
+ continue
283
+ else:
284
+ try:
285
+ ratio = float(sp) / _on_demand.get(gt, 50)
286
+ pct = int((1 - ratio) * 100)
287
+ if ratio < 0.4: signal = f"🟢 High avail ({pct}% off)"
288
+ elif ratio < 0.7: signal = f"🟡 Medium ({pct}% off)"
289
+ else: signal = f"🔴 Low ({pct}% off)"
290
+ except (ValueError, TypeError):
291
+ signal = "availability unknown"
280
292
  if avail > 0:
281
293
  label = f"✅ {gt.upper()} * ({avail} free, {pn}/node, {signal})"
282
294
  else:
283
295
  label = f"⚡ {gt.upper()} * ({pn} GPUs/node, {signal})"
284
296
  choices.append(questionary.Choice(title=label, value=f"spot:{gt}"))
285
297
 
286
- console.print()
298
+ choices.append(questionary.Separator("───"))
299
+ choices.append(questionary.Choice(title="🔄 Refresh availability", value="_refresh"))
287
300
 
288
- # Interactive selection console.print()
301
+ console.print()
289
302
 
290
- # Interactive selection
291
- try:
292
- answer = questionary.select(
293
- "Select GPU type:", choices=choices, style=custom_style
294
- ).ask()
303
+ # Interactive selection — loop on refresh
304
+ while True:
305
+ try:
306
+ answer = questionary.select(
307
+ "Select GPU type:", choices=choices, style=custom_style
308
+ ).ask()
295
309
 
296
- return answer
297
- except (KeyboardInterrupt, EOFError):
298
- console.print("\n[yellow]Selection cancelled.[/yellow]")
299
- return None
310
+ if answer == "_refresh":
311
+ console.print("[dim]Refreshing...[/dim]")
312
+ return select_gpu_type_interactive(availability_info, _refresh=True)
313
+ return answer
314
+ except (KeyboardInterrupt, EOFError):
315
+ console.print("\n[yellow]Selection cancelled.[/yellow]")
316
+ return None
300
317
 
301
318
 
302
319
  def _format_eta_seconds(delta_seconds: int) -> str:
@@ -826,8 +826,20 @@ class ReservationManager:
826
826
  ]
827
827
 
828
828
  if len(matching_reservations) == 0:
829
- return None
830
- elif len(matching_reservations) > 1:
829
+ # Not found by user_id — try direct lookup (for added users viewing other's reservations)
830
+ try:
831
+ from boto3.dynamodb.conditions import Key
832
+ scan_resp = self.reservations_table.scan(
833
+ FilterExpression="begins_with(reservation_id, :rid)",
834
+ ExpressionAttributeValues={":rid": reservation_id},
835
+ Limit=10,
836
+ )
837
+ matching_reservations = scan_resp.get("Items", [])
838
+ except Exception:
839
+ pass
840
+ if not matching_reservations:
841
+ return None
842
+ if len(matching_reservations) > 1:
831
843
  return None # Ambiguous - need longer prefix
832
844
 
833
845
  reservation = matching_reservations[0]
@@ -1689,6 +1701,7 @@ class ReservationManager:
1689
1701
  initial_text = f"📡 Starting multinode reservation..." if is_multinode else "🔄 Sending reservation request..."
1690
1702
  spinner = Spinner("dots", text=initial_text)
1691
1703
  live.update(spinner)
1704
+ poll_delay = 0.5 # start fast, back off over time
1692
1705
 
1693
1706
  while (
1694
1707
  (timeout_seconds is None or time.time() -
@@ -1749,7 +1762,7 @@ class ReservationManager:
1749
1762
  if not is_multinode:
1750
1763
  spinner.text = "📡 Waiting for reservation status update..."
1751
1764
  live.update(spinner)
1752
- time.sleep(2)
1765
+ time.sleep(0.5)
1753
1766
  continue
1754
1767
  else:
1755
1768
  node_details.append({
@@ -2281,8 +2294,9 @@ class ReservationManager:
2281
2294
 
2282
2295
  return None
2283
2296
 
2284
- # Continue polling
2285
- time.sleep(3)
2297
+ # Poll with backoff: 0.5s → 1s → 1.5s → 2s → 3s (cap)
2298
+ time.sleep(poll_delay)
2299
+ poll_delay = min(poll_delay + 0.5, 3.0)
2286
2300
 
2287
2301
  except Exception as e:
2288
2302
  console.print(
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.31"
7
+ version = "0.6.0"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"