gpu-dev 0.6.0__tar.gz → 0.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152) hide show
  1. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/PKG-INFO +23 -3
  2. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/README.md +22 -2
  3. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +23 -3
  4. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +25 -0
  5. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +1 -1
  6. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +119 -56
  7. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +57 -10
  8. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +40 -11
  9. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/pyproject.toml +1 -1
  10. gpu_dev-0.6.3/sdk/python/README.md +240 -0
  11. gpu_dev-0.6.3/sdk/python/examples/batch_multi_gpu.py +66 -0
  12. gpu_dev-0.6.3/sdk/python/examples/interactive_debug.py +54 -0
  13. gpu_dev-0.6.3/sdk/python/examples/quickstart.ipynb +365 -0
  14. gpu_dev-0.6.3/sdk/python/examples/run_tests.py +64 -0
  15. gpu_dev-0.6.3/sdk/python/examples/submit_job.py +38 -0
  16. gpu_dev-0.6.3/sdk/python/pyproject.toml +27 -0
  17. gpu_dev-0.6.3/sdk/python/src/gpu_dev/__init__.py +60 -0
  18. gpu_dev-0.6.3/sdk/python/src/gpu_dev/_async/__init__.py +2 -0
  19. gpu_dev-0.6.3/sdk/python/src/gpu_dev/_backend/aws.py +322 -0
  20. gpu_dev-0.6.3/sdk/python/src/gpu_dev/_backend/protocol.py +53 -0
  21. gpu_dev-0.6.3/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  22. gpu_dev-0.6.3/sdk/python/src/gpu_dev/_sync/client.py +305 -0
  23. gpu_dev-0.6.3/sdk/python/src/gpu_dev/_sync/sandbox.py +355 -0
  24. gpu_dev-0.6.3/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  25. gpu_dev-0.6.3/sdk/python/src/gpu_dev/_transport/ssh.py +121 -0
  26. gpu_dev-0.6.3/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  27. gpu_dev-0.6.3/sdk/python/src/gpu_dev/common/config.py +45 -0
  28. gpu_dev-0.6.3/sdk/python/src/gpu_dev/common/enums.py +44 -0
  29. gpu_dev-0.6.3/sdk/python/src/gpu_dev/common/errors.py +33 -0
  30. gpu_dev-0.6.3/sdk/python/src/gpu_dev/common/models.py +73 -0
  31. gpu_dev-0.6.3/sdk/python/src/gpu_dev/py.typed +0 -0
  32. gpu_dev-0.6.3/sdk/python/tests/__init__.py +0 -0
  33. gpu_dev-0.6.3/sdk/python/tests/test_models.py +69 -0
  34. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/kubernetes.tf +66 -70
  35. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/reservation_processor/index.py +181 -68
  36. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda.tf +1 -1
  37. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/mig-config.tf +2 -1
  38. gpu_dev-0.6.3/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  39. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/templates/al2023-user-data.sh +5 -1
  40. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/.github/workflows/no-gitlinks.yml +0 -0
  41. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/.github/workflows/publish.yml +0 -0
  42. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/.gitignore +0 -0
  43. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/CLAUDE.md +0 -0
  44. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/README.md +0 -0
  45. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/admin/README.md +0 -0
  46. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/admin/generate_stats.py +0 -0
  47. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/admin/requirements.txt +0 -0
  48. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  49. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  50. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  51. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  52. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  53. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  54. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  55. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  56. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  57. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  58. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  59. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  60. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/docs/USER_GUIDE.md +0 -0
  61. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/docs/devgpu-features.html +0 -0
  62. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/docs/docker-mark-blue.svg +0 -0
  63. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/docs/icons8-cursor-ai.svg +0 -0
  64. /gpu_dev-0.6.0/terraform-gpu-devservers/subnet-0fe3a2c45570091ad → /gpu_dev-0.6.3/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  65. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/setup.cfg +0 -0
  66. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  67. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  68. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/README.md +0 -0
  69. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/alb.tf +0 -0
  70. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/ami-baker.tf +0 -0
  71. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/availability.tf +0 -0
  72. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/backend.tf +0 -0
  73. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/check_b200.py +0 -0
  74. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  75. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  76. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  77. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  78. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  79. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/bash_profile +0 -0
  80. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/bashrc +0 -0
  81. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  82. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  83. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  84. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  85. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/motd_script +0 -0
  86. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  87. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/profile +0 -0
  88. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  89. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  90. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  91. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/shell_env +0 -0
  92. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/ssh_config +0 -0
  93. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/zprofile +0 -0
  94. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/zshrc +0 -0
  95. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  96. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker-build.tf +0 -0
  97. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  98. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  99. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/ecr.tf +0 -0
  100. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/efs.tf +0 -0
  101. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/eks.tf +0 -0
  102. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/expiry.tf +0 -0
  103. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/git-cache.tf +0 -0
  104. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  105. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  106. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  107. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  108. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  109. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  110. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  111. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  112. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  113. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  114. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  115. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  116. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  117. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  118. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  119. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/list_b200.py +0 -0
  120. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/main.tf +0 -0
  121. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  122. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  123. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  124. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  125. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  126. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  127. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/monitoring.tf +0 -0
  128. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  129. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/outputs.tf +0 -0
  130. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/pyproject.toml +0 -0
  131. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/queue.tf +0 -0
  132. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/route53.tf +0 -0
  133. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  134. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  135. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  136. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  137. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  138. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  139. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  140. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  141. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  142. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  143. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/switch-to.sh +0 -0
  144. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  145. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
  146. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  147. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  148. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/terraform-gpu-devservers/variables.tf +0 -0
  149. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/tests/submit/README.md +0 -0
  150. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/tests/submit/fail/run.sh +0 -0
  151. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/tests/submit/multinode/run.sh +0 -0
  152. {gpu_dev-0.6.0 → gpu_dev-0.6.3}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.6.0
3
+ Version: 0.6.3
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -16,9 +16,29 @@ Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
18
 
19
- # GPU Developer CLI
19
+ # GPU Developer CLI & SDK
20
20
 
21
- A command-line tool for reserving and managing GPU development servers on AWS EKS.
21
+ A command-line tool and Python SDK for reserving and managing GPU development servers.
22
+
23
+ ## Python SDK
24
+
25
+ For programmatic access, use the [Python SDK](../../sdk/python/README.md):
26
+
27
+ ```python
28
+ from gpu_dev import GpuDev
29
+
30
+ client = GpuDev()
31
+ sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
32
+ result = sandbox.exec("nvidia-smi")
33
+ print(result.stdout)
34
+ sandbox.cancel()
35
+ ```
36
+
37
+ Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
38
+
39
+ ---
40
+
41
+ ## CLI
22
42
 
23
43
  ## Table of Contents
24
44
 
@@ -1,6 +1,26 @@
1
- # GPU Developer CLI
1
+ # GPU Developer CLI & SDK
2
2
 
3
- A command-line tool for reserving and managing GPU development servers on AWS EKS.
3
+ A command-line tool and Python SDK for reserving and managing GPU development servers.
4
+
5
+ ## Python SDK
6
+
7
+ For programmatic access, use the [Python SDK](../../sdk/python/README.md):
8
+
9
+ ```python
10
+ from gpu_dev import GpuDev
11
+
12
+ client = GpuDev()
13
+ sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
14
+ result = sandbox.exec("nvidia-smi")
15
+ print(result.stdout)
16
+ sandbox.cancel()
17
+ ```
18
+
19
+ Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
20
+
21
+ ---
22
+
23
+ ## CLI
4
24
 
5
25
  ## Table of Contents
6
26
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.6.0
3
+ Version: 0.6.3
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -16,9 +16,29 @@ Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
18
 
19
- # GPU Developer CLI
19
+ # GPU Developer CLI & SDK
20
20
 
21
- A command-line tool for reserving and managing GPU development servers on AWS EKS.
21
+ A command-line tool and Python SDK for reserving and managing GPU development servers.
22
+
23
+ ## Python SDK
24
+
25
+ For programmatic access, use the [Python SDK](../../sdk/python/README.md):
26
+
27
+ ```python
28
+ from gpu_dev import GpuDev
29
+
30
+ client = GpuDev()
31
+ sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
32
+ result = sandbox.exec("nvidia-smi")
33
+ print(result.stdout)
34
+ sandbox.cancel()
35
+ ```
36
+
37
+ Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
38
+
39
+ ---
40
+
41
+ ## CLI
22
42
 
23
43
  ## Table of Contents
24
44
 
@@ -30,6 +30,31 @@ docs/USER_GUIDE.md
30
30
  docs/devgpu-features.html
31
31
  docs/docker-mark-blue.svg
32
32
  docs/icons8-cursor-ai.svg
33
+ sdk/python/README.md
34
+ sdk/python/pyproject.toml
35
+ sdk/python/examples/batch_multi_gpu.py
36
+ sdk/python/examples/interactive_debug.py
37
+ sdk/python/examples/quickstart.ipynb
38
+ sdk/python/examples/run_tests.py
39
+ sdk/python/examples/submit_job.py
40
+ sdk/python/src/gpu_dev/__init__.py
41
+ sdk/python/src/gpu_dev/py.typed
42
+ sdk/python/src/gpu_dev/_async/__init__.py
43
+ sdk/python/src/gpu_dev/_backend/__init__.py
44
+ sdk/python/src/gpu_dev/_backend/aws.py
45
+ sdk/python/src/gpu_dev/_backend/protocol.py
46
+ sdk/python/src/gpu_dev/_sync/__init__.py
47
+ sdk/python/src/gpu_dev/_sync/client.py
48
+ sdk/python/src/gpu_dev/_sync/sandbox.py
49
+ sdk/python/src/gpu_dev/_transport/__init__.py
50
+ sdk/python/src/gpu_dev/_transport/ssh.py
51
+ sdk/python/src/gpu_dev/common/__init__.py
52
+ sdk/python/src/gpu_dev/common/config.py
53
+ sdk/python/src/gpu_dev/common/enums.py
54
+ sdk/python/src/gpu_dev/common/errors.py
55
+ sdk/python/src/gpu_dev/common/models.py
56
+ sdk/python/tests/__init__.py
57
+ sdk/python/tests/test_models.py
33
58
  terraform-gpu-devservers/.terraform.lock.hcl
34
59
  terraform-gpu-devservers/README.md
35
60
  terraform-gpu-devservers/alb.tf
@@ -13,7 +13,7 @@ from rich.spinner import Spinner
13
13
  # SSH validation result is cached locally for 24h. New keys pushed to GitHub still take effect
14
14
  # at reservation time (pods fetch live keys via init container) — caching only skips the
15
15
  # pre-flight "are you who you say you are" check.
16
- _SSH_CACHE_TTL_SECONDS = 24 * 60 * 60
16
+ _SSH_CACHE_TTL_SECONDS = 14 * 24 * 60 * 60
17
17
  _SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cache.json"))
18
18
 
19
19
  # Cache for authenticate_user. STS GetCallerIdentity is stable per AWS profile and slow under SSO
@@ -41,33 +41,51 @@ from .interactive import (
41
41
  console = Console()
42
42
 
43
43
 
44
+ _east1_table = None
45
+
44
46
  def _fetch_reservations_cross_region(reservation_mgr, user_filter, statuses, config=None):
45
47
  """Fetch reservations from current region + prod-east1 if on prod."""
46
- reservations = reservation_mgr.list_reservations(
47
- user_filter=user_filter, statuses_to_include=statuses)
48
- # Cross-region fetch
49
- try:
48
+ global _east1_table
49
+
50
+ from concurrent.futures import ThreadPoolExecutor
51
+
52
+ def _fetch_primary():
53
+ return reservation_mgr.list_reservations(
54
+ user_filter=user_filter, statuses_to_include=statuses)
55
+
56
+ def _fetch_east1():
57
+ global _east1_table
50
58
  cfg = config or load_config()
51
- if cfg.user_config.get("environment") == "prod":
52
- east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
53
- if east1_env:
54
- import boto3 as _b3
55
- east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
56
- east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
57
- for st in (statuses or ["active"]):
58
- resp = east1_table.query(
59
- IndexName="StatusIndex",
60
- KeyConditionExpression="#s = :status",
61
- ExpressionAttributeNames={"#s": "status"},
62
- ExpressionAttributeValues={":status": st},
63
- )
64
- for item in resp.get("Items", []):
65
- if user_filter and item.get("user_id") != user_filter:
66
- continue
67
- item["_region"] = "us-east-1"
68
- reservations.append(item)
59
+ if cfg.user_config.get("environment") != "prod":
60
+ return []
61
+ east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
62
+ if not east1_env or not user_filter:
63
+ return []
64
+ if _east1_table is None:
65
+ _east1_table = cfg.session.resource(
66
+ "dynamodb", region_name=east1_env["region"]
67
+ ).Table("pytorch-gpu-dev-reservations")
68
+ results = []
69
+ for st in (statuses or ["active"]):
70
+ resp = _east1_table.query(
71
+ IndexName="UserStatusIndex",
72
+ KeyConditionExpression="user_id = :uid AND #s = :status",
73
+ ExpressionAttributeNames={"#s": "status"},
74
+ ExpressionAttributeValues={":uid": user_filter, ":status": st},
75
+ )
76
+ for item in resp.get("Items", []):
77
+ item["_region"] = "us-east-1"
78
+ results.append(item)
79
+ return results
80
+
81
+ try:
82
+ with ThreadPoolExecutor(max_workers=2) as ex:
83
+ f1 = ex.submit(_fetch_primary)
84
+ f2 = ex.submit(_fetch_east1)
85
+ reservations = f1.result()
86
+ reservations.extend(f2.result())
69
87
  except Exception:
70
- pass
88
+ reservations = _fetch_primary()
71
89
  return reservations
72
90
 
73
91
 
@@ -608,6 +626,8 @@ def main(ctx: click.Context) -> None:
608
626
  )
609
627
  @click.option("--spot", is_flag=True, default=False,
610
628
  help="Acknowledge spot instance (~1/3 cost, may be preempted with 2-min notice). Required for spot-only types.")
629
+ @click.option("--fast-cache", is_flag=True, default=False, hidden=True,
630
+ help="Use NVMe local cache for faster session restore (experimental).")
611
631
  @click.pass_context
612
632
  def reserve(
613
633
  ctx: click.Context,
@@ -629,6 +649,7 @@ def reserve(
629
649
  disk: Optional[str],
630
650
  node_label: tuple,
631
651
  spot: bool = False,
652
+ fast_cache: bool = False,
632
653
  ) -> None:
633
654
  """Reserve GPU development server(s)
634
655
 
@@ -746,7 +767,10 @@ def reserve(
746
767
  else:
747
768
  f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
748
769
  ssh_result = None
749
- f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
770
+ # Only fetch availability if we need the interactive picker
771
+ need_interactive = gpu_type is None
772
+ if need_interactive:
773
+ f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
750
774
 
751
775
  # Surface auth failure first (most actionable).
752
776
  try:
@@ -758,7 +782,7 @@ def reserve(
758
782
 
759
783
  if ssh_result is None:
760
784
  ssh_result = f_ssh.result()
761
- availability_info = f_avail.result()
785
+ availability_info = f_avail.result() if need_interactive else None
762
786
 
763
787
  # Surface SSH validation failure with the same UX as before.
764
788
  if not ssh_result.get("valid"):
@@ -1108,11 +1132,13 @@ def reserve(
1108
1132
  rprint(f"[red]❌ {str(e)}[/red]")
1109
1133
  return
1110
1134
 
1111
- # Validate SSH key matches configured GitHub username
1112
- live.update(Spinner("dots", text="🔐 Validating SSH key..."))
1135
+ # Validate SSH key matches configured GitHub username (cached, ~0ms)
1113
1136
  if not _validate_ssh_key_or_exit(config, live):
1114
1137
  return
1115
1138
 
1139
+ live.update(Spinner("dots", text="📡 Preparing reservation..."))
1140
+ reservation_mgr = ReservationManager(config)
1141
+
1116
1142
  # Track if user explicitly requests no persistent disk
1117
1143
  explicit_no_disk = explicit_no_disk_from_param
1118
1144
 
@@ -1166,8 +1192,10 @@ def reserve(
1166
1192
  # Build choices
1167
1193
  choices = []
1168
1194
 
1169
- # Get available disks (exclude in-use and deleted disks)
1170
- available_disks = [d for d in existing_disks if not d['in_use'] and not d.get('is_deleted', False)]
1195
+ # Show all non-deleted disks, marking in-use ones as disabled
1196
+ all_disks = [d for d in existing_disks if not d.get('is_deleted', False)]
1197
+ available_disks = [d for d in all_disks if not d['in_use']]
1198
+ in_use_disks = [d for d in all_disks if d['in_use']]
1171
1199
 
1172
1200
  if available_disks:
1173
1201
  choices.append(questionary.Separator("=== Available Disks ==="))
@@ -1178,6 +1206,17 @@ def reserve(
1178
1206
  value=("select", d['name'])
1179
1207
  ))
1180
1208
 
1209
+ if in_use_disks:
1210
+ choices.append(questionary.Separator("=== In Use ==="))
1211
+ for d in in_use_disks:
1212
+ res_id = d.get('reservation_id', '?')[:8]
1213
+ display = f"{d['name']} ({d['size_gb']}GB) — in use by {res_id}"
1214
+ choices.append(questionary.Choice(
1215
+ title=display,
1216
+ value=("in_use", d['name']),
1217
+ disabled="currently in use",
1218
+ ))
1219
+
1181
1220
  choices.append(questionary.Separator("=== Options ==="))
1182
1221
  choices.append(questionary.Choice(
1183
1222
  title="Create a new disk",
@@ -1224,11 +1263,6 @@ def reserve(
1224
1263
  rprint(f"[yellow]Use a different disk or wait for the reservation to end[/yellow]")
1225
1264
  return
1226
1265
 
1227
- live.update(
1228
- Spinner("dots", text="📡 Setting up reservation manager...")
1229
- )
1230
- reservation_mgr = ReservationManager(config)
1231
-
1232
1266
  # Submit reservation request
1233
1267
  live.update(
1234
1268
  Spinner("dots", text="📡 Submitting reservation request...")
@@ -1364,6 +1398,7 @@ def reserve(
1364
1398
  spot=spot,
1365
1399
  node_labels=node_labels if node_labels else None,
1366
1400
  trace=trace,
1401
+ fast_cache=fast_cache,
1367
1402
  )
1368
1403
  reservation_ids = [reservation_id] if reservation_id else None
1369
1404
 
@@ -2887,36 +2922,42 @@ def _show_availability() -> None:
2887
2922
  ) as live:
2888
2923
  config = load_config()
2889
2924
 
2890
- # Authenticate using AWS credentials
2925
+ # Authenticate and fetch availability (both regions in parallel)
2891
2926
  try:
2892
2927
  user_info = authenticate_user(config)
2893
2928
  reservation_mgr = ReservationManager(config)
2894
- availability_info = reservation_mgr.get_gpu_availability_by_type()
2929
+
2930
+ from concurrent.futures import ThreadPoolExecutor
2931
+ _env_name = config.user_config.get("environment", "prod")
2932
+ _east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
2933
+
2934
+ def _fetch_east1_spot():
2935
+ if _env_name != "prod" or not _east1_spot_types:
2936
+ return {}
2937
+ east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
2938
+ east1_table = config.session.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability")
2939
+ result = {}
2940
+ for item in east1_table.scan().get("Items", []):
2941
+ gt = item.get("gpu_type", "")
2942
+ if gt in _east1_spot_types:
2943
+ result[gt] = {
2944
+ "available": int(item.get("available_gpus", 0)),
2945
+ "total": int(item.get("total_gpus", 0)),
2946
+ "max_reservable": int(item.get("max_reservable", 0)),
2947
+ "spot_info": item.get("spot_info", {}),
2948
+ }
2949
+ return result
2950
+
2951
+ with ThreadPoolExecutor(max_workers=2) as ex:
2952
+ f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
2953
+ f_spot = ex.submit(_fetch_east1_spot)
2954
+ availability_info = f_avail.result()
2955
+ spot_region_info = f_spot.result()
2895
2956
  except RuntimeError as e:
2896
2957
  live.stop()
2897
2958
  rprint(f"[red]❌ {str(e)}[/red]")
2898
2959
  return
2899
2960
 
2900
- # Cross-region: fetch spot availability from prod-east1
2901
- spot_region_info = {}
2902
- _env_name = config.user_config.get("environment", "prod")
2903
- _east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
2904
- if _env_name == "prod" and _east1_spot_types:
2905
- try:
2906
- import boto3 as _b3
2907
- east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
2908
- for item in _b3.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability").scan().get("Items", []):
2909
- gt = item.get("gpu_type", "")
2910
- if gt in _east1_spot_types:
2911
- spot_region_info[gt] = {
2912
- "available": int(item.get("available_gpus", 0)),
2913
- "total": int(item.get("total_gpus", 0)),
2914
- "max_reservable": int(item.get("max_reservable", 0)),
2915
- "spot_info": item.get("spot_info", {}),
2916
- }
2917
- except Exception:
2918
- pass
2919
-
2920
2961
  if availability_info:
2921
2962
  # GPU architecture mapping (for display)
2922
2963
  gpu_architectures = {
@@ -3273,8 +3314,28 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3273
3314
  For VS Code Remote or manual SSH, use 'gpu-dev show' to see full SSH command.
3274
3315
  """
3275
3316
  import subprocess
3317
+ from pathlib import Path
3276
3318
 
3277
3319
  try:
3320
+ # Fast path: if reservation ID given, check local SSH config first (no network)
3321
+ if reservation_id:
3322
+ ssh_config_dir = Path.home() / ".gpu-dev"
3323
+ config_file = ssh_config_dir / f"{reservation_id[:8]}-sshconfig"
3324
+ if config_file.exists():
3325
+ config_text = config_file.read_text()
3326
+ fqdn_line = [l.strip() for l in config_text.splitlines() if l.strip().startswith("HostName")]
3327
+ if fqdn_line:
3328
+ fqdn = fqdn_line[0].split(None, 1)[1]
3329
+ pod_name = f"gpu-dev-{reservation_id[:8]}"
3330
+ rprint(f"[cyan]Connecting to {pod_name}...[/cyan]\n")
3331
+ import subprocess, sys
3332
+ sys.exit(subprocess.call([
3333
+ "ssh", "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null",
3334
+ "-o", "ProxyCommand=gpu-dev-ssh-proxy %h %p",
3335
+ "-o", "ForwardAgent=yes",
3336
+ f"dev@{fqdn}",
3337
+ ]))
3338
+
3278
3339
  with Live(
3279
3340
  Spinner("dots", text="📡 Fetching reservation details..."), console=console
3280
3341
  ) as live:
@@ -3504,7 +3565,9 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3504
3565
  except KeyboardInterrupt:
3505
3566
  rprint("\n[yellow]Connection cancelled by user[/yellow]")
3506
3567
  except Exception as e:
3568
+ import traceback
3507
3569
  rprint(f"[red]❌ Error: {str(e)}[/red]")
3570
+ traceback.print_exc()
3508
3571
 
3509
3572
 
3510
3573
  @main.command(name="get-ssh-config")
@@ -3,6 +3,7 @@
3
3
  import os
4
4
  import json
5
5
  import boto3
6
+ import botocore.exceptions
6
7
  from pathlib import Path
7
8
  from typing import Dict, Any, Optional
8
9
 
@@ -72,17 +73,63 @@ class Config:
72
73
  self._sqs_client = None
73
74
  self._dynamodb = None
74
75
 
76
+ _CRED_CACHE = Path.home() / ".config" / "gpu-dev" / "aws-cred-cache.json"
77
+
75
78
  def _create_aws_session(self):
76
- """Create AWS session with profile support"""
77
- available_profiles = boto3.Session().available_profiles
78
- if "gpu-dev" in available_profiles:
79
- try:
80
- session = boto3.Session(profile_name="gpu-dev")
81
- session.get_credentials()
82
- return session
83
- except Exception:
84
- pass
85
- return boto3.Session()
79
+ """Create AWS session, caching resolved credentials to skip SSO resolution (~900ms)."""
80
+ import time as _time
81
+
82
+ # Try cached credentials first (avoids 900ms SSO resolution)
83
+ try:
84
+ if self._CRED_CACHE.exists():
85
+ cached = json.loads(self._CRED_CACHE.read_text())
86
+ if _time.time() < cached.get("expires", 0):
87
+ return boto3.Session(
88
+ aws_access_key_id=cached["access_key"],
89
+ aws_secret_access_key=cached["secret_key"],
90
+ aws_session_token=cached["token"],
91
+ region_name=self.aws_region,
92
+ )
93
+ except Exception:
94
+ pass
95
+
96
+ # Resolve credentials from SSO/profile (slow path, ~900ms)
97
+ try:
98
+ session = boto3.Session(profile_name="gpu-dev")
99
+ creds = session.get_credentials()
100
+ if not creds:
101
+ raise Exception("no credentials")
102
+ except Exception:
103
+ session = boto3.Session()
104
+ creds = session.get_credentials()
105
+
106
+ # Cache resolved credentials (safe — they're short-lived STS tokens)
107
+ try:
108
+ frozen = creds.get_frozen_credentials()
109
+ if frozen.token:
110
+ self._CRED_CACHE.parent.mkdir(parents=True, exist_ok=True)
111
+ self._CRED_CACHE.write_text(json.dumps({
112
+ "access_key": frozen.access_key,
113
+ "secret_key": frozen.secret_key,
114
+ "token": frozen.token,
115
+ "expires": _time.time() + 2700, # cache 45min (SSO tokens last ~1h)
116
+ }))
117
+ self._CRED_CACHE.chmod(0o600)
118
+ except Exception:
119
+ pass
120
+
121
+ return session
122
+
123
+ def refresh_session(self):
124
+ """Clear cached credentials and re-resolve. Called on ExpiredTokenException."""
125
+ try:
126
+ self._CRED_CACHE.unlink(missing_ok=True)
127
+ except Exception:
128
+ pass
129
+ self.session = self._create_aws_session()
130
+ self._sts_client = None
131
+ self._sqs_client = None
132
+ self._dynamodb = None
86
133
 
87
134
  @property
88
135
  def sts_client(self):
@@ -23,6 +23,8 @@ from .name_generator import sanitize_name
23
23
  def _spot_stage_number(status: str) -> tuple:
24
24
  """Map a spot provisioning status message to a numbered step (N, total)."""
25
25
  s = status.lower()
26
+ if "no spot capacity" in s or "no capacity" in s:
27
+ return 1, 7 # stuck at step 1, but message itself says why
26
28
  if "requested" in s or "waiting for aws" in s or "allocate capacity" in s:
27
29
  return 1, 7
28
30
  if "allocated" in s or "launching" in s or "booting" in s:
@@ -424,6 +426,18 @@ class ReservationManager:
424
426
  self.reservations_table = config.dynamodb.Table(
425
427
  config.reservations_table)
426
428
 
429
+ def _retry_on_expired(self, fn):
430
+ """Call fn, auto-refresh credentials on ExpiredTokenException."""
431
+ try:
432
+ return fn()
433
+ except Exception as e:
434
+ if "ExpiredToken" in str(type(e).__name__) or "expired" in str(e).lower():
435
+ self.config.refresh_session()
436
+ self.reservations_table = self.config.dynamodb.Table(
437
+ self.config.reservations_table)
438
+ return fn()
439
+ raise
440
+
427
441
  def create_reservation(
428
442
  self,
429
443
  user_id: str,
@@ -442,6 +456,7 @@ class ReservationManager:
442
456
  node_labels: Optional[Dict[str, str]] = None,
443
457
  trace: bool = False,
444
458
  spot: bool = False,
459
+ fast_cache: bool = False,
445
460
  ) -> Optional[str]:
446
461
  """Create a new GPU reservation"""
447
462
  try:
@@ -524,6 +539,9 @@ class ReservationManager:
524
539
  if spot:
525
540
  message["spot"] = True
526
541
 
542
+ if fast_cache:
543
+ message["fast_cache"] = True
544
+
527
545
  # Add trace flag and CLI start timestamp
528
546
  if trace:
529
547
  message["trace"] = True
@@ -801,20 +819,21 @@ class ReservationManager:
801
819
  For multi-node reservations, returns info for all nodes in the group.
802
820
  """
803
821
  try:
804
- # Query by user first (efficient), then filter by reservation_id prefix
822
+ # Short ID prefix query UserIndex with server-side filter
805
823
  response = self.reservations_table.query(
806
824
  IndexName="UserIndex",
807
825
  KeyConditionExpression="user_id = :user_id",
808
- ExpressionAttributeValues={":user_id": user_id},
826
+ FilterExpression="begins_with(reservation_id, :rid)",
827
+ ExpressionAttributeValues={":user_id": user_id, ":rid": reservation_id},
809
828
  )
810
829
  all_reservations = response.get("Items", [])
811
830
 
812
- # Handle pagination for UserIndex query
813
831
  while "LastEvaluatedKey" in response:
814
832
  response = self.reservations_table.query(
815
833
  IndexName="UserIndex",
816
834
  KeyConditionExpression="user_id = :user_id",
817
- ExpressionAttributeValues={":user_id": user_id},
835
+ FilterExpression="begins_with(reservation_id, :rid)",
836
+ ExpressionAttributeValues={":user_id": user_id, ":rid": reservation_id},
818
837
  ExclusiveStartKey=response["LastEvaluatedKey"]
819
838
  )
820
839
  all_reservations.extend(response.get("Items", []))
@@ -1078,9 +1097,16 @@ class ReservationManager:
1078
1097
  )
1079
1098
  all_items.extend(response.get("Items", []))
1080
1099
 
1100
+ # Fetch queue lengths for all GPU types in parallel
1101
+ from concurrent.futures import ThreadPoolExecutor
1102
+ gpu_types_list = [item["gpu_type"] for item in all_items]
1103
+ with ThreadPoolExecutor(max_workers=10) as ex:
1104
+ queue_futures = {gt: ex.submit(self._get_queue_length_for_gpu_type, gt) for gt in gpu_types_list}
1105
+ queue_lengths = {gt: f.result() for gt, f in queue_futures.items()}
1106
+
1081
1107
  for item in all_items:
1082
1108
  gpu_type = item["gpu_type"]
1083
- queue_length = self._get_queue_length_for_gpu_type(gpu_type)
1109
+ queue_length = queue_lengths.get(gpu_type, 0)
1084
1110
  estimated_wait = queue_length * 15 if queue_length > 0 else 0
1085
1111
 
1086
1112
  # size_etas is a DDB Map of {size_str: epoch_seconds (Decimal)} — pass through
@@ -1210,7 +1236,6 @@ class ReservationManager:
1210
1236
  try:
1211
1237
  total_count = 0
1212
1238
 
1213
- # Count queued reservations for this GPU type
1214
1239
  for status in ["queued", "pending"]:
1215
1240
  try:
1216
1241
  response = self.reservations_table.query(
@@ -1221,10 +1246,10 @@ class ReservationManager:
1221
1246
  ":status": status,
1222
1247
  ":gpu_type": gpu_type,
1223
1248
  },
1249
+ Select="COUNT",
1224
1250
  )
1225
- total_count += len(response.get("Items", []))
1251
+ total_count += response.get("Count", 0)
1226
1252
 
1227
- # Handle pagination for StatusGpuTypeIndex query
1228
1253
  while "LastEvaluatedKey" in response:
1229
1254
  response = self.reservations_table.query(
1230
1255
  IndexName="StatusGpuTypeIndex",
@@ -1234,9 +1259,10 @@ class ReservationManager:
1234
1259
  ":status": status,
1235
1260
  ":gpu_type": gpu_type,
1236
1261
  },
1262
+ Select="COUNT",
1237
1263
  ExclusiveStartKey=response["LastEvaluatedKey"]
1238
1264
  )
1239
- total_count += len(response.get("Items", []))
1265
+ total_count += response.get("Count", 0)
1240
1266
  except Exception as query_error:
1241
1267
  # Fallback to scanning if the composite index doesn't exist yet
1242
1268
  console.print(
@@ -1904,9 +1930,12 @@ class ReservationManager:
1904
1930
  detailed = first_queued.get("current_detailed_status", "")
1905
1931
  # Spot stages come through current_detailed_status — show as
1906
1932
  # numbered steps so users see progress and don't give up.
1907
- if detailed and ("spot" in detailed.lower() or "node" in detailed.lower() or "instance" in detailed.lower()):
1933
+ if detailed and ("spot" in detailed.lower() or "node" in detailed.lower() or "instance" in detailed.lower() or "capacity" in detailed.lower()):
1908
1934
  step, total = _spot_stage_number(detailed)
1909
- message = f" Step {step}/{total}: {detailed}"
1935
+ if "no spot capacity" in detailed.lower() or "no capacity" in detailed.lower():
1936
+ message = f"⚠️ {detailed}"
1937
+ else:
1938
+ message = f"⏳ Step {step}/{total}: {detailed}"
1910
1939
  elif is_multinode:
1911
1940
  total_gpus = sum(
1912
1941
  node["gpu_count"] for node in node_details if node["reservation"])
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.6.0"
7
+ version = "0.6.3"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"