gpu-dev 0.7.3__tar.gz → 0.7.5__tar.gz

This diff compares the contents of two publicly released versions of the package, exactly as they appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (177)
  1. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/CLAUDE.md +1 -1
  2. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/PKG-INFO +1 -1
  3. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +18 -0
  4. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +72 -49
  5. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/gpu_dev.egg-info/PKG-INFO +1 -1
  6. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/pyproject.toml +1 -1
  7. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/__init__.py +1 -1
  8. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/reservation_processor/index.py +20 -1
  9. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/.github/workflows/no-gitlinks.yml +0 -0
  10. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/.github/workflows/publish.yml +0 -0
  11. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/.gitignore +0 -0
  12. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/README.md +0 -0
  13. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/admin/README.md +0 -0
  14. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/admin/generate_stats.py +0 -0
  15. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/admin/requirements.txt +0 -0
  16. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/README.md +0 -0
  17. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  18. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  19. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  20. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  21. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  22. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  23. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  24. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  25. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  26. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  27. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/docs/SDK_REPRO.md +0 -0
  28. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/docs/USER_GUIDE.md +0 -0
  29. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/docs/devgpu-features.html +0 -0
  30. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/docs/docker-mark-blue.svg +0 -0
  31. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/docs/icons8-cursor-ai.svg +0 -0
  32. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/gpu_dev.egg-info/SOURCES.txt +0 -0
  33. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/gpu_dev.egg-info/dependency_links.txt +0 -0
  34. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/gpu_dev.egg-info/entry_points.txt +0 -0
  35. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/gpu_dev.egg-info/requires.txt +0 -0
  36. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/gpu_dev.egg-info/top_level.txt +0 -0
  37. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/architecture.html +0 -0
  38. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/cli-demo.html +0 -0
  39. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/devgpu-features.html +0 -0
  40. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/docker-mark-blue.svg +0 -0
  41. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/feedback.png +0 -0
  42. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/gpu-fleet.html +0 -0
  43. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/icons8-cursor-ai.svg +0 -0
  44. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/index.html +0 -0
  45. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/k8s-under-the-hood.html +0 -0
  46. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/multinode.html +0 -0
  47. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/osdc-future-plans.html +0 -0
  48. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/problem.png +0 -0
  49. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/sandbox.html +0 -0
  50. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/sdk-demo.html +0 -0
  51. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/thesis.html +0 -0
  52. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/title-vid.mp4 +0 -0
  53. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/weneedgpus.png +0 -0
  54. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/presentation/wow.html +0 -0
  55. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/README.md +0 -0
  56. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/examples/batch_multi_gpu.py +0 -0
  57. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/examples/interactive_debug.py +0 -0
  58. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/examples/parallel_experiments.ipynb +0 -0
  59. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/examples/quickstart.ipynb +0 -0
  60. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/examples/run_tests.py +0 -0
  61. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/examples/submit_job.py +0 -0
  62. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
  63. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  64. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/_backend/aws.py +0 -0
  65. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
  66. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  67. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
  68. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
  69. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  70. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
  71. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  72. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/common/config.py +0 -0
  73. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/common/enums.py +0 -0
  74. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/common/errors.py +0 -0
  75. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/common/models.py +0 -0
  76. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/src/gpu_dev/py.typed +0 -0
  77. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/tests/__init__.py +0 -0
  78. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/sdk/python/tests/test_models.py +0 -0
  79. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/setup.cfg +0 -0
  80. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-deck/backend.tf +0 -0
  81. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-deck/main.tf +0 -0
  82. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-deck/terraform.tfvars.example +0 -0
  83. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  84. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  85. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/README.md +0 -0
  86. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/alb.tf +0 -0
  87. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/ami-baker.tf +0 -0
  88. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/availability.tf +0 -0
  89. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/backend.tf +0 -0
  90. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/build-node.tf +0 -0
  91. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/check_b200.py +0 -0
  92. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  93. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  94. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  95. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  96. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  97. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/bash_profile +0 -0
  98. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/bashrc +0 -0
  99. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  100. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  101. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  102. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  103. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/motd_script +0 -0
  104. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  105. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/profile +0 -0
  106. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  107. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  108. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  109. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/shell_env +0 -0
  110. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/ssh_config +0 -0
  111. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/zprofile +0 -0
  112. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/zshrc +0 -0
  113. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  114. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker-build.tf +0 -0
  115. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  116. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  117. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/ecr.tf +0 -0
  118. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/efs.tf +0 -0
  119. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/eks.tf +0 -0
  120. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/expiry.tf +0 -0
  121. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/git-cache.tf +0 -0
  122. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  123. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/kubernetes.tf +0 -0
  124. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  125. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  126. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  127. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  128. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  129. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  130. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  131. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  132. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  133. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  134. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  135. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  136. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  137. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  138. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/lambda.tf +0 -0
  139. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/list_b200.py +0 -0
  140. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/main.tf +0 -0
  141. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/mig-config.tf +0 -0
  142. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  143. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  144. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  145. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  146. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  147. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  148. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/monitoring.tf +0 -0
  149. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  150. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/outputs.tf +0 -0
  151. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/pyproject.toml +0 -0
  152. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/pytorch-prebuild.tf +0 -0
  153. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/queue.tf +0 -0
  154. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/route53.tf +0 -0
  155. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  156. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  157. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  158. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  159. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  160. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  161. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  162. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  163. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  164. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  165. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  166. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/switch-to.sh +0 -0
  167. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  168. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  169. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
  170. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  171. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  172. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/variables.tf +0 -0
  173. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/terraform-gpu-devservers/warm-pool.tf +0 -0
  174. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/tests/submit/README.md +0 -0
  175. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/tests/submit/fail/run.sh +0 -0
  176. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/tests/submit/multinode/run.sh +0 -0
  177. {gpu_dev-0.7.3 → gpu_dev-0.7.5}/tests/submit/success/run.sh +0 -0
@@ -79,7 +79,7 @@ Big push on warm pools + instant claims + prebuilt pytorch. Tracking state here
79
79
  - [ ] **Prebuilt built WITHOUT cuDNN** — `import torch` warns "compiled without cuDNN/MIOpen". CI/nightly build with cudnn9. Add libcudnn to the gpu-dev image + `USE_CUDNN=1` to the build recipe for fidelity (conv/cudnn-dependent ops + tests). Irrelevant for flex-attention int64 test; matters generally.
80
80
  - [ ] **`--ref pr/N` uses `pull/N/head`, not `/merge`** — `/head` is the PR author's raw branch tip (often based on old trunk, missing trunk-added tests); CI tests `/merge` (PR merged onto current trunk). For CI-repro fidelity, `pr/N` should fetch `pull/N/merge` (fall back to `/head` if no merge ref). `stage-pytorch` REF case in `index.py`. (This is why `pull/185479/head` lacked `test_large_kv_int64_pointer_math_cuda`.)
81
81
  - [ ] **Misleading disconnect/expiry message** — on `gpu-dev connect` connection loss OR reservation expiry, the CLI prints "❌ Authentication failed. You don't have SSH access... ask the primary user to add you" even for the PRIMARY user's own expired/cancelled reservation. Distinguish: (a) reservation expired -> "Reservation <id> expired at <time>"; (b) cancelled -> "Reservation was cancelled"; (c) connection dropped but still active -> "Connection lost, reconnect with gpu-dev connect <id>"; (d) genuine auth failure -> the current add-user message. Check reservation status before assuming auth failure.
82
- - [ ] **`gpu-dev cancel` from inside the pod** — show "Shutting down this reservation..." (graceful message) instead of an abrupt SSH drop, so the user knows the disconnect was intentional.
82
+ - [x] **`gpu-dev cancel` from inside the pod** (DONE, 0.7.5) two bugs: (1) cancel inside a **warm-claimed** pod failed with "GitHub username not configured" because the warm pod was pre-booted with `user_id="warm"` and the claim never stamped the real identity → `GPU_DEV_USER_ID/GPU_DEV_GITHUB_USER/AWS_ROLE_SESSION_NAME` stayed `"warm"`/empty. Fix: `try_claim_warm_pod` now seds the real `user_id`/`github_user` into both `.bashrc_ext`/`.zshrc_ext` + writes `GPU_DEV_RESERVATION_ID` (full id). Cold `_ext` derives `GPU_DEV_RESERVATION_ID` from the hostname (8-char prefix; cancellation resolves by prefix). (2) `gpu-dev cancel` (no id) inside a pod now fast-paths: cancels THIS reservation directly via `GPU_DEV_RESERVATION_ID`+`GPU_DEV_USER_ID` (no github_user/interactive) with the graceful "🛑 Shutting down..." message. Needs `tf apply` (lambda) + image rebuild (CLI in pods).
83
83
  - [ ] SSH CA certs to drop the ~0.33s `kubectl exec` key injection on warm claim (auth-model change).
84
84
  - [ ] AMI baker re-bakes on every base-EKS-AMI roll (5 baked AMIs in 2 days): pin the base AMI version + clean up old `gpu-dev-baked-*`.
85
85
  - [ ] **Warm pods: gate `warm-state=ready` on staging completion** (NOW MORE IMPORTANT — the built tree is ~30GB, and on GPU nodes it's a `cp` not reflink, so staging takes ~1-3min; a claim in that window hands over a half-copied tree). Two options: (a) claim-time check — exec `[ -f /home/dev/.pytorch-staging ]` in `try_claim_warm_pod`, skip pods still staging (simple, but adds ~0.5s exec to every warm claim); (b) label-flip — create with `warm-state=provisioning`, reconciler exec-checks staging + flips to `ready` (no claim latency, but 4 interacting changes: create label + reconciler flip + eviction must also target `provisioning` + claim already filters `ready`). Prefer (b). Marker: `.pytorch-staging` present during, removed when done; `.pytorch-ready` written at end.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.7.3
3
+ Version: 0.7.5
4
4
  Summary: CLI + Python SDK for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -2668,6 +2668,24 @@ def cancel(
2668
2668
  rprint("[red]❌ Cannot specify both --all and a reservation ID[/red]")
2669
2669
  return
2670
2670
 
2671
+ # Inside a gpu-dev pod, `gpu-dev cancel` (no id) shuts down THIS reservation
2672
+ # directly. The pod knows its own reservation (GPU_DEV_RESERVATION_ID) and
2673
+ # owner (GPU_DEV_USER_ID), so we skip the github_user / interactive list —
2674
+ # which can't work in a pod that has no `gpu-dev config set github_user`.
2675
+ pod_rid = os.environ.get("GPU_DEV_RESERVATION_ID", "").strip()
2676
+ pod_uid = os.environ.get("GPU_DEV_USER_ID", "").strip()
2677
+ if pod_rid and pod_uid and pod_uid != "warm" and not reservation_id and not all:
2678
+ rprint("[yellow]🛑 Shutting down this reservation — if you're connected to this pod, your session will close shortly.[/yellow]")
2679
+ try:
2680
+ reservation_mgr = ReservationManager(load_config())
2681
+ ok = reservation_mgr.cancel_reservation(pod_rid, pod_uid)
2682
+ except Exception as e:
2683
+ ok = False
2684
+ rprint(f"[red]❌ Could not cancel from inside the pod: {e}[/red]")
2685
+ if not ok:
2686
+ rprint(f"[dim]If that didn't work, cancel from your laptop: gpu-dev cancel {pod_rid[:8]}[/dim]")
2687
+ return
2688
+
2671
2689
  # Handle --all flag (non-interactive)
2672
2690
  if all:
2673
2691
  with Live(
@@ -159,37 +159,51 @@ def select_gpu_type_interactive(
159
159
  parts.append(f"{a}×{cgt.rsplit('-', 1)[-1].upper()}")
160
160
  return parts, tot_a, tot_c
161
161
 
162
- # ── The selectable list IS the table ──────────────────────────────────────
163
- # questionary indents Separators and Choices identically, so a Separator
164
- # header + aligned column text line up with the selectable rows. Arrow keys
165
- # move through the table; Enter picks the highlighted row. No separate print.
166
- def _row_cells(gt, info, is_spot=False):
162
+ # ── The selectable list IS the table (boxed + colored) ────────────────────
163
+ # Box-drawing borders are non-selectable Separators; each GPU is a Choice whose
164
+ # title is FormattedText so cells are individually colored. questionary indents
165
+ # Separators and Choices identically, so the borders line up with the rows.
166
+ # Arrow keys move through the table; the » pointer marks the row, Enter picks it.
167
+ # Emoji are kept OUT of cells (double-width → would ragged the right border);
168
+ # status is conveyed with color instead.
169
+ G, R, Y, BL, CY, MG, DIM = (
170
+ "fg:ansigreen", "fg:ansired", "fg:ansiyellow",
171
+ "fg:ansiblue", "fg:ansicyan bold", "fg:ansimagenta", "fg:#808080")
172
+
173
+ def _status(info):
167
174
  avail = int(info.get("available", 0))
168
- wd, emoji = _format_wait(avail, info.get("estimated_wait_minutes", 0))
175
+ est = info.get("estimated_wait_minutes", 0)
169
176
  ql = int(info.get("queue_length", 0))
177
+ if avail > 0:
178
+ text, style = "available now", G
179
+ elif est:
180
+ text, style = _format_wait(avail, est)[0], Y
181
+ else:
182
+ text, style = "queued", Y
170
183
  if ql > 0:
171
- wd += f" · {ql} queued"
172
- typ = f"{gt.upper()} *" if is_spot else gt.upper()
173
- return [typ, str(avail), str(int(info.get("max_reservable", 0))),
174
- str(int(info.get("total", 0)))], f"{emoji} {wd}"
184
+ text += f" · {ql} queued"
185
+ return text, style
175
186
 
176
- # Rows: (cells[type, avail, maxres, total], status, value, kind).
177
- data_rows = []
187
+ # rows: (cells[type, avail, maxres, total, status], styles|None, value, kind)
188
+ rows = []
178
189
  for gt, info in full_gpus.items():
179
190
  if info.get("maintenance", False):
180
- data_rows.append((
181
- [gt.upper(), "-", "-", str(int(info.get("total", 0)))],
182
- f"MAINTENANCE: {info.get('maintenance_reason', '')}", gt, "maint"))
191
+ reason = (info.get("maintenance_reason", "") or "maintenance")[:18]
192
+ rows.append(([gt.upper(), "-", "-", str(int(info.get("total", 0))),
193
+ f"MAINT: {reason}"], None, gt, "maint"))
183
194
  continue
184
- cells, status = _row_cells(gt, info)
185
- data_rows.append((cells, status, gt, "gpu"))
195
+ a = int(info.get("available", 0))
196
+ st_text, st_style = _status(info)
197
+ cells = [gt.upper(), str(a), str(int(info.get("max_reservable", 0))),
198
+ str(int(info.get("total", 0))), st_text]
199
+ styles = [CY, G if a > 0 else R, G, BL, st_style]
200
+ rows.append((cells, styles, gt, "gpu"))
186
201
  parts, mig_a, mig_c = _mig_breakdown(gt)
187
202
  if parts:
188
- data_rows.append((
189
- [" └─ MIG", str(mig_a), "-", str(mig_c)],
190
- f"{' '.join(parts)} · pick {gt.upper()} ↑", None, "mig"))
203
+ rows.append(([" └─ MIG", str(mig_a), "-", str(mig_c), " ".join(parts)],
204
+ None, None, "mig"))
191
205
 
192
- spot_data = []
206
+ spot_rows = []
193
207
  if spot_gpus:
194
208
  _pn = {"b300": 8, "b200": 8, "h200": 8, "h100": 8, "a100": 8, "t4": 4, "l4": 4}
195
209
  _od = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
@@ -205,40 +219,48 @@ def select_gpu_type_interactive(
205
219
  try:
206
220
  disc = f"~{int((1 - float(sp) / _od.get(gt, 50)) * 100)}% off"
207
221
  except (ValueError, TypeError):
208
- disc = "spot price n/a"
209
- status = ("node up" if avail > 0 else "spins up ~10min") + f" · {disc}"
210
- spot_data.append((
211
- [f"{gt.upper()} *", str(avail), f"{_pn.get(gt, 8)}/node", "-"],
212
- status, f"spot:{gt}", "spot"))
222
+ disc = "spot n/a"
223
+ st_text = ("node up · " if avail > 0 else "spins up · ") + disc
224
+ cells = [f"{gt.upper()} *", str(avail), f"{_pn.get(gt, 8)}/node", "-", st_text]
225
+ styles = [MG, G if avail > 0 else DIM, G, DIM, G if avail > 0 else Y]
226
+ spot_rows.append((cells, styles, f"spot:{gt}", "spot"))
227
+
228
+ headers = ["GPU Type", "Avail", "MaxRes", "Total", "Status"]
229
+ all_cells = [headers] + [r[0] for r in rows] + [s[0] for s in spot_rows]
230
+ W = [max(len(str(rc[i])) for rc in all_cells) for i in range(5)]
231
+
232
+ def _bar(left, mid, right):
233
+ return left + mid.join("─" * (w + 2) for w in W) + right
213
234
 
214
- # Column widths over the 4 text columns (header + all rows).
215
- headers = ["GPU Type", "Avail", "MaxRes", "Total"]
216
- _all_cells = [headers] + [r[0] for r in data_rows] + [s[0] for s in spot_data]
217
- widths = [max(len(str(row[i])) for row in _all_cells) for i in range(4)]
235
+ def _line(cells): # plain string row (header / mig / maint), inside the box
236
+ return "" + "".join(f" {str(c):<{W[i]}} " for i, c in enumerate(cells)) + ""
218
237
 
219
- def _fmt(cells, status=""):
220
- body = " ".join(str(c).ljust(widths[i]) for i, c in enumerate(cells))
221
- return f"{body} {status}".rstrip()
238
+ def _ft(cells, styles): # colored row -> FormattedText for a Choice
239
+ toks = [("class:separator", "│")]
240
+ for i, c in enumerate(cells):
241
+ toks.append((styles[i], f" {str(c):<{W[i]}} "))
242
+ toks.append(("class:separator", "│"))
243
+ return toks
222
244
 
223
245
  console.print()
224
- choices = [questionary.Separator(_fmt(headers, "Status"))]
225
- if not data_rows:
226
- choices.append(questionary.Separator("(no GPU types available)"))
227
- for cells, status, value, kind in data_rows:
228
- title = _fmt(cells, status)
229
- if kind == "mig":
230
- choices.append(questionary.Separator(title))
231
- elif kind == "maint":
232
- choices.append(questionary.Choice(title=title, value=value, disabled="maintenance"))
246
+ choices = [questionary.Separator(_bar("┌", "", "┐")),
247
+ questionary.Separator(_line(headers)),
248
+ questionary.Separator(_bar("├", "┼", "┤"))]
249
+ if not rows:
250
+ choices.append(questionary.Separator(_line(["(none)", "", "", "", ""])))
251
+ for cells, styles, value, kind in rows:
252
+ if kind in ("mig", "maint"):
253
+ choices.append(questionary.Separator(_line(cells)))
233
254
  else:
234
- choices.append(questionary.Choice(title=title, value=value))
255
+ choices.append(questionary.Choice(title=_ft(cells, styles), value=value))
235
256
 
236
- if spot_data:
237
- choices.append(questionary.Separator("⚡ Spot — us-east-1, ~70% cheaper, may be preempted:"))
238
- for cells, status, value, _kind in spot_data:
239
- choices.append(questionary.Choice(title=_fmt(cells, status), value=value))
257
+ if spot_rows:
258
+ choices.append(questionary.Separator(_bar("├", "┼", "┤")))
259
+ for cells, styles, value, _k in spot_rows:
260
+ choices.append(questionary.Choice(title=_ft(cells, styles), value=value))
240
261
 
241
- choices.append(questionary.Separator("───"))
262
+ choices.append(questionary.Separator(_bar("", "┴", "┘")))
263
+ choices.append(questionary.Separator(" "))
242
264
  if _hide_spot:
243
265
  choices.append(questionary.Choice(
244
266
  title="⚡ Show spot options (us-east-1, ~70% cheaper, may be preempted)",
@@ -251,7 +273,8 @@ def select_gpu_type_interactive(
251
273
  while True:
252
274
  try:
253
275
  answer = questionary.select(
254
- "Select GPU type (↑/↓, Enter):", choices=choices, style=custom_style
276
+ "Select GPU type — ↑/↓ then Enter (MIG: pick its parent GPU):",
277
+ choices=choices, style=custom_style
255
278
  ).ask()
256
279
 
257
280
  if answer == "_refresh":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.7.3
3
+ Version: 0.7.5
4
4
  Summary: CLI + Python SDK for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.7.3"
7
+ version = "0.7.5"
8
8
  description = "CLI + Python SDK for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -63,4 +63,4 @@ try:
63
63
 
64
64
  __version__ = _pkg_version("gpu-dev")
65
65
  except Exception:
66
- __version__ = "0.7.3"
66
+ __version__ = "0.7.5"
@@ -1633,7 +1633,18 @@ def try_claim_warm_pod(body: dict) -> bool:
1633
1633
  "while IFS= read -r k; do [ -n \"$k\" ] && ! grep -Fq \"$k\" /home/dev/.ssh/authorized_keys && echo \"$k\" >> /home/dev/.ssh/authorized_keys; done <<'KEOF'\n"
1634
1634
  f"{github_public_key}\n"
1635
1635
  "KEOF\n"
1636
- "chmod 700 /home/dev/.ssh && chmod 600 /home/dev/.ssh/authorized_keys && chown -R 1081:1081 /home/dev/.ssh"
1636
+ "chmod 700 /home/dev/.ssh && chmod 600 /home/dev/.ssh/authorized_keys && chown -R 1081:1081 /home/dev/.ssh\n"
1637
+ # Warm pods were pre-booted with user_id='warm'; stamp the real claimant's
1638
+ # identity into the managed shell-ext files so `gpu-dev` inside the pod
1639
+ # (cancel/list/...) authenticates as the user and IRSA assumes the right
1640
+ # session. The user connects AFTER the claim, so their login shell picks
1641
+ # these up. Also record the reservation id for `gpu-dev cancel`.
1642
+ "for f in /home/dev/.bashrc_ext /home/dev/.zshrc_ext; do [ -f \"$f\" ] || continue\n"
1643
+ f" sed -i -e 's|^export GPU_DEV_USER_ID=.*|export GPU_DEV_USER_ID=\"{user_id}\"|'"
1644
+ f" -e 's|^export GPU_DEV_GITHUB_USER=.*|export GPU_DEV_GITHUB_USER=\"{github_user}\"|'"
1645
+ f" -e 's|^export AWS_ROLE_SESSION_NAME=.*|export AWS_ROLE_SESSION_NAME=\"{user_id}\"|' \"$f\"\n"
1646
+ f" grep -q '^export GPU_DEV_RESERVATION_ID=' \"$f\" && sed -i 's|^export GPU_DEV_RESERVATION_ID=.*|export GPU_DEV_RESERVATION_ID=\"{reservation_id}\"|' \"$f\" || echo 'export GPU_DEV_RESERVATION_ID=\"{reservation_id}\"' >> \"$f\"\n"
1647
+ "done"
1637
1648
  )
1638
1649
  stream(
1639
1650
  v1.connect_get_namespaced_pod_exec, pod_name, "gpu-dev",
@@ -5272,6 +5283,10 @@ EOF_PROFILE
5272
5283
 
5273
5284
  # User identification
5274
5285
  export GPU_DEV_USER_ID="{user_id or 'dev'}"
5286
+ # Reservation id — from the pod hostname (gpu-dev-<id>). Warm claims overwrite this
5287
+ # in place with the full id; cold pods carry the 8-char prefix (cancellation resolves
5288
+ # by prefix). Lets `gpu-dev cancel` with no args inside the pod stop this reservation.
5289
+ export GPU_DEV_RESERVATION_ID="$(hostname | sed -e 's/^gpu-dev-//')"
5275
5290
 
5276
5291
  # Multinode peer info — inlined from container env at pod startup. sshd strips
5277
5292
  # container env vars from login shells, so we materialize the values into rc files.
@@ -5338,6 +5353,10 @@ EOF_BASHRC_EXT
5338
5353
 
5339
5354
  # User identification
5340
5355
  export GPU_DEV_USER_ID="{user_id or 'dev'}"
5356
+ # Reservation id — from the pod hostname (gpu-dev-<id>). Warm claims overwrite this
5357
+ # in place with the full id; cold pods carry the 8-char prefix (cancellation resolves
5358
+ # by prefix). Lets `gpu-dev cancel` with no args inside the pod stop this reservation.
5359
+ export GPU_DEV_RESERVATION_ID="$(hostname | sed -e 's/^gpu-dev-//')"
5341
5360
 
5342
5361
  # Multinode peer info — inlined from container env at pod startup. sshd strips
5343
5362
  # container env vars from login shells, so we materialize the values into rc files.
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes