gpu-dev 0.7.6__tar.gz → 0.7.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. gpu_dev-0.7.11/.github/workflows/tests.yml +20 -0
  2. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/.gitignore +11 -0
  3. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/CLAUDE.md +89 -0
  4. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/PKG-INFO +6 -1
  5. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +112 -34
  6. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +26 -3
  7. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +28 -18
  8. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +13 -1
  9. gpu_dev-0.7.11/conftest.py +92 -0
  10. gpu_dev-0.7.11/docs/FAST_REPRO_DESIGN.md +141 -0
  11. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/PKG-INFO +6 -1
  12. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/SOURCES.txt +55 -2
  13. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/requires.txt +6 -0
  14. gpu_dev-0.7.11/post-may-2026.md +185 -0
  15. gpu_dev-0.7.11/presentation/CLAUDE.md +220 -0
  16. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/cli-demo.html +5 -5
  17. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/gpu-fleet.html +5 -5
  18. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/k8s-under-the-hood.html +8 -8
  19. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/multinode.html +10 -10
  20. gpu_dev-0.7.11/presentation/pyproject.toml +33 -0
  21. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/sdk-demo.html +6 -6
  22. gpu_dev-0.7.11/presentation/teaser.html +317 -0
  23. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/pyproject.toml +17 -1
  24. gpu_dev-0.7.11/sdk/python/examples/parallel_experiments.ipynb +408 -0
  25. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_backend/aws.py +4 -1
  26. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/availability.tf +2 -1
  27. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/Dockerfile +18 -7
  28. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/bashrc +9 -1
  29. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/zshrc +5 -2
  30. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/git-cache.tf +2 -0
  31. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/kubernetes.tf +7 -2
  32. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/availability_updater/index.py +39 -3
  33. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +11 -0
  34. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_processor/index.py +206 -13
  35. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda.tf +16 -1
  36. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/mig-parted-config.yaml +15 -0
  37. gpu_dev-0.7.11/terraform-gpu-devservers/pytorch-ondemand.tf +178 -0
  38. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/pytorch-prebuild.tf +47 -3
  39. gpu_dev-0.7.11/tests/integration/README.md +35 -0
  40. gpu_dev-0.7.11/tests/integration/__init__.py +0 -0
  41. gpu_dev-0.7.11/tests/integration/conftest.py +131 -0
  42. gpu_dev-0.7.11/tests/integration/test_claude.py +40 -0
  43. gpu_dev-0.7.11/tests/integration/test_cpu_lifecycle.py +36 -0
  44. gpu_dev-0.7.11/tests/integration/test_repro_known_failure.py +54 -0
  45. gpu_dev-0.7.11/tests/integration/test_t4_lifecycle.py +39 -0
  46. gpu_dev-0.7.11/tests/integration/test_warm_pool.py +54 -0
  47. gpu_dev-0.7.11/tests/unit/__init__.py +0 -0
  48. gpu_dev-0.7.11/tests/unit/cli/__init__.py +0 -0
  49. gpu_dev-0.7.11/tests/unit/cli/test_auth.py +442 -0
  50. gpu_dev-0.7.11/tests/unit/cli/test_avail.py +295 -0
  51. gpu_dev-0.7.11/tests/unit/cli/test_cancel.py +380 -0
  52. gpu_dev-0.7.11/tests/unit/cli/test_config_cmd.py +187 -0
  53. gpu_dev-0.7.11/tests/unit/cli/test_config_module.py +476 -0
  54. gpu_dev-0.7.11/tests/unit/cli/test_connect.py +373 -0
  55. gpu_dev-0.7.11/tests/unit/cli/test_disks.py +747 -0
  56. gpu_dev-0.7.11/tests/unit/cli/test_edit.py +321 -0
  57. gpu_dev-0.7.11/tests/unit/cli/test_interactive.py +489 -0
  58. gpu_dev-0.7.11/tests/unit/cli/test_list_show.py +547 -0
  59. gpu_dev-0.7.11/tests/unit/cli/test_name_generator.py +272 -0
  60. gpu_dev-0.7.11/tests/unit/cli/test_repro.py +454 -0
  61. gpu_dev-0.7.11/tests/unit/cli/test_reservations_mgr.py +593 -0
  62. gpu_dev-0.7.11/tests/unit/cli/test_reserve.py +394 -0
  63. gpu_dev-0.7.11/tests/unit/cli/test_smoke.py +12 -0
  64. gpu_dev-0.7.11/tests/unit/cli/test_ssh_alias.py +130 -0
  65. gpu_dev-0.7.11/tests/unit/cli/test_submit.py +401 -0
  66. gpu_dev-0.7.11/tests/unit/lambda_fn/__init__.py +0 -0
  67. gpu_dev-0.7.11/tests/unit/lambda_fn/test_availability.py +488 -0
  68. gpu_dev-0.7.11/tests/unit/lambda_fn/test_cancellation.py +355 -0
  69. gpu_dev-0.7.11/tests/unit/lambda_fn/test_claim.py +348 -0
  70. gpu_dev-0.7.11/tests/unit/lambda_fn/test_mig_gpu_config.py +598 -0
  71. gpu_dev-0.7.11/tests/unit/lambda_fn/test_pod_resources.py +255 -0
  72. gpu_dev-0.7.11/tests/unit/lambda_fn/test_ref_staging.py +292 -0
  73. gpu_dev-0.7.11/tests/unit/lambda_fn/test_smoke.py +12 -0
  74. gpu_dev-0.7.11/tests/unit/lambda_fn/test_version_gate.py +178 -0
  75. gpu_dev-0.7.11/tests/unit/lambda_fn/test_warm_pool.py +682 -0
  76. gpu_dev-0.7.11/tests/unit/sdk/__init__.py +0 -0
  77. gpu_dev-0.7.11/tests/unit/sdk/test_backend_aws.py +790 -0
  78. gpu_dev-0.7.11/tests/unit/sdk/test_client.py +519 -0
  79. gpu_dev-0.7.11/tests/unit/sdk/test_errors_enums.py +308 -0
  80. gpu_dev-0.7.11/tests/unit/sdk/test_models_extra.py +361 -0
  81. gpu_dev-0.7.11/tests/unit/sdk/test_sandbox.py +352 -0
  82. gpu_dev-0.7.11/tests/unit/sdk/test_sdk_config.py +258 -0
  83. gpu_dev-0.7.11/tests/unit/sdk/test_transport_ssh.py +327 -0
  84. gpu_dev-0.7.6/sdk/python/examples/parallel_experiments.ipynb +0 -362
  85. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/.github/workflows/no-gitlinks.yml +0 -0
  86. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/.github/workflows/publish.yml +0 -0
  87. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/README.md +0 -0
  88. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/admin/README.md +0 -0
  89. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/admin/generate_stats.py +0 -0
  90. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/admin/requirements.txt +0 -0
  91. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/README.md +0 -0
  92. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  93. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  94. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  95. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  96. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  97. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  98. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  99. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  100. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/SDK_REPRO.md +0 -0
  101. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/USER_GUIDE.md +0 -0
  102. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/devgpu-features.html +0 -0
  103. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/docker-mark-blue.svg +0 -0
  104. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/docs/icons8-cursor-ai.svg +0 -0
  105. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/dependency_links.txt +0 -0
  106. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/entry_points.txt +0 -0
  107. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/gpu_dev.egg-info/top_level.txt +0 -0
  108. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/architecture.html +0 -0
  109. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/devgpu-features.html +0 -0
  110. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/docker-mark-blue.svg +0 -0
  111. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/feedback.png +0 -0
  112. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/icons8-cursor-ai.svg +0 -0
  113. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/index.html +0 -0
  114. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/osdc-future-plans.html +0 -0
  115. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/problem.png +0 -0
  116. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/sandbox.html +0 -0
  117. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/thesis.html +0 -0
  118. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/title-vid.mp4 +0 -0
  119. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/weneedgpus.png +0 -0
  120. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/presentation/wow.html +0 -0
  121. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/README.md +0 -0
  122. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/batch_multi_gpu.py +0 -0
  123. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/interactive_debug.py +0 -0
  124. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/quickstart.ipynb +0 -0
  125. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/run_tests.py +0 -0
  126. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/examples/submit_job.py +0 -0
  127. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/__init__.py +0 -0
  128. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
  129. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  130. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
  131. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  132. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
  133. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
  134. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  135. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
  136. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  137. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/config.py +0 -0
  138. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/enums.py +0 -0
  139. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/errors.py +0 -0
  140. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/common/models.py +0 -0
  141. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/src/gpu_dev/py.typed +0 -0
  142. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/sdk/python/tests/test_models.py +0 -0
  143. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/setup.cfg +0 -0
  144. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-deck/backend.tf +0 -0
  145. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-deck/main.tf +0 -0
  146. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-deck/terraform.tfvars.example +0 -0
  147. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  148. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  149. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/README.md +0 -0
  150. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/alb.tf +0 -0
  151. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ami-baker.tf +0 -0
  152. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/backend.tf +0 -0
  153. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/build-node.tf +0 -0
  154. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/check_b200.py +0 -0
  155. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  156. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  157. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  158. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  159. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/bash_profile +0 -0
  160. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  161. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  162. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  163. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  164. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/motd_script +0 -0
  165. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  166. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/profile +0 -0
  167. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  168. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  169. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  170. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/shell_env +0 -0
  171. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/ssh_config +0 -0
  172. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/zprofile +0 -0
  173. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  174. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker-build.tf +0 -0
  175. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  176. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  177. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ecr.tf +0 -0
  178. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/efs.tf +0 -0
  179. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/eks.tf +0 -0
  180. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/expiry.tf +0 -0
  181. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  182. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  183. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  184. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  185. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  186. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  187. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  188. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  189. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  190. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  191. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  192. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  193. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  194. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/list_b200.py +0 -0
  195. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/main.tf +0 -0
  196. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/mig-config.tf +0 -0
  197. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  198. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  199. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  200. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  201. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  202. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/monitoring.tf +0 -0
  203. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  204. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/outputs.tf +0 -0
  205. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/pyproject.toml +0 -0
  206. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/queue.tf +0 -0
  207. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/route53.tf +0 -0
  208. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  209. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  210. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  211. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  212. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  213. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  214. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  215. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  216. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  217. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  218. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  219. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/switch-to.sh +0 -0
  220. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  221. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  222. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
  223. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  224. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  225. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/variables.tf +0 -0
  226. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/terraform-gpu-devservers/warm-pool.tf +0 -0
  227. {gpu_dev-0.7.6/sdk/python → gpu_dev-0.7.11}/tests/__init__.py +0 -0
  228. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/tests/submit/README.md +0 -0
  229. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/tests/submit/fail/run.sh +0 -0
  230. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/tests/submit/multinode/run.sh +0 -0
  231. {gpu_dev-0.7.6 → gpu_dev-0.7.11}/tests/submit/success/run.sh +0 -0
@@ -0,0 +1,20 @@
1
+ name: tests
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ unit:
9
+ name: unit + mocks
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - name: Install uv
14
+ uses: astral-sh/setup-uv@v5
15
+ with:
16
+ python-version: "3.12"
17
+ - name: Install package + test deps
18
+ run: uv pip install -e ".[test]"
19
+ - name: Run unit + mock tests (integration excluded)
20
+ run: uv run pytest -m "not integration"
@@ -73,3 +73,14 @@ lambda/*/package/
73
73
  admin/output/
74
74
 
75
75
  .claude/worktrees/
76
+ .claude/settings.local.json
77
+ .claude/scheduled_tasks.lock
78
+
79
+ # Org-specific (filled in locally; not committed)
80
+ docs/INTERNAL_AUTH.md
81
+
82
+ # Local scratch / staging terraform working dir
83
+ *.pid
84
+ terraform-gpu-devservers/staging/.terraform/
85
+ terraform-gpu-devservers/staging/__pycache__/
86
+ terraform-gpu-devservers/staging/*.log
@@ -28,6 +28,59 @@ For terraform, we use opentofu, don't ever run tf apply directly. You're free to
28
28
  - Group imports in standard order: standard library, third-party, local imports
29
29
  - Use absolute imports when possible
30
30
 
31
+ ## Testing (DO THIS FOR EVERY CHANGE)
32
+
33
+ There is a real test suite now. **Every change must keep it green, and add/adjust
34
+ tests.** Two tiers:
35
+
36
+ **1. Unit + mocks — ALWAYS run, must stay green (CI runs this on every push/PR).**
37
+ Fully mocked (boto3 / k8s / SSH / subprocess), no network, ~2s.
38
+ ```bash
39
+ uv pip install -e ".[test]" # one-time: pytest, moto, kubernetes
40
+ uv run pytest -m "not integration" # ~1140 tests; run before every commit
41
+ ```
42
+ - Layout: `tests/unit/{sdk,cli,lambda_fn}/test_*.py`; shared fixtures in the root
43
+ `conftest.py` (`cli_runner`, `lambda_index` = the lambda imported as `index`
44
+ with env pre-set, `aws_mocks` = MagicMock boto3 handles).
45
+ - When you touch CLI / SDK / lambda code, update or add the matching `test_*.py`.
46
+ - CI: `.github/workflows/tests.yml`. Lambda imports need env vars + sys.path — the
47
+ root `conftest.py` already sets both.
48
+
49
+ **2. e2e integration on STAGING — run for anything touching the
50
+ reserve/pod/SSH/lambda path before merging.** Real reservations on the **staging**
51
+ cluster (us-west-1), cpu + t4 only, auto-cancelled. Staging is the DEFAULT target
52
+ and github_user comes from your config, so the bare command is enough:
53
+ ```bash
54
+ uv run pytest -m integration --run-integration -v
55
+ ```
56
+ - Staging is the default (`GPU_DEV_TEST_ENV` defaults to `staging` → us-west-1,
57
+ standard `pytorch-gpu-dev-*` prefix, tf workspace `default`). The integration
58
+ conftest pins the region so the unit-test us-east-2 default can't leak in. Wired
59
+ in `cli-tools/.../config.py` ENVIRONMENTS.
60
+ - Covers: cpu-x86 + t4 reserve→active→cancel, list-while-active, exec
61
+ (`nproc`/`nvidia-smi`/`torch.cuda`), **`claude -p` answers "Paris"** (pod Claude
62
+ Code/Bedrock), and the **warm pool** (fast warm claim + custom-image
63
+ warm-ineligibility). Each cancels in a `finally` (no leaked pods).
64
+ - Warm-pool tests need `WARM_POOL_TARGETS` deployed on staging — set in
65
+ `lambda.tf` for the `default` workspace (`{t4, cpu-x86, cpu-arm}`). Staging IS the
66
+ tf `default` workspace (us-west-1, environment=test) — there is no `test`/`staging`
67
+ workspace: `tofu workspace select default && tofu apply`. Until that is applied, the warm
68
+ tests skip ("came up cold"). Custom-image test: set `GPU_DEV_TEST_IMAGE`.
69
+ - Repro test (`test_repro_known_failure.py`): set `GPU_DEV_REPRO_REF` +
70
+ `GPU_DEV_REPRO_TEST` to a known-red (commit, test). Find one with the
71
+ **treehugger MCP** (`hud`, user-scope — `get_hud_data`/`master_commit_red`).
72
+ Note: prebuilt torch is h100/b200 arch, so a CUDA test on t4 needs a full build;
73
+ prefer a failure that runs on the box's GPU or on cpu.
74
+ - Skips cleanly if staging is unreachable or the runner has no outbound SSH (e.g. a
75
+ sandbox). The reservation role can query and use SQS but lacks `DescribeTable`, so the
76
+ reachability probe uses scan+get-queue-url, not describe.
77
+ - Validated live (2026-05-31): cpu + t4 lifecycle PASS; warm-claim test confirmed
78
+ it reaches the real reserve (skips until WARM_POOL_TARGETS is applied).
79
+
80
+ **Rule of thumb:** unit+mocks for *every* change; add e2e coverage when you add a
81
+ new command/flow; run the staging e2e before merging anything that could affect a
82
+ live reservation. Don't say "done/tested" without having run the relevant tier.
83
+
31
84
  ## Content
32
85
 
33
86
  - torchci - a next.js app containing a PyTorch CI tracker
@@ -51,6 +104,42 @@ Currently we're working on a developer servers with GPUs in AWS. This means we'l
51
104
 
52
105
  # AGENT SECTION
53
106
 
107
+ ## Fast-repro redesign — by-SHA artifact cache + on-demand build (2026-06-01)
108
+
109
+ Goal: `gpu-dev repro <ref>` for any pytorch commit from the last ~72h lands a built,
110
+ importable tree in <2min. Design: `docs/FAST_REPRO_DESIGN.md`. **All merged to main**
111
+ (PRs #186–#189); **needs `tofu apply` (prod, workspace `prod`) + image rebuild**.
112
+
113
+ - **by-SHA artifact cache** (#186): whole *built* trees keyed by commit SHA at
114
+ `/ccache_shared/prebuilt/by-sha/<sha>.tar.{zst,gz}` (`.sha` written last = the
115
+ completion gate). Cron seeds one per viable/strict bump (hardlink, no extra space).
116
+ `stage-pytorch` (cold `--ref`) + `gpu-dev repro` consume on hit → `import torch`
117
+ with ZERO build. `repro` also publishes its in-pod build via `publish-pytorch-build`
118
+ (detached) so the cache fills from real usage. All paths safe-fallback on miss;
119
+ `ls-remote` is `timeout 15`.
120
+ - **retention** (#188): prebuild cron prunes by-sha entries >72h every tick (storage
121
+ budget ~500-650GB on the elastic ccache EFS). The by-sha set IS the snapshot ladder.
122
+ - **mold linker** (#187): Dockerfile installs `mold`; cron + in-pod repro build wrap
123
+ with `mold -run` (guarded on `command -v mold`). Drops the libtorch_cuda.so relink
124
+ ~1-3min → ~15s. **Needs image rebuild** to activate (prod runs a stale image; that's
125
+ also why prod publishes gzip not zstd — the Dockerfile has zstd already).
126
+ - **on-demand build worker** (#189, `pytorch-ondemand.tf`): always-on Deployment on
127
+ NodeType=build drains `prebuilt/build-queue/<sha>.req` (own hostPath tree
128
+ `/mnt/ondemand-build` → builds at `/home/dev/pytorch` so build/ paths are
129
+ pod-compatible; mold+ccache), publishes by-sha, writes `.worker-alive` heartbeat.
130
+ `repro` enqueues + polls ONLY when the heartbeat is fresh (else straight to in-pod
131
+ build → zero regression if not deployed). Makes the FIRST repro of an uncached
132
+ commit fast. Coordination 100% via shared EFS — no new networking/RBAC/lambda.
133
+ - cuDNN fidelity (`USE_CUDNN=1`) DEFERRED — forcing it can fail the build if cuDNN
134
+ isn't found under cuda-13.2; needs prod e2e. Base image is cudnn9-devel.
135
+ - Fast path is **prod-arch only** (`sm_90;sm_100` = H100/B200); t4/staging is wrong-arch.
136
+ - Also: SSH alias now keys off reservation id not pod name (#185) so warm/repro pods
137
+ are reachable via `ssh gpu-dev-<resid>` / `connect` (routing is via the FQDN, the
138
+ alias is a local label). CCACHE_MAXSIZE settled at 250G (#184).
139
+ - Prod e2e: `gpu-dev repro <fresh-sha> <test> --gpu-type h100 --no-connect` (first =
140
+ off-pod build + stage; rerun = by-sha HIT zero build). Worker logs:
141
+ `k -n management logs deploy/pytorch-ondemand-builder -f`.
142
+
54
143
  ## Instant-sandboxes branch — WIP & things to fix (2026-05-29)
55
144
 
56
145
  Big push on warm pools + instant claims + prebuilt pytorch. Tracking state here so it's not lost.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.7.6
3
+ Version: 0.7.11
4
4
  Summary: CLI + Python SDK for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -15,6 +15,11 @@ Requires-Dist: questionary>=2.1.1
15
15
  Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
+ Provides-Extra: test
19
+ Requires-Dist: pytest>=7.4; extra == "test"
20
+ Requires-Dist: pytest-cov>=4.1; extra == "test"
21
+ Requires-Dist: moto[dynamodb,ec2,sqs]>=5.0; extra == "test"
22
+ Requires-Dist: kubernetes>=28.1; extra == "test"
18
23
 
19
24
  # GPU Developer CLI & SDK
20
25
 
@@ -319,6 +319,9 @@ def _show_single_reservation(connection_info: dict) -> None:
319
319
  reservation_id = connection_info["reservation_id"]
320
320
  reservation_name = connection_info.get("name")
321
321
  pod_name = connection_info.get("pod_name", "")
322
+ # SSH host alias keys off the reservation id (works for warm-claimed pods,
323
+ # whose pod_name != gpu-dev-<resid8>). pod_name is shown separately below.
324
+ host_alias = f"gpu-dev-{short_id}"
322
325
  ssh_config_path = get_ssh_config_path(reservation_id, reservation_name)
323
326
  use_include = is_ssh_include_enabled()
324
327
 
@@ -328,14 +331,14 @@ def _show_single_reservation(connection_info: dict) -> None:
328
331
  if use_include:
329
332
  # User approved Include - show simple commands
330
333
  from .reservations import _make_vscode_link
331
- ssh_command_display = f"[green]ssh {pod_name}[/green]"
332
- vscode_url = _make_vscode_link(pod_name)
333
- vscode_cmd_text = f"code --remote ssh-remote+{pod_name} /home/dev"
334
+ ssh_command_display = f"[green]ssh {host_alias}[/green]"
335
+ vscode_url = _make_vscode_link(host_alias)
336
+ vscode_cmd_text = f"code --remote ssh-remote+{host_alias} /home/dev"
334
337
  vscode_command_display = f"[link={vscode_url}][green]{vscode_cmd_text}[/green][/link]"
335
338
  vscode_info = f"[blue]VS Code Remote:[/blue] {vscode_command_display}\n"
336
339
  else:
337
340
  # User declined Include - show commands with -F flag
338
- ssh_command_display = f"[green]ssh -F {ssh_config_path} {pod_name}[/green]"
341
+ ssh_command_display = f"[green]ssh -F {ssh_config_path} {host_alias}[/green]"
339
342
  vscode_command_display = f"Add [green]Include ~/.gpu-dev/*-sshconfig[/green] to ~/.ssh/config and ~/.cursor/ssh_config (or: [green]gpu-dev config ssh-include enable[/green])"
340
343
  vscode_info = f"[blue]VS Code/Cursor:[/blue] {vscode_command_display}\n"
341
344
  else:
@@ -1554,27 +1557,82 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
1554
1557
  except RuntimeError as e:
1555
1558
  rprint(f"[red]❌ {str(e)}[/red]"); return
1556
1559
 
1557
- # ref -> in-pod fetch+checkout (PRs prefer /merge = CI's view, fall back to /head)
1560
+ # Resolve the ref in-pod -> WANT (sha, for the by-sha cache) + FREF (fetch ref).
1561
+ # A MERGED pr/N reproduces the actual squash/merge commit on main (the real trunk
1562
+ # state that was red) — NOT pull/N/merge (the PR re-applied onto *current* trunk,
1563
+ # which goes green once the fix lands). Open PRs keep pull/N/merge (= CI's view).
1558
1564
  r = ref.strip(); prnum = None
1559
1565
  if r.startswith("pr/"): prnum = r[3:]
1560
1566
  elif r.startswith("#"): prnum = r[1:]
1561
1567
  elif r.isdigit(): prnum = r
1568
+ gh = "https://github.com/pytorch/pytorch.git"
1562
1569
  if prnum:
1563
- fetch = (f"git fetch origin pull/{prnum}/merge 2>/dev/null && git checkout -f FETCH_HEAD || "
1564
- f"{{ echo '[repro] no /merge ref, using /head'; git fetch origin pull/{prnum}/head && git checkout -f FETCH_HEAD; }}")
1570
+ api = f"https://api.github.com/repos/pytorch/pytorch/pulls/{prnum}"
1571
+ resolve = (
1572
+ f"PRJSON=$(curl -s -m 10 -H 'Accept: application/vnd.github+json' -H 'User-Agent: gpu-dev' {api} 2>/dev/null); "
1573
+ "MCS=$(printf '%s' \"$PRJSON\" | grep -oE '\"merge_commit_sha\": *\"[0-9a-f]+\"' | head -1 | cut -d'\"' -f4); "
1574
+ "if printf '%s' \"$PRJSON\" | grep -q '\"merged\": *true' && [ -n \"$MCS\" ]; then "
1575
+ f"WANT=\"$MCS\"; FREF=\"$MCS\"; echo \"[repro] pr/{prnum} is merged -> reproducing trunk commit $MCS\"; "
1576
+ f"else FREF=pull/{prnum}/merge; WANT=$(timeout 15 git ls-remote {gh} $FREF 2>/dev/null | head -1 | cut -f1); "
1577
+ f"[ -n \"$WANT\" ] || {{ FREF=pull/{prnum}/head; WANT=$(timeout 15 git ls-remote {gh} $FREF 2>/dev/null | head -1 | cut -f1); echo '[repro] open PR, no /merge -> /head'; }}; fi; ")
1565
1578
  else:
1566
1579
  rq = shlex.quote(r)
1567
- fetch = f"git fetch origin {rq} 2>/dev/null && git checkout -f FETCH_HEAD || git checkout -f {rq}"
1580
+ resolve = (f"FREF={rq}; WANT=$(timeout 15 git ls-remote {gh} {rq} 2>/dev/null | head -1 | cut -f1); "
1581
+ f"[ -n \"$WANT\" ] || case {rq} in *[!0-9a-fA-F]*) WANT= ;; *) WANT={rq} ;; esac; ")
1582
+ # in-pod fallback checkout (by-sha miss + farm unavailable): fetch the resolved ref,
1583
+ # else check out the sha directly (reachable for a merged-PR land commit / trunk).
1584
+ checkout = ("git fetch origin \"$FREF\" 2>/dev/null && git checkout -f FETCH_HEAD "
1585
+ "|| git checkout -f \"$WANT\" 2>/dev/null "
1586
+ "|| { git fetch --force origin 2>/dev/null && git checkout -f \"$WANT\"; }")
1568
1587
 
1569
1588
  testcmd = " ".join(shlex.quote(a) for a in test_args)
1589
+ # by-sha artifact cache: if a fully-built tree for the resolved SHA already exists
1590
+ # (shared EFS, seeded by the build node + prior repros), stage it -> ZERO build.
1591
+ # Otherwise build, then publish the result so the next dev (anyone) gets it instantly.
1570
1592
  remote = (
1571
1593
  "set -e; cd /home/dev/pytorch; "
1572
1594
  "git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true; "
1573
- f"echo '[repro] checkout {r}'; {fetch}; "
1595
+ "BYSHA=/ccache_shared/prebuilt/by-sha; QUEUE=/ccache_shared/prebuilt/build-queue; HIT=; "
1596
+ # bs <sha>: stage a fully-built by-sha tree into /home/dev/pytorch (zero build); 0 on success.
1597
+ # explicit ext check, not a glob: the pod login shell is zsh, where an unmatched glob is a hard error.
1598
+ # require the .sha completion gate (written last) so we never stage a half-published tarball.
1599
+ "bs() { local s=\"$1\" tb=; [ -f \"$BYSHA/$s.sha\" ] || return 1; for e in zst gz; do [ -f \"$BYSHA/$s.tar.$e\" ] && { tb=\"$BYSHA/$s.tar.$e\"; break; }; done; [ -n \"$tb\" ] || return 1; "
1600
+ "rm -rf /home/dev/pytorch.new; mkdir -p /home/dev/pytorch.new; "
1601
+ "case \"$tb\" in *.zst) zstd -dc \"$tb\" 2>/dev/null | tar -C /home/dev/pytorch.new --strip-components=1 -xf - 2>/dev/null ;; "
1602
+ "*) tar -C /home/dev/pytorch.new --strip-components=1 -xzf \"$tb\" 2>/dev/null ;; esac; "
1603
+ "[ -d /home/dev/pytorch.new/.git ] || { rm -rf /home/dev/pytorch.new; return 1; }; "
1604
+ "rm -rf /home/dev/pytorch; mv /home/dev/pytorch.new /home/dev/pytorch; return 0; }; "
1605
+ + resolve +
1606
+ "echo \"[repro] target ${WANT:-?}\"; "
1607
+ # 1) already cached -> stage it (zero build)
1608
+ "if [ -n \"$WANT\" ] && bs \"$WANT\"; then cd /home/dev/pytorch; HIT=1; echo '[repro] by-sha cache HIT -> staged prebuilt tree (zero build)'; fi; "
1609
+ # 2) not cached, build farm alive -> request an off-pod build, wait, then stage
1610
+ "if [ -z \"$HIT\" ] && [ -n \"$WANT\" ] && [ -n \"$(find \"$QUEUE/.worker-alive\" -mmin -2 2>/dev/null)\" ]; then "
1611
+ "echo \"[repro] no cached build; requesting off-pod build of $WANT (build farm; streaming progress)…\"; printf '%s\\n' \"$FREF\" > \"$QUEUE/$WANT.req\" 2>/dev/null || true; "
1612
+ # poll for the artifact; meanwhile tail the farm's build log (ninja [x/N]) so it's not a silent hang.
1613
+ "i=0; LL=0; while [ $i -lt 400 ]; do [ -f \"$BYSHA/$WANT.sha\" ] && break; [ -f \"$QUEUE/$WANT.req\" ] || break; "
1614
+ "if [ -f \"$QUEUE/$WANT.log\" ]; then NL=$(wc -l < \"$QUEUE/$WANT.log\" 2>/dev/null || echo 0); "
1615
+ "if [ \"$NL\" -gt \"$LL\" ]; then tail -n +$((LL+1)) \"$QUEUE/$WANT.log\" 2>/dev/null | grep -aE '\\[[0-9]+/[0-9]+\\]|Building wheel|Successfully built|error' | tail -1 | sed 's/^/ [farm] /'; LL=$NL; fi; fi; "
1616
+ "sleep 3; i=$((i+1)); done; "
1617
+ "if bs \"$WANT\"; then cd /home/dev/pytorch; HIT=1; echo '[repro] off-pod build ready -> staged (zero build)'; else echo '[repro] off-pod build unavailable, building locally'; fi; fi; "
1618
+ # 3) fall back to in-pod fetch + build (+ cache the result for the next dev)
1619
+ "if [ -z \"$HIT\" ]; then "
1620
+ "echo \"[repro] checking out $FREF\"; " + checkout + "; "
1574
1621
  "echo \"[repro] HEAD $(git rev-parse --short HEAD)\"; "
1575
1622
  "git -c protocol.file.allow=always submodule update --init --recursive --jobs 8 >/dev/null 2>&1 || true; "
1576
1623
  "if ! PYTHONPATH=/home/dev/pytorch python -c 'import torch' 2>/dev/null; then "
1577
- "echo '[repro] incremental rebuild on warm build/...'; pip install --break-system-packages -e . --no-build-isolation; fi; "
1624
+ "echo \"[repro] prebuilt torch != this commit -> rebuilding (ccache-accelerated, but the further this commit is from viable/strict, the more recompiles). checked-out: $(git log -1 --format='%h %ci')\"; "
1625
+ # mold -run routes the libtorch_cuda.so relink through mold (~15s vs minutes); guarded.
1626
+ # Explicit if/else (not `$M pip`): the pod login shell is zsh, which doesn't word-split
1627
+ # unquoted vars. -v streams the cmake/ninja [x/N] progress instead of pip's blind spinner.
1628
+ "if command -v mold >/dev/null 2>&1; then mold -run pip install --break-system-packages -e . --no-build-isolation -v; "
1629
+ "else pip install --break-system-packages -e . --no-build-isolation -v; fi; fi; "
1630
+ # cache this build for the next dev (detached so it survives the ssh session)
1631
+ "SHA=$(git rev-parse HEAD 2>/dev/null); "
1632
+ "if command -v publish-pytorch-build >/dev/null 2>&1 && [ -n \"$SHA\" ] && [ ! -f \"$BYSHA/$SHA.sha\" ]; then "
1633
+ "echo '[repro] caching this build (by-sha) for next time…'; "
1634
+ "setsid publish-pytorch-build \"$SHA\" >/dev/null 2>&1 < /dev/null & fi; "
1635
+ "fi; "
1578
1636
  f"echo '[repro] running: python {testcmd}'; "
1579
1637
  f"PYTHONPATH=/home/dev/pytorch python {testcmd}"
1580
1638
  )
@@ -1879,7 +1937,9 @@ def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dock
1879
1937
  sys.exit(1)
1880
1938
  create_ssh_config_for_reservation(master_fqdn, master_pod, master_id, master_name)
1881
1939
 
1882
- ssh_alias = master_pod
1940
+ # Host alias matches the Host line written by create_ssh_config_for_reservation
1941
+ # (keyed off the reservation id, so warm-claimed masters resolve too).
1942
+ ssh_alias = f"gpu-dev-{master_id[:8]}"
1883
1943
  ssh_base = ["ssh", "-F", str(config_file), "-o", "StrictHostKeyChecking=accept-new"]
1884
1944
  rsync_e = " ".join(shlex.quote(x) for x in ssh_base)
1885
1945
 
@@ -3166,11 +3226,15 @@ def _show_direct_success(res: dict, elapsed: float) -> None:
3166
3226
  """Print the success block for an instant warm-pool claim,
3167
3227
  matching the normal reserve output (SSH config + VS Code/Cursor remote)."""
3168
3228
  from gpu_dev_cli.reservations import (
3169
- create_ssh_config_for_reservation, _generate_vscode_command, _generate_cursor_command)
3229
+ create_ssh_config_for_reservation, _generate_vscode_command,
3230
+ _generate_cursor_command, _make_vscode_link, _make_cursor_link)
3170
3231
  rid = res.get("reservation_id", "") or ""
3171
3232
  ssh_command = res.get("ssh_command", "") or ""
3172
3233
  pod_name = res.get("pod_name", "") or ""
3173
3234
  fqdn = res.get("fqdn") or ""
3235
+ # Host alias keys off the reservation id — warm-claimed pods have a pod_name
3236
+ # that is NOT gpu-dev-<resid8>, so we must not use pod_name as the ssh alias.
3237
+ host_alias = f"gpu-dev-{rid[:8]}" if rid else pod_name
3174
3238
 
3175
3239
  rprint(f"\n[green]✅ Instant reservation ready in {elapsed:.1f}s![/green]")
3176
3240
  rprint(f"[bold]📋 Reservation ID:[/bold] {rid}")
@@ -3179,24 +3243,28 @@ def _show_direct_success(res: dict, elapsed: float) -> None:
3179
3243
  if rid:
3180
3244
  rprint(f"[bold]⚡ Quick Connect:[/bold] gpu-dev connect {rid[:8]}")
3181
3245
 
3182
- # Build the per-reservation SSH config so `ssh <pod>` and connect work cleanly.
3246
+ # Build the per-reservation SSH config so `ssh gpu-dev-<resid8>` and connect work cleanly.
3183
3247
  use_include = False
3184
3248
  if fqdn and pod_name and rid:
3185
3249
  try:
3186
3250
  _cfg, use_include = create_ssh_config_for_reservation(fqdn, pod_name, rid, None)
3187
3251
  except Exception:
3188
3252
  pass
3189
- if pod_name and use_include:
3190
- rprint(f"[bold]🖥️ SSH Command:[/bold] ssh {pod_name}")
3191
- elif ssh_command:
3192
- rprint(f"[bold]🖥️ SSH Command:[/bold] {ssh_command}")
3193
-
3194
- vsc = _generate_vscode_command(ssh_command) if ssh_command else None
3195
- cur = _generate_cursor_command(ssh_command) if ssh_command else None
3196
- if vsc:
3197
- rprint(f"[bold]💻 VS Code Remote:[/bold] {vsc}")
3198
- if cur:
3199
- rprint(f"[bold]🖥️ Cursor Remote:[/bold] {cur}")
3253
+ if use_include and rid:
3254
+ rprint(f"[bold]🖥️ SSH Command:[/bold] ssh {host_alias}")
3255
+ vscode_url = _make_vscode_link(host_alias)
3256
+ cursor_url = _make_cursor_link(host_alias)
3257
+ rprint(f"[bold]💻 VS Code Remote:[/bold] [link={vscode_url}]code --remote ssh-remote+{host_alias} /home/dev[/link]")
3258
+ rprint(f"[bold]🖥️ Cursor Remote:[/bold] [link={cursor_url}]cursor --remote ssh-remote+{host_alias} /home/dev[/link]")
3259
+ else:
3260
+ if ssh_command:
3261
+ rprint(f"[bold]🖥️ SSH Command:[/bold] {ssh_command}")
3262
+ vsc = _generate_vscode_command(ssh_command) if ssh_command else None
3263
+ cur = _generate_cursor_command(ssh_command) if ssh_command else None
3264
+ if vsc:
3265
+ rprint(f"[bold]💻 VS Code Remote:[/bold] {vsc}")
3266
+ if cur:
3267
+ rprint(f"[bold]🖥️ Cursor Remote:[/bold] {cur}")
3200
3268
 
3201
3269
 
3202
3270
  def _format_gpu_display(gpu_count, gpu_type):
@@ -3385,15 +3453,22 @@ def _show_availability(show_spot: bool = False) -> None:
3385
3453
  spot_table = Table(title="⚡ Spot Instances (us-east-1, ~70% cheaper)")
3386
3454
  spot_table.add_column("GPU Type", style="cyan")
3387
3455
  spot_table.add_column("Avail\nNow", style="green")
3456
+ spot_table.add_column("In\nUse", style="yellow")
3388
3457
  spot_table.add_column("Per\nNode", style="bright_green")
3389
3458
  spot_table.add_column("Status", style="magenta")
3390
3459
  spot_table.add_column("Spot Discount", style="dim")
3391
3460
  _on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
3392
3461
  for gt, info in sorted(spot_region_info.items()):
3393
3462
  avail = info.get("available", 0)
3463
+ total = info.get("total", 0)
3464
+ in_use = max(0, total - avail) # GPUs on up spot nodes already taken
3394
3465
  per_node = spot_gpus_per_node.get(gt, 8)
3395
3466
  avail_display = f"[green]{avail}[/green]" if avail > 0 else f"[dim]0[/dim]"
3396
- status = "[green]Node up[/green]" if avail > 0 else "Spins up on reserve (~10 min)"
3467
+ in_use_display = f"[yellow]{in_use}[/yellow]" if in_use > 0 else f"[dim]0[/dim]"
3468
+ if in_use > 0:
3469
+ status = "[yellow]Node up (in use)[/yellow]" if avail == 0 else "[green]Node up[/green]"
3470
+ else:
3471
+ status = "[green]Node up[/green]" if avail > 0 else "Spins up on reserve (~10 min)"
3397
3472
  si = info.get("spot_info", {}) or {}
3398
3473
  sp = si.get("spot_price", "") if isinstance(si, dict) else ""
3399
3474
  if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
@@ -3405,7 +3480,7 @@ def _show_availability(show_spot: bool = False) -> None:
3405
3480
  avail_signal = f"[green]{pct}% off on-demand[/green]" if pct > 0 else "[dim]At on-demand price[/dim]"
3406
3481
  except (ValueError, TypeError):
3407
3482
  avail_signal = "[yellow]Unknown[/yellow]"
3408
- spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
3483
+ spot_table.add_row(f"{gt.upper()} *", avail_display, in_use_display, str(per_node), status, avail_signal)
3409
3484
  console.print(spot_table)
3410
3485
  rprint("[dim]* = spot: ~70% cheaper, AWS can reclaim with 2-min notice, fulfillment not guaranteed.[/dim]")
3411
3486
  rprint("[dim] Separate cluster (us-east-1) with separate disks. Select via gpu-dev reserve (interactive).[/dim]")
@@ -3779,7 +3854,8 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3779
3854
  for node in nodes:
3780
3855
  status_display = "✅ Active" if node.get("status") == "active" else f"⏳ {node.get('status', 'unknown')}"
3781
3856
  pod_name = node.get("pod_name", "unknown")
3782
- ssh_cmd_short = f"ssh {pod_name}" if pod_name != "unknown" else "N/A"
3857
+ node_rid = node.get("reservation_id")
3858
+ ssh_cmd_short = f"ssh gpu-dev-{node_rid[:8]}" if node_rid else "N/A"
3783
3859
 
3784
3860
  table.add_row(
3785
3861
  f"Node {node.get('node_index', 0) + 1}",
@@ -4036,10 +4112,11 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
4036
4112
  )
4037
4113
 
4038
4114
  if config_path:
4115
+ node_alias = f"gpu-dev-{node_res_id[:8]}"
4039
4116
  if use_include:
4040
- rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh {pod_name}[/cyan]")
4117
+ rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh {node_alias}[/cyan]")
4041
4118
  else:
4042
- rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh -F {config_path} {pod_name}[/cyan]")
4119
+ rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh -F {config_path} {node_alias}[/cyan]")
4043
4120
  else:
4044
4121
  rprint(f"[yellow]⚠️ Node {node_idx + 1}: Failed to create SSH config[/yellow]")
4045
4122
 
@@ -4067,12 +4144,13 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
4067
4144
  )
4068
4145
 
4069
4146
  if config_path:
4147
+ host_alias = f"gpu-dev-{reservation_id[:8]}"
4070
4148
  rprint(f"[green]✅ SSH config created:[/green] [cyan]{config_path}[/cyan]\n")
4071
4149
  if use_include:
4072
- rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh {pod_name}[/cyan]")
4150
+ rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh {host_alias}[/cyan]")
4073
4151
  rprint(f"[dim] or:[/dim] [cyan]gpu-dev connect {reservation_id[:8]}[/cyan]")
4074
4152
  else:
4075
- rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh -F {config_path} {pod_name}[/cyan]")
4153
+ rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh -F {config_path} {host_alias}[/cyan]")
4076
4154
  rprint(f"[dim] or:[/dim] [cyan]gpu-dev connect {reservation_id[:8]}[/cyan]")
4077
4155
  else:
4078
4156
  rprint("[red]❌ Failed to create SSH config[/red]")
@@ -4639,13 +4717,13 @@ def ssh_include(action: str):
4639
4717
 
4640
4718
  \b
4641
4719
  When enabled:
4642
- • Simple SSH commands: ssh <pod-name>
4643
- • VS Code Remote works: code --remote ssh-remote+<pod-name>
4720
+ • Simple SSH commands: ssh gpu-dev-<reservation-id>
4721
+ • VS Code Remote works: code --remote ssh-remote+gpu-dev-<reservation-id>
4644
4722
  • Cursor Remote works: Open Remote SSH in Cursor
4645
4723
 
4646
4724
  \b
4647
4725
  When disabled:
4648
- • Need -F flag: ssh -F ~/.gpu-dev/<id>-sshconfig <pod-name>
4726
+ • Need -F flag: ssh -F ~/.gpu-dev/<id>-sshconfig gpu-dev-<reservation-id>
4649
4727
  • VS Code/Cursor requires manual config setup
4650
4728
 
4651
4729
  \b
@@ -29,6 +29,15 @@ class Config:
29
29
  "description": "Spot-only us-east-1 environment (T4/L4/CPU)",
30
30
  "spot_types": ["b300", "b200", "h200", "h100", "a100", "t4", "l4", "rtxpro6000"],
31
31
  },
32
+ # Staging (us-west-1, tf "default" workspace, environment=test). Same
33
+ # standard resource prefix as prod, just a different region — so only the
34
+ # region changes. Live capacity: cpu-x86/arm + t4. Used for integration
35
+ # tests. Select via `GPU_DEV_ENVIRONMENT=staging` (or the "test" env alias).
36
+ "staging": {
37
+ "region": "us-west-1",
38
+ "workspace": "default",
39
+ "description": "Staging (us-west-1, cpu + t4)",
40
+ },
32
41
  }
33
42
  DEFAULT_ENVIRONMENT = "prod"
34
43
 
@@ -43,19 +52,33 @@ class Config:
43
52
  # Load unified config (handles migration from legacy files)
44
53
  self.user_config = self._load_config()
45
54
 
46
- # Get region: env vars take priority (for spot routing), then config, then default
55
+ # Active environment: GPU_DEV_ENVIRONMENT env wins (handy for tests/CI),
56
+ # then the persisted config, then the default. Its region/prefix back the
57
+ # fallbacks below so e.g. `GPU_DEV_ENVIRONMENT=staging` reaches us-west-1.
58
+ env_override = os.getenv("GPU_DEV_ENVIRONMENT")
59
+ env_name = env_override or self.user_config.get(
60
+ "environment", self.DEFAULT_ENVIRONMENT)
61
+ env_cfg = self.ENVIRONMENTS.get(env_name, {})
62
+
63
+ # Get region: AWS_* env vars take priority (for spot routing); then an
64
+ # explicit GPU_DEV_ENVIRONMENT switch uses that env's region (beating the
65
+ # persisted one); then the persisted config; then the env's region; finally us-east-2.
47
66
  env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
48
67
  if env_region and env_region != self.user_config.get("region"):
49
68
  self.aws_region = env_region
69
+ elif env_override and env_cfg.get("region"):
70
+ self.aws_region = env_cfg["region"]
50
71
  elif self.user_config.get("region"):
51
72
  self.aws_region = self.user_config["region"]
73
+ elif env_cfg.get("region"):
74
+ self.aws_region = env_cfg["region"]
52
75
  else:
53
76
  self.aws_region = "us-east-2"
54
77
 
55
78
  os.environ["AWS_DEFAULT_REGION"] = self.aws_region
56
79
 
57
- # Resource naming convention - no config needed!
58
- self.prefix = "pytorch-gpu-dev"
80
+ # Resource naming convention — per-environment prefix (default for prod).
81
+ self.prefix = env_cfg.get("prefix", "pytorch-gpu-dev")
59
82
 
60
83
  # Construct ARNs from convention
61
84
  self.queue_name = f"{self.prefix}-reservation-queue"
@@ -177,12 +177,14 @@ def _generate_cursor_command(ssh_command: str) -> Optional[str]:
177
177
  return None
178
178
 
179
179
 
180
- def _generate_ssh_config(hostname: str, pod_name: str) -> str:
180
+ def _generate_ssh_config(hostname: str, host_alias: str) -> str:
181
181
  """Generate SSH config for a reservation
182
182
 
183
183
  Args:
184
- hostname: The FQDN hostname (e.g., old_bison.devservers.io)
185
- pod_name: The pod name to use as SSH host alias
184
+ hostname: The FQDN hostname (e.g., old_bison.devservers.io). SSH routing
185
+ happens via this HostName (the ProxyCommand routes on the FQDN), so
186
+ host_alias is a purely local label.
187
+ host_alias: The local SSH host alias (e.g., gpu-dev-<resid8>)
186
188
 
187
189
  Returns:
188
190
  SSH config content as string
@@ -196,7 +198,7 @@ def _generate_ssh_config(hostname: str, pod_name: str) -> str:
196
198
  extra = " AddKeysToAgent yes\n"
197
199
  if sys.platform == "darwin":
198
200
  extra += " IgnoreUnknown UseKeychain\n UseKeychain yes\n"
199
- config_content = f"""Host {pod_name}
201
+ config_content = f"""Host {host_alias}
200
202
  HostName {hostname}
201
203
  User dev
202
204
  ForwardAgent yes
@@ -255,10 +257,10 @@ def _check_ssh_config_permission() -> bool:
255
257
  console.print("[dim] • ~/.cursor/ssh_config[/dim]")
256
258
  console.print("[dim]Line added: Include ~/.gpu-dev/*-sshconfig[/dim]\n")
257
259
  console.print("[green]Benefits:[/green]")
258
- console.print(" • Simple commands: [green]ssh <pod-name>[/green]")
259
- console.print(" • VS Code Remote works: [green]code --remote ssh-remote+<pod-name>[/green]")
260
+ console.print(" • Simple commands: [green]ssh gpu-dev-<reservation-id>[/green]")
261
+ console.print(" • VS Code Remote works: [green]code --remote ssh-remote+gpu-dev-<reservation-id>[/green]")
260
262
  console.print(" • Cursor Remote works: Open Remote SSH in Cursor")
261
- console.print("\n[dim]Without this, you'll need to use: [green]ssh -F ~/.gpu-dev/<id>-sshconfig <pod-name>[/green][/dim]")
263
+ console.print("\n[dim]Without this, you'll need to use: [green]ssh -F ~/.gpu-dev/<id>-sshconfig gpu-dev-<reservation-id>[/green][/dim]")
262
264
  console.print("[yellow]━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[/yellow]\n")
263
265
 
264
266
  approved = click.confirm("Add Include directive to SSH config files?", default=True)
@@ -326,7 +328,8 @@ def create_ssh_config_for_reservation(hostname: str, pod_name: str, reservation_
326
328
 
327
329
  Args:
328
330
  hostname: The FQDN hostname (e.g., old_bison.devservers.io)
329
- pod_name: The pod name to use as SSH host alias
331
+ pod_name: The k8s pod name (kept for API compat; no longer used for the
332
+ host alias — warm-claimed pods have a pod_name != gpu-dev-<resid8>)
330
333
  reservation_id: The reservation ID (full or short)
331
334
  name: Optional reservation name to use for filename (falls back to short ID)
332
335
 
@@ -346,8 +349,12 @@ def create_ssh_config_for_reservation(hostname: str, pod_name: str, reservation_
346
349
  short_id = reservation_id[:8]
347
350
  filename = f"{short_id}-sshconfig"
348
351
 
352
+ # Key the host alias off the reservation id (not pod_name) so warm-claimed pods,
353
+ # whose pod_name differs from gpu-dev-<resid8>, are still reachable as gpu-dev-<resid8>.
354
+ host_alias = f"gpu-dev-{short_id}"
355
+
349
356
  config_file = gpu_dev_dir / filename
350
- config_content = _generate_ssh_config(hostname, pod_name)
357
+ config_content = _generate_ssh_config(hostname, host_alias)
351
358
 
352
359
  try:
353
360
  config_file.write_text(config_content)
@@ -2220,10 +2227,11 @@ class ReservationManager:
2220
2227
  console.print(
2221
2228
  f"[yellow]⚠️ Could not create SSH config for node {node['index']+1}: {str(e)}[/yellow]")
2222
2229
 
2223
- # Show connection info
2230
+ # Show connection info (alias keys off the reservation id)
2231
+ node_alias = f"gpu-dev-{res_id[:8]}" if res_id else pod_name
2224
2232
  if config_path and pod_name and use_include:
2225
2233
  console.print(
2226
- f"[cyan]🖥️ Node {node['index']+1}:[/cyan] [green]ssh {pod_name}[/green]")
2234
+ f"[cyan]🖥️ Node {node['index']+1}:[/cyan] [green]ssh {node_alias}[/green]")
2227
2235
  else:
2228
2236
  ssh_command = res.get(
2229
2237
  "ssh_command", "ssh user@pending")
@@ -2321,27 +2329,29 @@ class ReservationManager:
2321
2329
  console.print(
2322
2330
  f"[yellow]⚠️ Could not create SSH config: {str(e)}[/yellow]")
2323
2331
 
2324
- # Show SSH command using config file if created, otherwise fallback
2332
+ # Show SSH command using config file if created, otherwise fallback.
2333
+ # Alias keys off the reservation id (works for warm-claimed pods too).
2334
+ host_alias = f"gpu-dev-{short_id}"
2325
2335
  if config_path and pod_name:
2326
2336
  if use_include:
2327
2337
  # User approved Include - show simple commands
2328
2338
  console.print(
2329
- f"[cyan]🖥️ SSH Command:[/cyan] [green]ssh {pod_name}[/green]")
2339
+ f"[cyan]🖥️ SSH Command:[/cyan] [green]ssh {host_alias}[/green]")
2330
2340
  # Create clickable VS Code link
2331
- vscode_url = _make_vscode_link(pod_name)
2332
- vscode_command = f"code --remote ssh-remote+{pod_name} /home/dev"
2341
+ vscode_url = _make_vscode_link(host_alias)
2342
+ vscode_command = f"code --remote ssh-remote+{host_alias} /home/dev"
2333
2343
  console.print(
2334
2344
  f"[cyan]💻 VS Code Remote:[/cyan] [link={vscode_url}][green]{vscode_command}[/green][/link]")
2335
2345
 
2336
2346
  # Create clickable Cursor link
2337
- cursor_url = _make_cursor_link(pod_name)
2338
- cursor_command = f"cursor --remote ssh-remote+{pod_name} /home/dev"
2347
+ cursor_url = _make_cursor_link(host_alias)
2348
+ cursor_command = f"cursor --remote ssh-remote+{host_alias} /home/dev"
2339
2349
  console.print(
2340
2350
  f"[cyan]🖥️ Cursor Remote:[/cyan] [link={cursor_url}][green]{cursor_command}[/green][/link]")
2341
2351
  else:
2342
2352
  # User declined Include - show commands with -F flag
2343
2353
  console.print(
2344
- f"[cyan]🖥️ SSH Command:[/cyan] [green]ssh -F {config_path} {pod_name}[/green]")
2354
+ f"[cyan]🖥️ SSH Command:[/cyan] [green]ssh -F {config_path} {host_alias}[/green]")
2345
2355
  console.print(
2346
2356
  f"[cyan]💻 VS Code/Cursor:[/cyan] Add [green]Include ~/.gpu-dev/*-sshconfig[/green] to ~/.ssh/config and ~/.cursor/ssh_config")
2347
2357
  console.print(