gpu-dev 0.7.5__tar.gz → 0.7.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. gpu_dev-0.7.10/.github/workflows/tests.yml +20 -0
  2. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/.gitignore +11 -0
  3. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/CLAUDE.md +89 -0
  4. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/PKG-INFO +6 -1
  5. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +165 -45
  6. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +26 -3
  7. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +28 -18
  8. gpu_dev-0.7.10/conftest.py +92 -0
  9. gpu_dev-0.7.10/docs/FAST_REPRO_DESIGN.md +141 -0
  10. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/SDK_REPRO.md +47 -4
  11. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/PKG-INFO +6 -1
  12. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/SOURCES.txt +55 -2
  13. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/requires.txt +6 -0
  14. gpu_dev-0.7.10/post-may-2026.md +185 -0
  15. gpu_dev-0.7.10/presentation/CLAUDE.md +220 -0
  16. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/cli-demo.html +5 -5
  17. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/gpu-fleet.html +5 -5
  18. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/k8s-under-the-hood.html +8 -8
  19. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/multinode.html +10 -10
  20. gpu_dev-0.7.10/presentation/pyproject.toml +33 -0
  21. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/sdk-demo.html +6 -6
  22. gpu_dev-0.7.10/presentation/teaser.html +317 -0
  23. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/pyproject.toml +17 -1
  24. gpu_dev-0.7.10/sdk/python/examples/parallel_experiments.ipynb +408 -0
  25. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/__init__.py +1 -1
  26. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_backend/aws.py +4 -1
  27. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/availability.tf +2 -1
  28. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/Dockerfile +18 -7
  29. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/bashrc +9 -1
  30. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/zshrc +5 -2
  31. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/git-cache.tf +2 -0
  32. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/kubernetes.tf +7 -2
  33. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/availability_updater/index.py +39 -3
  34. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +11 -0
  35. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_processor/index.py +216 -21
  36. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda.tf +16 -1
  37. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/mig-parted-config.yaml +15 -0
  38. gpu_dev-0.7.10/terraform-gpu-devservers/pytorch-ondemand.tf +178 -0
  39. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/pytorch-prebuild.tf +47 -3
  40. gpu_dev-0.7.10/tests/integration/README.md +35 -0
  41. gpu_dev-0.7.10/tests/integration/__init__.py +0 -0
  42. gpu_dev-0.7.10/tests/integration/conftest.py +131 -0
  43. gpu_dev-0.7.10/tests/integration/test_claude.py +40 -0
  44. gpu_dev-0.7.10/tests/integration/test_cpu_lifecycle.py +36 -0
  45. gpu_dev-0.7.10/tests/integration/test_repro_known_failure.py +54 -0
  46. gpu_dev-0.7.10/tests/integration/test_t4_lifecycle.py +39 -0
  47. gpu_dev-0.7.10/tests/integration/test_warm_pool.py +54 -0
  48. gpu_dev-0.7.10/tests/unit/__init__.py +0 -0
  49. gpu_dev-0.7.10/tests/unit/cli/__init__.py +0 -0
  50. gpu_dev-0.7.10/tests/unit/cli/test_auth.py +442 -0
  51. gpu_dev-0.7.10/tests/unit/cli/test_avail.py +295 -0
  52. gpu_dev-0.7.10/tests/unit/cli/test_cancel.py +380 -0
  53. gpu_dev-0.7.10/tests/unit/cli/test_config_cmd.py +187 -0
  54. gpu_dev-0.7.10/tests/unit/cli/test_config_module.py +476 -0
  55. gpu_dev-0.7.10/tests/unit/cli/test_connect.py +373 -0
  56. gpu_dev-0.7.10/tests/unit/cli/test_disks.py +747 -0
  57. gpu_dev-0.7.10/tests/unit/cli/test_edit.py +321 -0
  58. gpu_dev-0.7.10/tests/unit/cli/test_interactive.py +489 -0
  59. gpu_dev-0.7.10/tests/unit/cli/test_list_show.py +547 -0
  60. gpu_dev-0.7.10/tests/unit/cli/test_name_generator.py +272 -0
  61. gpu_dev-0.7.10/tests/unit/cli/test_repro.py +454 -0
  62. gpu_dev-0.7.10/tests/unit/cli/test_reservations_mgr.py +593 -0
  63. gpu_dev-0.7.10/tests/unit/cli/test_reserve.py +394 -0
  64. gpu_dev-0.7.10/tests/unit/cli/test_smoke.py +12 -0
  65. gpu_dev-0.7.10/tests/unit/cli/test_ssh_alias.py +130 -0
  66. gpu_dev-0.7.10/tests/unit/cli/test_submit.py +401 -0
  67. gpu_dev-0.7.10/tests/unit/lambda_fn/__init__.py +0 -0
  68. gpu_dev-0.7.10/tests/unit/lambda_fn/test_availability.py +488 -0
  69. gpu_dev-0.7.10/tests/unit/lambda_fn/test_cancellation.py +355 -0
  70. gpu_dev-0.7.10/tests/unit/lambda_fn/test_claim.py +348 -0
  71. gpu_dev-0.7.10/tests/unit/lambda_fn/test_mig_gpu_config.py +598 -0
  72. gpu_dev-0.7.10/tests/unit/lambda_fn/test_pod_resources.py +255 -0
  73. gpu_dev-0.7.10/tests/unit/lambda_fn/test_ref_staging.py +292 -0
  74. gpu_dev-0.7.10/tests/unit/lambda_fn/test_smoke.py +12 -0
  75. gpu_dev-0.7.10/tests/unit/lambda_fn/test_version_gate.py +178 -0
  76. gpu_dev-0.7.10/tests/unit/lambda_fn/test_warm_pool.py +682 -0
  77. gpu_dev-0.7.10/tests/unit/sdk/__init__.py +0 -0
  78. gpu_dev-0.7.10/tests/unit/sdk/test_backend_aws.py +790 -0
  79. gpu_dev-0.7.10/tests/unit/sdk/test_client.py +519 -0
  80. gpu_dev-0.7.10/tests/unit/sdk/test_errors_enums.py +308 -0
  81. gpu_dev-0.7.10/tests/unit/sdk/test_models_extra.py +361 -0
  82. gpu_dev-0.7.10/tests/unit/sdk/test_sandbox.py +352 -0
  83. gpu_dev-0.7.10/tests/unit/sdk/test_sdk_config.py +258 -0
  84. gpu_dev-0.7.10/tests/unit/sdk/test_transport_ssh.py +327 -0
  85. gpu_dev-0.7.5/sdk/python/examples/parallel_experiments.ipynb +0 -362
  86. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/.github/workflows/no-gitlinks.yml +0 -0
  87. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/.github/workflows/publish.yml +0 -0
  88. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/README.md +0 -0
  89. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/admin/README.md +0 -0
  90. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/admin/generate_stats.py +0 -0
  91. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/admin/requirements.txt +0 -0
  92. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/README.md +0 -0
  93. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  94. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  95. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  96. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  97. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  98. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  99. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  100. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  101. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  102. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/USER_GUIDE.md +0 -0
  103. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/devgpu-features.html +0 -0
  104. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/docker-mark-blue.svg +0 -0
  105. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/docs/icons8-cursor-ai.svg +0 -0
  106. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/dependency_links.txt +0 -0
  107. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/entry_points.txt +0 -0
  108. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/gpu_dev.egg-info/top_level.txt +0 -0
  109. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/architecture.html +0 -0
  110. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/devgpu-features.html +0 -0
  111. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/docker-mark-blue.svg +0 -0
  112. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/feedback.png +0 -0
  113. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/icons8-cursor-ai.svg +0 -0
  114. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/index.html +0 -0
  115. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/osdc-future-plans.html +0 -0
  116. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/problem.png +0 -0
  117. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/sandbox.html +0 -0
  118. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/thesis.html +0 -0
  119. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/title-vid.mp4 +0 -0
  120. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/weneedgpus.png +0 -0
  121. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/presentation/wow.html +0 -0
  122. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/README.md +0 -0
  123. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/batch_multi_gpu.py +0 -0
  124. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/interactive_debug.py +0 -0
  125. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/quickstart.ipynb +0 -0
  126. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/run_tests.py +0 -0
  127. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/examples/submit_job.py +0 -0
  128. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
  129. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  130. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
  131. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  132. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
  133. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
  134. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  135. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
  136. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  137. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/config.py +0 -0
  138. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/enums.py +0 -0
  139. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/errors.py +0 -0
  140. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/common/models.py +0 -0
  141. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/src/gpu_dev/py.typed +0 -0
  142. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/sdk/python/tests/test_models.py +0 -0
  143. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/setup.cfg +0 -0
  144. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-deck/backend.tf +0 -0
  145. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-deck/main.tf +0 -0
  146. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-deck/terraform.tfvars.example +0 -0
  147. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  148. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  149. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/README.md +0 -0
  150. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/alb.tf +0 -0
  151. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ami-baker.tf +0 -0
  152. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/backend.tf +0 -0
  153. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/build-node.tf +0 -0
  154. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/check_b200.py +0 -0
  155. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  156. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  157. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  158. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  159. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/bash_profile +0 -0
  160. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  161. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  162. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  163. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  164. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/motd_script +0 -0
  165. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  166. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/profile +0 -0
  167. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  168. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  169. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  170. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/shell_env +0 -0
  171. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/ssh_config +0 -0
  172. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/zprofile +0 -0
  173. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  174. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker-build.tf +0 -0
  175. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  176. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  177. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ecr.tf +0 -0
  178. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/efs.tf +0 -0
  179. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/eks.tf +0 -0
  180. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/expiry.tf +0 -0
  181. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  182. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  183. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  184. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  185. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  186. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  187. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  188. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  189. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  190. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  191. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  192. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  193. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  194. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/list_b200.py +0 -0
  195. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/main.tf +0 -0
  196. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/mig-config.tf +0 -0
  197. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  198. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  199. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  200. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  201. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  202. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/monitoring.tf +0 -0
  203. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  204. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/outputs.tf +0 -0
  205. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/pyproject.toml +0 -0
  206. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/queue.tf +0 -0
  207. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/route53.tf +0 -0
  208. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  209. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  210. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  211. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  212. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  213. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  214. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  215. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  216. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  217. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  218. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  219. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/switch-to.sh +0 -0
  220. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  221. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  222. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
  223. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  224. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  225. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/variables.tf +0 -0
  226. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/terraform-gpu-devservers/warm-pool.tf +0 -0
  227. {gpu_dev-0.7.5/sdk/python → gpu_dev-0.7.10}/tests/__init__.py +0 -0
  228. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/tests/submit/README.md +0 -0
  229. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/tests/submit/fail/run.sh +0 -0
  230. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/tests/submit/multinode/run.sh +0 -0
  231. {gpu_dev-0.7.5 → gpu_dev-0.7.10}/tests/submit/success/run.sh +0 -0
@@ -0,0 +1,20 @@
1
+ name: tests
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ unit:
9
+ name: unit + mocks
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+ - name: Install uv
14
+ uses: astral-sh/setup-uv@v5
15
+ with:
16
+ python-version: "3.12"
17
+ - name: Install package + test deps
18
+ run: uv pip install -e ".[test]"
19
+ - name: Run unit + mock tests (integration excluded)
20
+ run: uv run pytest -m "not integration"
@@ -73,3 +73,14 @@ lambda/*/package/
73
73
  admin/output/
74
74
 
75
75
  .claude/worktrees/
76
+ .claude/settings.local.json
77
+ .claude/scheduled_tasks.lock
78
+
79
+ # Org-specific (filled in locally; not committed)
80
+ docs/INTERNAL_AUTH.md
81
+
82
+ # Local scratch / staging terraform working dir
83
+ *.pid
84
+ terraform-gpu-devservers/staging/.terraform/
85
+ terraform-gpu-devservers/staging/__pycache__/
86
+ terraform-gpu-devservers/staging/*.log
@@ -28,6 +28,59 @@ For terraform, we use opentofu, don't ever run tf apply directly. You're free to
28
28
  - Group imports in standard order: standard library, third-party, local imports
29
29
  - Use absolute imports when possible
30
30
 
31
+ ## Testing (DO THIS FOR EVERY CHANGE)
32
+
33
+ There is a real test suite now. **Every change must keep it green and add or adjust
34
+ the matching tests.** Two tiers:
35
+
36
+ **1. Unit + mocks — ALWAYS run, must stay green (CI runs this on every push/PR).**
37
+ Fully mocked (boto3 / k8s / SSH / subprocess), no network, ~2s.
38
+ ```bash
39
+ uv pip install -e ".[test]" # one-time: pytest, moto, kubernetes
40
+ uv run pytest -m "not integration" # ~1140 tests; run before every commit
41
+ ```
42
+ - Layout: `tests/unit/{sdk,cli,lambda_fn}/test_*.py`; shared fixtures in the root
43
+ `conftest.py` (`cli_runner`, `lambda_index` = the lambda imported as `index`
44
+ with env pre-set, `aws_mocks` = MagicMock boto3 handles).
45
+ - When you touch CLI / SDK / lambda code, update or add the matching `test_*.py`.
46
+ - CI: `.github/workflows/tests.yml`. Lambda imports need env vars + sys.path — the
47
+ root `conftest.py` already sets both.
48
+
49
+ **2. e2e integration on STAGING — run for anything touching the
50
+ reserve/pod/SSH/lambda path before merging.** Real reservations on the **staging**
51
+ cluster (us-west-1), cpu + t4 only, auto-cancelled. Staging is the DEFAULT target
52
+ and github_user comes from your config, so the bare command is enough:
53
+ ```bash
54
+ uv run pytest -m integration --run-integration -v
55
+ ```
56
+ - Staging is the default (`GPU_DEV_TEST_ENV` defaults to `staging` → us-west-1,
57
+ standard `pytorch-gpu-dev-*` prefix, tf workspace `default`). The integration
58
+ conftest pins the region so the unit-test us-east-2 default can't leak in. Wired
59
+ in `cli-tools/.../config.py` ENVIRONMENTS.
60
+ - Covers: cpu-x86 + t4 reserve→active→cancel, list-while-active, exec
61
+ (`nproc`/`nvidia-smi`/`torch.cuda`), **`claude -p` answers "Paris"** (pod Claude
62
+ Code/Bedrock), and the **warm pool** (fast warm claim + custom-image
63
+ warm-ineligibility). Each cancels in a `finally` (no leaked pods).
64
+ - Warm-pool tests need `WARM_POOL_TARGETS` deployed on staging — set in
65
+ `lambda.tf` for the `default` workspace (`{t4, cpu-x86, cpu-arm}`). Staging IS the
66
+ tf `default` workspace (us-west-1, environment=test) — there is no `test`/`staging`
67
+ workspace: `tofu workspace select default && tofu apply`. Until then the warm
68
+ tests skip ("came up cold"). Custom-image test: set `GPU_DEV_TEST_IMAGE`.
69
+ - Repro test (`test_repro_known_failure.py`): set `GPU_DEV_REPRO_REF` +
70
+ `GPU_DEV_REPRO_TEST` to a known-red (commit, test). Find one with the
71
+ **treehugger MCP** (`hud`, user-scope — `get_hud_data`/`master_commit_red`).
72
+ Note: prebuilt torch is h100/b200 arch, so a CUDA test on t4 needs a full build;
73
+ prefer a failure that runs on the box's GPU or on cpu.
74
+ - Skips cleanly if staging is unreachable or the runner has no outbound SSH (e.g. a
75
+ sandbox). The reservation role can query tables and use SQS but lacks `DescribeTable`, so the
76
+ reachability probe uses scan+get-queue-url, not describe.
77
+ - Validated live (2026-05-31): cpu + t4 lifecycle PASS; warm-claim test confirmed
78
+ it reaches the real reserve path (skips until WARM_POOL_TARGETS is applied).
79
+
80
+ **Rule of thumb:** unit+mocks for *every* change; add e2e coverage when you add a
81
+ new command/flow; run the staging e2e before merging anything that could affect a
82
+ live reservation. Don't say "done/tested" without having run the relevant tier.
83
+
31
84
  ## Content
32
85
 
33
86
  - torchci - a next.js app containing a PyTorch CI tracker
@@ -51,6 +104,42 @@ Currently we're working on a developer servers with GPUs in AWS. This means we'l
51
104
 
52
105
  # AGENT SECTION
53
106
 
107
+ ## Fast-repro redesign — by-SHA artifact cache + on-demand build (2026-06-01)
108
+
109
+ Goal: `gpu-dev repro <ref>` for any pytorch commit from the last ~72h lands a built,
110
+ importable tree in <2min. Design: `docs/FAST_REPRO_DESIGN.md`. **All merged to main**
111
+ (PRs #186–#189); **needs `tofu apply` (prod, workspace `prod`) + image rebuild**.
112
+
113
+ - **by-SHA artifact cache** (#186): whole *built* trees keyed by commit SHA at
114
+ `/ccache_shared/prebuilt/by-sha/<sha>.tar.{zst,gz}` (`.sha` written last = the
115
+ completion gate). Cron seeds one per viable/strict bump (hardlink, no extra space).
116
+ `stage-pytorch` (cold `--ref`) + `gpu-dev repro` consume on hit → `import torch`
117
+ with ZERO build. `repro` also publishes its in-pod build via `publish-pytorch-build`
118
+ (detached) so the cache fills from real usage. All paths safe-fallback on miss;
119
+ `ls-remote` is `timeout 15`.
120
+ - **retention** (#188): prebuild cron prunes by-sha entries >72h every tick (storage
121
+ budget ~500-650GB on the elastic ccache EFS). The by-sha set IS the snapshot ladder.
122
+ - **mold linker** (#187): Dockerfile installs `mold`; cron + in-pod repro build wrap
123
+ with `mold -run` (guarded on `command -v mold`). Drops the libtorch_cuda.so relink
124
+ ~1-3min → ~15s. **Needs image rebuild** to activate (prod runs a stale image; that's
125
+ also why prod publishes gzip not zstd — the Dockerfile has zstd already).
126
+ - **on-demand build worker** (#189, `pytorch-ondemand.tf`): always-on Deployment on
127
+ NodeType=build drains `prebuilt/build-queue/<sha>.req` (own hostPath tree
128
+ `/mnt/ondemand-build` → builds at `/home/dev/pytorch` so build/ paths are
129
+ pod-compatible; mold+ccache), publishes by-sha, writes `.worker-alive` heartbeat.
130
+ `repro` enqueues + polls ONLY when the heartbeat is fresh (else straight to in-pod
131
+ build → zero regression if not deployed). Makes the FIRST repro of an uncached
132
+ commit fast. Coordination 100% via shared EFS — no new networking/RBAC/lambda.
133
+ - cuDNN fidelity (`USE_CUDNN=1`) DEFERRED — forcing it can fail the build if cuDNN
134
+ isn't found under cuda-13.2; needs prod e2e. Base image is cudnn9-devel.
135
+ - Fast path is **prod-arch only** (`sm_90;sm_100` = H100/B200); t4/staging is wrong-arch.
136
+ - Also: SSH alias now keys off reservation id not pod name (#185) so warm/repro pods
137
+ are reachable via `ssh gpu-dev-<resid>` / `connect` (routing is via the FQDN, the
138
+ alias is a local label). CCACHE_MAXSIZE settled at 250G (#184).
139
+ - Prod e2e: `gpu-dev repro <fresh-sha> <test> --gpu-type h100 --no-connect` (first =
140
+ off-pod build + stage; rerun = by-sha HIT zero build). Worker logs:
141
+ `k -n management logs deploy/pytorch-ondemand-builder -f`.
142
+
54
143
  ## Instant-sandboxes branch — WIP & things to fix (2026-05-29)
55
144
 
56
145
  Big push on warm pools + instant claims + prebuilt pytorch. Tracking state here so it's not lost.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.7.5
3
+ Version: 0.7.10
4
4
  Summary: CLI + Python SDK for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -15,6 +15,11 @@ Requires-Dist: questionary>=2.1.1
15
15
  Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
+ Provides-Extra: test
19
+ Requires-Dist: pytest>=7.4; extra == "test"
20
+ Requires-Dist: pytest-cov>=4.1; extra == "test"
21
+ Requires-Dist: moto[dynamodb,ec2,sqs]>=5.0; extra == "test"
22
+ Requires-Dist: kubernetes>=28.1; extra == "test"
18
23
 
19
24
  # GPU Developer CLI & SDK
20
25
 
@@ -319,6 +319,9 @@ def _show_single_reservation(connection_info: dict) -> None:
319
319
  reservation_id = connection_info["reservation_id"]
320
320
  reservation_name = connection_info.get("name")
321
321
  pod_name = connection_info.get("pod_name", "")
322
+ # SSH host alias keys off the reservation id (works for warm-claimed pods,
323
+ # whose pod_name != gpu-dev-<resid8>). pod_name is shown separately below.
324
+ host_alias = f"gpu-dev-{short_id}"
322
325
  ssh_config_path = get_ssh_config_path(reservation_id, reservation_name)
323
326
  use_include = is_ssh_include_enabled()
324
327
 
@@ -328,14 +331,14 @@ def _show_single_reservation(connection_info: dict) -> None:
328
331
  if use_include:
329
332
  # User approved Include - show simple commands
330
333
  from .reservations import _make_vscode_link
331
- ssh_command_display = f"[green]ssh {pod_name}[/green]"
332
- vscode_url = _make_vscode_link(pod_name)
333
- vscode_cmd_text = f"code --remote ssh-remote+{pod_name} /home/dev"
334
+ ssh_command_display = f"[green]ssh {host_alias}[/green]"
335
+ vscode_url = _make_vscode_link(host_alias)
336
+ vscode_cmd_text = f"code --remote ssh-remote+{host_alias} /home/dev"
334
337
  vscode_command_display = f"[link={vscode_url}][green]{vscode_cmd_text}[/green][/link]"
335
338
  vscode_info = f"[blue]VS Code Remote:[/blue] {vscode_command_display}\n"
336
339
  else:
337
340
  # User declined Include - show commands with -F flag
338
- ssh_command_display = f"[green]ssh -F {ssh_config_path} {pod_name}[/green]"
341
+ ssh_command_display = f"[green]ssh -F {ssh_config_path} {host_alias}[/green]"
339
342
  vscode_command_display = f"Add [green]Include ~/.gpu-dev/*-sshconfig[/green] to ~/.ssh/config and ~/.cursor/ssh_config (or: [green]gpu-dev config ssh-include enable[/green])"
340
343
  vscode_info = f"[blue]VS Code/Cursor:[/blue] {vscode_command_display}\n"
341
344
  else:
@@ -1523,12 +1526,19 @@ def reserve(
1523
1526
  @click.option("--gpu-type", default="b200", show_default=True, help="GPU type for the repro box.")
1524
1527
  @click.option("--gpus", type=int, default=1, show_default=True)
1525
1528
  @click.option("--hours", type=float, default=3.0, show_default=True,
1526
- help="Lifetime ceiling; the box auto-cancels when the test exits unless --keep.")
1529
+ help="Lifetime ceiling for the box.")
1530
+ @click.option("--no-connect", is_flag=True, default=False,
1531
+ help="CI mode: run the test, auto-cancel, exit code = test result. Default (on a TTY) drops you into the box to iterate.")
1527
1532
  @click.option("--keep", is_flag=True, default=False,
1528
- help="Keep the reservation after the test exits (default: auto-cancel).")
1533
+ help="Never cancel the box (skip the cancel prompt / auto-cancel).")
1529
1534
  @click.pass_context
1530
- def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
1531
- """Reserve a GPU, check out a PR/commit, run a test, then auto-cancel.
1535
+ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
1536
+ """Reserve a GPU, check out a PR/commit, run a test, then drop you into the box.
1537
+
1538
+ By default (in a terminal) repro runs the test and then **connects you into the
1539
+ box** at ~/pytorch — the ref is checked out, so you can fix and re-run. The box
1540
+ stays alive until you cancel it (you're prompted on exit). Use --no-connect for
1541
+ CI/scripts (run the test, auto-cancel, process exit code = the test result).
1532
1542
 
1533
1543
  REF: pr/<N>, #<N>, a bare PR number, a branch, or a commit sha. PRs use
1534
1544
  pull/<N>/merge (what CI tests), falling back to /head.
@@ -1539,6 +1549,7 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
1539
1549
  """
1540
1550
  import shlex
1541
1551
  import subprocess
1552
+ import sys
1542
1553
  config = load_config()
1543
1554
  reservation_mgr = ReservationManager(config)
1544
1555
  try:
@@ -1546,27 +1557,82 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
1546
1557
  except RuntimeError as e:
1547
1558
  rprint(f"[red]❌ {str(e)}[/red]"); return
1548
1559
 
1549
- # ref -> in-pod fetch+checkout (PRs prefer /merge = CI's view, fall back to /head)
1560
+ # Resolve the ref in-pod -> WANT (sha, for the by-sha cache) + FREF (fetch ref).
1561
+ # A MERGED pr/N reproduces the actual squash/merge commit on main (the real trunk
1562
+ # state that was red) — NOT pull/N/merge (the PR re-applied onto *current* trunk,
1563
+ # which goes green once the fix lands). Open PRs keep pull/N/merge (= CI's view).
1550
1564
  r = ref.strip(); prnum = None
1551
1565
  if r.startswith("pr/"): prnum = r[3:]
1552
1566
  elif r.startswith("#"): prnum = r[1:]
1553
1567
  elif r.isdigit(): prnum = r
1568
+ gh = "https://github.com/pytorch/pytorch.git"
1554
1569
  if prnum:
1555
- fetch = (f"git fetch origin pull/{prnum}/merge 2>/dev/null && git checkout -f FETCH_HEAD || "
1556
- f"{{ echo '[repro] no /merge ref, using /head'; git fetch origin pull/{prnum}/head && git checkout -f FETCH_HEAD; }}")
1570
+ api = f"https://api.github.com/repos/pytorch/pytorch/pulls/{prnum}"
1571
+ resolve = (
1572
+ f"PRJSON=$(curl -s -m 10 -H 'Accept: application/vnd.github+json' -H 'User-Agent: gpu-dev' {api} 2>/dev/null); "
1573
+ "MCS=$(printf '%s' \"$PRJSON\" | grep -oE '\"merge_commit_sha\": *\"[0-9a-f]+\"' | head -1 | cut -d'\"' -f4); "
1574
+ "if printf '%s' \"$PRJSON\" | grep -q '\"merged\": *true' && [ -n \"$MCS\" ]; then "
1575
+ f"WANT=\"$MCS\"; FREF=\"$MCS\"; echo \"[repro] pr/{prnum} is merged -> reproducing trunk commit $MCS\"; "
1576
+ f"else FREF=pull/{prnum}/merge; WANT=$(timeout 15 git ls-remote {gh} $FREF 2>/dev/null | head -1 | cut -f1); "
1577
+ f"[ -n \"$WANT\" ] || {{ FREF=pull/{prnum}/head; WANT=$(timeout 15 git ls-remote {gh} $FREF 2>/dev/null | head -1 | cut -f1); echo '[repro] open PR, no /merge -> /head'; }}; fi; ")
1557
1578
  else:
1558
1579
  rq = shlex.quote(r)
1559
- fetch = f"git fetch origin {rq} 2>/dev/null && git checkout -f FETCH_HEAD || git checkout -f {rq}"
1580
+ resolve = (f"FREF={rq}; WANT=$(timeout 15 git ls-remote {gh} {rq} 2>/dev/null | head -1 | cut -f1); "
1581
+ f"[ -n \"$WANT\" ] || case {rq} in *[!0-9a-fA-F]*) WANT= ;; *) WANT={rq} ;; esac; ")
1582
+ # in-pod fallback checkout (by-sha miss + farm unavailable): fetch the resolved ref,
1583
+ # else check out the sha directly (reachable for a merged-PR land commit / trunk).
1584
+ checkout = ("git fetch origin \"$FREF\" 2>/dev/null && git checkout -f FETCH_HEAD "
1585
+ "|| git checkout -f \"$WANT\" 2>/dev/null "
1586
+ "|| { git fetch --force origin 2>/dev/null && git checkout -f \"$WANT\"; }")
1560
1587
 
1561
1588
  testcmd = " ".join(shlex.quote(a) for a in test_args)
1589
+ # by-sha artifact cache: if a fully-built tree for the resolved SHA already exists
1590
+ # (shared EFS, seeded by the build node + prior repros), stage it -> ZERO build.
1591
+ # Otherwise build, then publish the result so the next dev (anyone) gets it instantly.
1562
1592
  remote = (
1563
1593
  "set -e; cd /home/dev/pytorch; "
1564
1594
  "git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true; "
1565
- f"echo '[repro] checkout {r}'; {fetch}; "
1595
+ "BYSHA=/ccache_shared/prebuilt/by-sha; QUEUE=/ccache_shared/prebuilt/build-queue; HIT=; "
1596
+ # bs <sha>: stage a fully-built by-sha tree into /home/dev/pytorch (zero build); 0 on success.
1597
+ # explicit ext check, not a glob: the pod login shell is zsh, where an unmatched glob is a hard error.
1598
+ # require the .sha completion gate (written last) so we never stage a half-published tarball.
1599
+ "bs() { local s=\"$1\" tb=; [ -f \"$BYSHA/$s.sha\" ] || return 1; for e in zst gz; do [ -f \"$BYSHA/$s.tar.$e\" ] && { tb=\"$BYSHA/$s.tar.$e\"; break; }; done; [ -n \"$tb\" ] || return 1; "
1600
+ "rm -rf /home/dev/pytorch.new; mkdir -p /home/dev/pytorch.new; "
1601
+ "case \"$tb\" in *.zst) zstd -dc \"$tb\" 2>/dev/null | tar -C /home/dev/pytorch.new --strip-components=1 -xf - 2>/dev/null ;; "
1602
+ "*) tar -C /home/dev/pytorch.new --strip-components=1 -xzf \"$tb\" 2>/dev/null ;; esac; "
1603
+ "[ -d /home/dev/pytorch.new/.git ] || { rm -rf /home/dev/pytorch.new; return 1; }; "
1604
+ "rm -rf /home/dev/pytorch; mv /home/dev/pytorch.new /home/dev/pytorch; return 0; }; "
1605
+ + resolve +
1606
+ "echo \"[repro] target ${WANT:-?}\"; "
1607
+ # 1) already cached -> stage it (zero build)
1608
+ "if [ -n \"$WANT\" ] && bs \"$WANT\"; then cd /home/dev/pytorch; HIT=1; echo '[repro] by-sha cache HIT -> staged prebuilt tree (zero build)'; fi; "
1609
+ # 2) not cached, build farm alive -> request an off-pod build, wait, then stage
1610
+ "if [ -z \"$HIT\" ] && [ -n \"$WANT\" ] && [ -n \"$(find \"$QUEUE/.worker-alive\" -mmin -2 2>/dev/null)\" ]; then "
1611
+ "echo \"[repro] no cached build; requesting off-pod build of $WANT (build farm; streaming progress)…\"; printf '%s\\n' \"$FREF\" > \"$QUEUE/$WANT.req\" 2>/dev/null || true; "
1612
+ # poll for the artifact; meanwhile tail the farm's build log (ninja [x/N]) so it's not a silent hang.
1613
+ "i=0; LL=0; while [ $i -lt 400 ]; do [ -f \"$BYSHA/$WANT.sha\" ] && break; [ -f \"$QUEUE/$WANT.req\" ] || break; "
1614
+ "if [ -f \"$QUEUE/$WANT.log\" ]; then NL=$(wc -l < \"$QUEUE/$WANT.log\" 2>/dev/null || echo 0); "
1615
+ "if [ \"$NL\" -gt \"$LL\" ]; then tail -n +$((LL+1)) \"$QUEUE/$WANT.log\" 2>/dev/null | grep -aE '\\[[0-9]+/[0-9]+\\]|Building wheel|Successfully built|error' | tail -1 | sed 's/^/ [farm] /'; LL=$NL; fi; fi; "
1616
+ "sleep 3; i=$((i+1)); done; "
1617
+ "if bs \"$WANT\"; then cd /home/dev/pytorch; HIT=1; echo '[repro] off-pod build ready -> staged (zero build)'; else echo '[repro] off-pod build unavailable, building locally'; fi; fi; "
1618
+ # 3) fall back to in-pod fetch + build (+ cache the result for the next dev)
1619
+ "if [ -z \"$HIT\" ]; then "
1620
+ "echo \"[repro] checking out $FREF\"; " + checkout + "; "
1566
1621
  "echo \"[repro] HEAD $(git rev-parse --short HEAD)\"; "
1567
1622
  "git -c protocol.file.allow=always submodule update --init --recursive --jobs 8 >/dev/null 2>&1 || true; "
1568
1623
  "if ! PYTHONPATH=/home/dev/pytorch python -c 'import torch' 2>/dev/null; then "
1569
- "echo '[repro] incremental rebuild on warm build/...'; pip install --break-system-packages -e . --no-build-isolation; fi; "
1624
+ "echo \"[repro] prebuilt torch != this commit -> rebuilding (ccache-accelerated, but the further this commit is from viable/strict, the more recompiles). checked-out: $(git log -1 --format='%h %ci')\"; "
1625
+ # mold -run routes the libtorch_cuda.so relink through mold (~15s vs minutes); guarded.
1626
+ # Explicit if/else (not `$M pip`): the pod login shell is zsh, which doesn't word-split
1627
+ # unquoted vars. -v streams the cmake/ninja [x/N] progress instead of pip's blind spinner.
1628
+ "if command -v mold >/dev/null 2>&1; then mold -run pip install --break-system-packages -e . --no-build-isolation -v; "
1629
+ "else pip install --break-system-packages -e . --no-build-isolation -v; fi; fi; "
1630
+ # cache this build for the next dev (detached so it survives the ssh session)
1631
+ "SHA=$(git rev-parse HEAD 2>/dev/null); "
1632
+ "if command -v publish-pytorch-build >/dev/null 2>&1 && [ -n \"$SHA\" ] && [ ! -f \"$BYSHA/$SHA.sha\" ]; then "
1633
+ "echo '[repro] caching this build (by-sha) for next time…'; "
1634
+ "setsid publish-pytorch-build \"$SHA\" >/dev/null 2>&1 < /dev/null & fi; "
1635
+ "fi; "
1570
1636
  f"echo '[repro] running: python {testcmd}'; "
1571
1637
  f"PYTHONPATH=/home/dev/pytorch python {testcmd}"
1572
1638
  )
@@ -1602,21 +1668,55 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, keep):
1602
1668
  if "StrictHostKeyChecking" not in ssh_cmd:
1603
1669
  ssh_cmd = ssh_cmd.replace("ssh ", "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR ", 1)
1604
1670
  rprint(f"[dim]→ {ssh_cmd}[/dim]\n")
1671
+ rid8 = str(rid)[:8]
1605
1672
  rc = 1
1606
1673
  try:
1607
1674
  rc = subprocess.run(f"{ssh_cmd} {shlex.quote(remote)}", shell=True).returncode
1608
1675
  except KeyboardInterrupt:
1609
- rprint("\n[yellow]interrupted[/yellow]")
1610
- finally:
1676
+ rprint("\n[yellow]interrupted[/yellow]"); rc = 130
1677
+
1678
+ verdict = "[green]✓ test passed[/green]" if rc == 0 else f"[red]✗ test failed (exit {rc})[/red]"
1679
+
1680
+ # Default (TTY): drop into the box so you can fix and re-run. --no-connect is the
1681
+ # CI path: auto-cancel and exit with the test's code.
1682
+ connect = (not no_connect) and sys.stdout.isatty()
1683
+ if connect:
1684
+ rprint(f"\n{verdict} — dropping you into the box at ~/pytorch ({ref} checked out).")
1685
+ rprint(f"[dim] re-run: python {testcmd}[/dim]")
1686
+ rprint(f"[dim] finish: gpu-dev cancel (from inside) • or exit this shell[/dim]\n")
1687
+ shell_cmd = f"{ssh_cmd} -t {shlex.quote('cd /home/dev/pytorch 2>/dev/null; exec ${SHELL:-bash} -l')}"
1688
+ try:
1689
+ subprocess.run(shell_cmd, shell=True)
1690
+ except KeyboardInterrupt:
1691
+ pass
1611
1692
  if keep:
1612
- rprint(f"[cyan]📌 kept {str(rid)[:8]} — gpu-dev connect {str(rid)[:8]} • gpu-dev cancel {str(rid)[:8]}[/cyan]")
1613
- else:
1693
+ rprint(f"[cyan]📌 left {rid8} running connect: gpu-dev connect {rid8} • cancel: gpu-dev cancel {rid8}[/cyan]")
1694
+ return
1695
+ try:
1696
+ drop = click.confirm(f"Cancel repro box {rid8}?", default=True)
1697
+ except (KeyboardInterrupt, EOFError, click.Abort):
1698
+ drop = False
1699
+ if drop:
1614
1700
  try:
1615
1701
  reservation_mgr.cancel_reservation(rid, user_info["user_id"])
1616
- rprint(f"[green]🧹 cancelled repro box {str(rid)[:8]}[/green]")
1702
+ rprint(f"[green]🧹 cancelled {rid8}[/green]")
1617
1703
  except Exception as e:
1618
- rprint(f"[yellow]auto-cancel failed for {str(rid)[:8]}: {e}[/yellow]")
1619
- rprint(f"\n[bold]repro exit code: {rc}[/bold]")
1704
+ rprint(f"[yellow]cancel failed for {rid8}: {e}[/yellow]")
1705
+ else:
1706
+ rprint(f"[cyan]📌 left {rid8} running — connect: gpu-dev connect {rid8} • cancel: gpu-dev cancel {rid8}[/cyan]")
1707
+ return
1708
+
1709
+ # --no-connect / non-TTY: auto-cancel unless --keep, exit code = test result.
1710
+ if keep:
1711
+ rprint(f"[cyan]📌 kept {rid8} — gpu-dev connect {rid8} • gpu-dev cancel {rid8}[/cyan]")
1712
+ else:
1713
+ try:
1714
+ reservation_mgr.cancel_reservation(rid, user_info["user_id"])
1715
+ rprint(f"[green]🧹 cancelled repro box {rid8}[/green]")
1716
+ except Exception as e:
1717
+ rprint(f"[yellow]auto-cancel failed for {rid8}: {e}[/yellow]")
1718
+ rprint(f"\n[bold]repro exit code: {rc}[/bold] ({verdict})")
1719
+ sys.exit(rc)
1620
1720
 
1621
1721
 
1622
1722
  _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100",
@@ -1837,7 +1937,9 @@ def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dock
1837
1937
  sys.exit(1)
1838
1938
  create_ssh_config_for_reservation(master_fqdn, master_pod, master_id, master_name)
1839
1939
 
1840
- ssh_alias = master_pod
1940
+ # Host alias matches the Host line written by create_ssh_config_for_reservation
1941
+ # (keyed off the reservation id, so warm-claimed masters resolve too).
1942
+ ssh_alias = f"gpu-dev-{master_id[:8]}"
1841
1943
  ssh_base = ["ssh", "-F", str(config_file), "-o", "StrictHostKeyChecking=accept-new"]
1842
1944
  rsync_e = " ".join(shlex.quote(x) for x in ssh_base)
1843
1945
 
@@ -3124,11 +3226,15 @@ def _show_direct_success(res: dict, elapsed: float) -> None:
3124
3226
  """Print the success block for an instant warm-pool claim,
3125
3227
  matching the normal reserve output (SSH config + VS Code/Cursor remote)."""
3126
3228
  from gpu_dev_cli.reservations import (
3127
- create_ssh_config_for_reservation, _generate_vscode_command, _generate_cursor_command)
3229
+ create_ssh_config_for_reservation, _generate_vscode_command,
3230
+ _generate_cursor_command, _make_vscode_link, _make_cursor_link)
3128
3231
  rid = res.get("reservation_id", "") or ""
3129
3232
  ssh_command = res.get("ssh_command", "") or ""
3130
3233
  pod_name = res.get("pod_name", "") or ""
3131
3234
  fqdn = res.get("fqdn") or ""
3235
+ # Host alias keys off the reservation id — warm-claimed pods have a pod_name
3236
+ # that is NOT gpu-dev-<resid8>, so we must not use pod_name as the ssh alias.
3237
+ host_alias = f"gpu-dev-{rid[:8]}" if rid else pod_name
3132
3238
 
3133
3239
  rprint(f"\n[green]✅ Instant reservation ready in {elapsed:.1f}s![/green]")
3134
3240
  rprint(f"[bold]📋 Reservation ID:[/bold] {rid}")
@@ -3137,24 +3243,28 @@ def _show_direct_success(res: dict, elapsed: float) -> None:
3137
3243
  if rid:
3138
3244
  rprint(f"[bold]⚡ Quick Connect:[/bold] gpu-dev connect {rid[:8]}")
3139
3245
 
3140
- # Build the per-reservation SSH config so `ssh <pod>` and connect work cleanly.
3246
+ # Build the per-reservation SSH config so `ssh gpu-dev-<resid8>` and connect work cleanly.
3141
3247
  use_include = False
3142
3248
  if fqdn and pod_name and rid:
3143
3249
  try:
3144
3250
  _cfg, use_include = create_ssh_config_for_reservation(fqdn, pod_name, rid, None)
3145
3251
  except Exception:
3146
3252
  pass
3147
- if pod_name and use_include:
3148
- rprint(f"[bold]🖥️ SSH Command:[/bold] ssh {pod_name}")
3149
- elif ssh_command:
3150
- rprint(f"[bold]🖥️ SSH Command:[/bold] {ssh_command}")
3151
-
3152
- vsc = _generate_vscode_command(ssh_command) if ssh_command else None
3153
- cur = _generate_cursor_command(ssh_command) if ssh_command else None
3154
- if vsc:
3155
- rprint(f"[bold]💻 VS Code Remote:[/bold] {vsc}")
3156
- if cur:
3157
- rprint(f"[bold]🖥️ Cursor Remote:[/bold] {cur}")
3253
+ if use_include and rid:
3254
+ rprint(f"[bold]🖥️ SSH Command:[/bold] ssh {host_alias}")
3255
+ vscode_url = _make_vscode_link(host_alias)
3256
+ cursor_url = _make_cursor_link(host_alias)
3257
+ rprint(f"[bold]💻 VS Code Remote:[/bold] [link={vscode_url}]code --remote ssh-remote+{host_alias} /home/dev[/link]")
3258
+ rprint(f"[bold]🖥️ Cursor Remote:[/bold] [link={cursor_url}]cursor --remote ssh-remote+{host_alias} /home/dev[/link]")
3259
+ else:
3260
+ if ssh_command:
3261
+ rprint(f"[bold]🖥️ SSH Command:[/bold] {ssh_command}")
3262
+ vsc = _generate_vscode_command(ssh_command) if ssh_command else None
3263
+ cur = _generate_cursor_command(ssh_command) if ssh_command else None
3264
+ if vsc:
3265
+ rprint(f"[bold]💻 VS Code Remote:[/bold] {vsc}")
3266
+ if cur:
3267
+ rprint(f"[bold]🖥️ Cursor Remote:[/bold] {cur}")
3158
3268
 
3159
3269
 
3160
3270
  def _format_gpu_display(gpu_count, gpu_type):
@@ -3343,15 +3453,22 @@ def _show_availability(show_spot: bool = False) -> None:
3343
3453
  spot_table = Table(title="⚡ Spot Instances (us-east-1, ~70% cheaper)")
3344
3454
  spot_table.add_column("GPU Type", style="cyan")
3345
3455
  spot_table.add_column("Avail\nNow", style="green")
3456
+ spot_table.add_column("In\nUse", style="yellow")
3346
3457
  spot_table.add_column("Per\nNode", style="bright_green")
3347
3458
  spot_table.add_column("Status", style="magenta")
3348
3459
  spot_table.add_column("Spot Discount", style="dim")
3349
3460
  _on_demand = {"b300": 95, "b200": 95, "h200": 55, "h100": 98, "a100": 32, "t4": 4.5, "l4": 7}
3350
3461
  for gt, info in sorted(spot_region_info.items()):
3351
3462
  avail = info.get("available", 0)
3463
+ total = info.get("total", 0)
3464
+ in_use = max(0, total - avail) # GPUs on up spot nodes already taken
3352
3465
  per_node = spot_gpus_per_node.get(gt, 8)
3353
3466
  avail_display = f"[green]{avail}[/green]" if avail > 0 else f"[dim]0[/dim]"
3354
- status = "[green]Node up[/green]" if avail > 0 else "Spins up on reserve (~10 min)"
3467
+ in_use_display = f"[yellow]{in_use}[/yellow]" if in_use > 0 else f"[dim]0[/dim]"
3468
+ if in_use > 0:
3469
+ status = "[yellow]Node up (in use)[/yellow]" if avail == 0 else "[green]Node up[/green]"
3470
+ else:
3471
+ status = "[green]Node up[/green]" if avail > 0 else "Spins up on reserve (~10 min)"
3355
3472
  si = info.get("spot_info", {}) or {}
3356
3473
  sp = si.get("spot_price", "") if isinstance(si, dict) else ""
3357
3474
  if not sp or (isinstance(si, dict) and "No spot data" in str(si.get("spot_signal", ""))):
@@ -3363,7 +3480,7 @@ def _show_availability(show_spot: bool = False) -> None:
3363
3480
  avail_signal = f"[green]{pct}% off on-demand[/green]" if pct > 0 else "[dim]At on-demand price[/dim]"
3364
3481
  except (ValueError, TypeError):
3365
3482
  avail_signal = "[yellow]Unknown[/yellow]"
3366
- spot_table.add_row(f"{gt.upper()} *", avail_display, str(per_node), status, avail_signal)
3483
+ spot_table.add_row(f"{gt.upper()} *", avail_display, in_use_display, str(per_node), status, avail_signal)
3367
3484
  console.print(spot_table)
3368
3485
  rprint("[dim]* = spot: ~70% cheaper, AWS can reclaim with 2-min notice, fulfillment not guaranteed.[/dim]")
3369
3486
  rprint("[dim] Separate cluster (us-east-1) with separate disks. Select via gpu-dev reserve (interactive).[/dim]")
@@ -3737,7 +3854,8 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3737
3854
  for node in nodes:
3738
3855
  status_display = "✅ Active" if node.get("status") == "active" else f"⏳ {node.get('status', 'unknown')}"
3739
3856
  pod_name = node.get("pod_name", "unknown")
3740
- ssh_cmd_short = f"ssh {pod_name}" if pod_name != "unknown" else "N/A"
3857
+ node_rid = node.get("reservation_id")
3858
+ ssh_cmd_short = f"ssh gpu-dev-{node_rid[:8]}" if node_rid else "N/A"
3741
3859
 
3742
3860
  table.add_row(
3743
3861
  f"Node {node.get('node_index', 0) + 1}",
@@ -3994,10 +4112,11 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
3994
4112
  )
3995
4113
 
3996
4114
  if config_path:
4115
+ node_alias = f"gpu-dev-{node_res_id[:8]}"
3997
4116
  if use_include:
3998
- rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh {pod_name}[/cyan]")
4117
+ rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh {node_alias}[/cyan]")
3999
4118
  else:
4000
- rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh -F {config_path} {pod_name}[/cyan]")
4119
+ rprint(f"[green]✅ Node {node_idx + 1}:[/green] [cyan]ssh -F {config_path} {node_alias}[/cyan]")
4001
4120
  else:
4002
4121
  rprint(f"[yellow]⚠️ Node {node_idx + 1}: Failed to create SSH config[/yellow]")
4003
4122
 
@@ -4025,12 +4144,13 @@ def get_ssh_config_cmd(ctx: click.Context, reservation_id: Optional[str]) -> Non
4025
4144
  )
4026
4145
 
4027
4146
  if config_path:
4147
+ host_alias = f"gpu-dev-{reservation_id[:8]}"
4028
4148
  rprint(f"[green]✅ SSH config created:[/green] [cyan]{config_path}[/cyan]\n")
4029
4149
  if use_include:
4030
- rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh {pod_name}[/cyan]")
4150
+ rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh {host_alias}[/cyan]")
4031
4151
  rprint(f"[dim] or:[/dim] [cyan]gpu-dev connect {reservation_id[:8]}[/cyan]")
4032
4152
  else:
4033
- rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh -F {config_path} {pod_name}[/cyan]")
4153
+ rprint(f"[green]🎉 You can now connect with:[/green] [cyan]ssh -F {config_path} {host_alias}[/cyan]")
4034
4154
  rprint(f"[dim] or:[/dim] [cyan]gpu-dev connect {reservation_id[:8]}[/cyan]")
4035
4155
  else:
4036
4156
  rprint("[red]❌ Failed to create SSH config[/red]")
@@ -4597,13 +4717,13 @@ def ssh_include(action: str):
4597
4717
 
4598
4718
  \b
4599
4719
  When enabled:
4600
- • Simple SSH commands: ssh <pod-name>
4601
- • VS Code Remote works: code --remote ssh-remote+<pod-name>
4720
+ • Simple SSH commands: ssh gpu-dev-<reservation-id>
4721
+ • VS Code Remote works: code --remote ssh-remote+gpu-dev-<reservation-id>
4602
4722
  • Cursor Remote works: Open Remote SSH in Cursor
4603
4723
 
4604
4724
  \b
4605
4725
  When disabled:
4606
- • Need -F flag: ssh -F ~/.gpu-dev/<id>-sshconfig <pod-name>
4726
+ • Need -F flag: ssh -F ~/.gpu-dev/<id>-sshconfig gpu-dev-<reservation-id>
4607
4727
  • VS Code/Cursor requires manual config setup
4608
4728
 
4609
4729
  \b
@@ -29,6 +29,15 @@ class Config:
29
29
  "description": "Spot-only us-east-1 environment (T4/L4/CPU)",
30
30
  "spot_types": ["b300", "b200", "h200", "h100", "a100", "t4", "l4", "rtxpro6000"],
31
31
  },
32
+ # Staging (us-west-1, tf "default" workspace, environment=test). Same
33
+ # standard resource prefix as prod, just a different region — so only the
34
+ # region changes. Live capacity: cpu-x86/arm + t4. Used for integration
35
+ # tests. Select via `GPU_DEV_ENVIRONMENT=staging` (or the "test" env alias).
36
+ "staging": {
37
+ "region": "us-west-1",
38
+ "workspace": "default",
39
+ "description": "Staging (us-west-1, cpu + t4)",
40
+ },
32
41
  }
33
42
  DEFAULT_ENVIRONMENT = "prod"
34
43
 
@@ -43,19 +52,33 @@ class Config:
43
52
  # Load unified config (handles migration from legacy files)
44
53
  self.user_config = self._load_config()
45
54
 
46
- # Get region: env vars take priority (for spot routing), then config, then default
55
+ # Active environment: GPU_DEV_ENVIRONMENT env wins (handy for tests/CI),
56
+ # then the persisted config, then the default. Its region/prefix back the
57
+ # fallbacks below so e.g. `GPU_DEV_ENVIRONMENT=staging` reaches us-west-1.
58
+ env_override = os.getenv("GPU_DEV_ENVIRONMENT")
59
+ env_name = env_override or self.user_config.get(
60
+ "environment", self.DEFAULT_ENVIRONMENT)
61
+ env_cfg = self.ENVIRONMENTS.get(env_name, {})
62
+
63
+ # Get region: AWS_* env vars take priority (for spot routing); then an
64
+ # explicit GPU_DEV_ENVIRONMENT switch uses that env's region (beating the
65
+ # persisted one); then the persisted config; then the env's region; default.
47
66
  env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
48
67
  if env_region and env_region != self.user_config.get("region"):
49
68
  self.aws_region = env_region
69
+ elif env_override and env_cfg.get("region"):
70
+ self.aws_region = env_cfg["region"]
50
71
  elif self.user_config.get("region"):
51
72
  self.aws_region = self.user_config["region"]
73
+ elif env_cfg.get("region"):
74
+ self.aws_region = env_cfg["region"]
52
75
  else:
53
76
  self.aws_region = "us-east-2"
54
77
 
55
78
  os.environ["AWS_DEFAULT_REGION"] = self.aws_region
56
79
 
57
- # Resource naming convention - no config needed!
58
- self.prefix = "pytorch-gpu-dev"
80
+ # Resource naming convention — per-environment prefix (default for prod).
81
+ self.prefix = env_cfg.get("prefix", "pytorch-gpu-dev")
59
82
 
60
83
  # Construct ARNs from convention
61
84
  self.queue_name = f"{self.prefix}-reservation-queue"