gpu-dev 0.7.10__tar.gz → 0.7.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232) hide show
  1. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/PKG-INFO +1 -1
  2. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +51 -4
  3. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +13 -1
  4. gpu_dev-0.7.12/docs/GPU_DEV_SUBMIT.md +89 -0
  5. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/PKG-INFO +1 -1
  6. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/SOURCES.txt +2 -0
  7. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/pyproject.toml +1 -1
  8. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/Dockerfile +3 -2
  9. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_processor/index.py +63 -40
  10. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_submit.py +47 -1
  11. gpu_dev-0.7.12/tests/unit/lambda_fn/test_finalize_no_ssh.py +24 -0
  12. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/.github/workflows/no-gitlinks.yml +0 -0
  13. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/.github/workflows/publish.yml +0 -0
  14. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/.github/workflows/tests.yml +0 -0
  15. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/.gitignore +0 -0
  16. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/CLAUDE.md +0 -0
  17. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/README.md +0 -0
  18. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/admin/README.md +0 -0
  19. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/admin/generate_stats.py +0 -0
  20. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/admin/requirements.txt +0 -0
  21. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/README.md +0 -0
  22. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  23. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  24. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  25. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  26. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  27. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  28. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  29. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  30. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  31. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  32. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/conftest.py +0 -0
  33. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/FAST_REPRO_DESIGN.md +0 -0
  34. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/SDK_REPRO.md +0 -0
  35. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/USER_GUIDE.md +0 -0
  36. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/devgpu-features.html +0 -0
  37. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/docker-mark-blue.svg +0 -0
  38. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/docs/icons8-cursor-ai.svg +0 -0
  39. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/dependency_links.txt +0 -0
  40. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/entry_points.txt +0 -0
  41. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/requires.txt +0 -0
  42. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/gpu_dev.egg-info/top_level.txt +0 -0
  43. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/post-may-2026.md +0 -0
  44. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/CLAUDE.md +0 -0
  45. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/architecture.html +0 -0
  46. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/cli-demo.html +0 -0
  47. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/devgpu-features.html +0 -0
  48. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/docker-mark-blue.svg +0 -0
  49. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/feedback.png +0 -0
  50. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/gpu-fleet.html +0 -0
  51. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/icons8-cursor-ai.svg +0 -0
  52. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/index.html +0 -0
  53. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/k8s-under-the-hood.html +0 -0
  54. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/multinode.html +0 -0
  55. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/osdc-future-plans.html +0 -0
  56. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/problem.png +0 -0
  57. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/pyproject.toml +0 -0
  58. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/sandbox.html +0 -0
  59. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/sdk-demo.html +0 -0
  60. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/teaser.html +0 -0
  61. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/thesis.html +0 -0
  62. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/title-vid.mp4 +0 -0
  63. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/weneedgpus.png +0 -0
  64. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/presentation/wow.html +0 -0
  65. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/README.md +0 -0
  66. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/batch_multi_gpu.py +0 -0
  67. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/interactive_debug.py +0 -0
  68. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/parallel_experiments.ipynb +0 -0
  69. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/quickstart.ipynb +0 -0
  70. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/run_tests.py +0 -0
  71. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/examples/submit_job.py +0 -0
  72. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/__init__.py +0 -0
  73. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
  74. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  75. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_backend/aws.py +0 -0
  76. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
  77. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  78. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
  79. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
  80. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  81. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
  82. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  83. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/config.py +0 -0
  84. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/enums.py +0 -0
  85. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/errors.py +0 -0
  86. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/common/models.py +0 -0
  87. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/src/gpu_dev/py.typed +0 -0
  88. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/sdk/python/tests/test_models.py +0 -0
  89. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/setup.cfg +0 -0
  90. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-deck/backend.tf +0 -0
  91. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-deck/main.tf +0 -0
  92. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-deck/terraform.tfvars.example +0 -0
  93. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  94. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  95. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/README.md +0 -0
  96. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/alb.tf +0 -0
  97. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ami-baker.tf +0 -0
  98. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/availability.tf +0 -0
  99. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/backend.tf +0 -0
  100. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/build-node.tf +0 -0
  101. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/check_b200.py +0 -0
  102. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  103. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  104. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  105. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  106. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/bash_profile +0 -0
  107. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/bashrc +0 -0
  108. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  109. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  110. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  111. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  112. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/motd_script +0 -0
  113. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  114. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/profile +0 -0
  115. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  116. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  117. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  118. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/shell_env +0 -0
  119. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/ssh_config +0 -0
  120. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/zprofile +0 -0
  121. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/zshrc +0 -0
  122. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  123. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker-build.tf +0 -0
  124. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  125. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  126. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ecr.tf +0 -0
  127. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/efs.tf +0 -0
  128. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/eks.tf +0 -0
  129. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/expiry.tf +0 -0
  130. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/git-cache.tf +0 -0
  131. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  132. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/kubernetes.tf +0 -0
  133. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  134. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  135. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  136. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  137. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  138. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  139. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  140. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  141. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  142. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  143. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  144. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  145. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  146. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  147. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/lambda.tf +0 -0
  148. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/list_b200.py +0 -0
  149. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/main.tf +0 -0
  150. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/mig-config.tf +0 -0
  151. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  152. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  153. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  154. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  155. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  156. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  157. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/monitoring.tf +0 -0
  158. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  159. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/outputs.tf +0 -0
  160. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/pyproject.toml +0 -0
  161. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/pytorch-ondemand.tf +0 -0
  162. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/pytorch-prebuild.tf +0 -0
  163. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/queue.tf +0 -0
  164. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/route53.tf +0 -0
  165. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  166. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  167. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  168. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  169. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  170. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  171. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  172. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  173. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  174. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  175. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  176. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/switch-to.sh +0 -0
  177. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  178. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  179. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
  180. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  181. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  182. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/variables.tf +0 -0
  183. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/terraform-gpu-devservers/warm-pool.tf +0 -0
  184. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/__init__.py +0 -0
  185. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/README.md +0 -0
  186. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/__init__.py +0 -0
  187. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/conftest.py +0 -0
  188. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_claude.py +0 -0
  189. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_cpu_lifecycle.py +0 -0
  190. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_repro_known_failure.py +0 -0
  191. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_t4_lifecycle.py +0 -0
  192. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/integration/test_warm_pool.py +0 -0
  193. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/submit/README.md +0 -0
  194. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/submit/fail/run.sh +0 -0
  195. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/submit/multinode/run.sh +0 -0
  196. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/submit/success/run.sh +0 -0
  197. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/__init__.py +0 -0
  198. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/__init__.py +0 -0
  199. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_auth.py +0 -0
  200. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_avail.py +0 -0
  201. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_cancel.py +0 -0
  202. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_config_cmd.py +0 -0
  203. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_config_module.py +0 -0
  204. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_connect.py +0 -0
  205. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_disks.py +0 -0
  206. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_edit.py +0 -0
  207. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_interactive.py +0 -0
  208. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_list_show.py +0 -0
  209. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_name_generator.py +0 -0
  210. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_repro.py +0 -0
  211. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_reservations_mgr.py +0 -0
  212. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_reserve.py +0 -0
  213. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_smoke.py +0 -0
  214. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/cli/test_ssh_alias.py +0 -0
  215. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/__init__.py +0 -0
  216. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_availability.py +0 -0
  217. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_cancellation.py +0 -0
  218. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_claim.py +0 -0
  219. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_mig_gpu_config.py +0 -0
  220. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_pod_resources.py +0 -0
  221. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_ref_staging.py +0 -0
  222. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_smoke.py +0 -0
  223. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_version_gate.py +0 -0
  224. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/lambda_fn/test_warm_pool.py +0 -0
  225. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/__init__.py +0 -0
  226. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_backend_aws.py +0 -0
  227. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_client.py +0 -0
  228. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_errors_enums.py +0 -0
  229. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_models_extra.py +0 -0
  230. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_sandbox.py +0 -0
  231. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_sdk_config.py +0 -0
  232. {gpu_dev-0.7.10 → gpu_dev-0.7.12}/tests/unit/sdk/test_transport_ssh.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.7.10
3
+ Version: 0.7.12
4
4
  Summary: CLI + Python SDK for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1724,6 +1724,47 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
1724
1724
  "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86", "cpu-spot"]
1725
1725
 
1726
1726
 
1727
+ def _build_submit_remote_script(workdir: str, remote_cmd: str, ref: Optional[str],
1728
+ no_build: bool) -> str:
1729
+ """Build the remote shell script `submit` runs over SSH (under `bash -lc`).
1730
+
1731
+ Without --ref this is just `cd <workdir> && <cmd>`. With --ref the pytorch
1732
+ tree is staged in the *background* in-pod (stage-pytorch &), and the tree is
1733
+ only chowned to dev + the ref fully checked out at the very end. Running the
1734
+ user command before that finishes is the footgun Driss hit: a root-owned tree
1735
+ (git "dubious ownership") and a source/installed-torch mismatch (the ref is
1736
+ checked out but the prebuilt .so is the stale base build -> `import torch`
1737
+ fails). So with --ref we prepend a preamble that:
1738
+ 1. waits for staging to finish (`.pytorch-staging` marker removed at end),
1739
+ 2. marks /home/dev/pytorch a git safe.directory for the dev user,
1740
+ 3. unless --no-build, rebuilds incrementally so installed torch == the
1741
+ checked-out source (warm build/ -> ~tens of seconds; a rebuild failure
1742
+ exits 90 before the user command runs).
1743
+ The rebuild/safe.directory only touch pytorch when staging actually ran
1744
+ (`.pytorch-ready` present), so --disk reservations (ref ignored, no staging)
1745
+ are unaffected.
1746
+ """
1747
+ import shlex
1748
+ cd_run = f"cd {shlex.quote(workdir)} && {remote_cmd}"
1749
+ if not ref:
1750
+ return cd_run
1751
+ lines = [
1752
+ 'if [ -e /home/dev/.pytorch-staging ]; then',
1753
+ ' echo "[gpu-dev] waiting for background pytorch --ref staging to finish…"',
1754
+ ' for _i in $(seq 1 3600); do [ -e /home/dev/.pytorch-staging ] || break; sleep 1; done',
1755
+ 'fi',
1756
+ 'if [ -f /home/dev/.pytorch-ready ]; then',
1757
+ ' git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true',
1758
+ ]
1759
+ if not no_build:
1760
+ lines += [
1761
+ ' echo "[gpu-dev] rebuilding torch to match --ref (pip install -e . --no-build-isolation)…"',
1762
+ ' ( cd /home/dev/pytorch && pip install -e . --no-build-isolation ) || { echo "[gpu-dev] torch rebuild failed"; exit 90; }',
1763
+ ]
1764
+ lines += ['fi', cd_run]
1765
+ return "\n".join(lines)
1766
+
1767
+
1727
1768
  @main.command(context_settings={"ignore_unknown_options": True})
1728
1769
  @click.option("--gpu-type", type=click.Choice(_SUBMIT_GPU_TYPES, case_sensitive=False), default="a100", show_default=True)
1729
1770
  @click.option("--gpus", type=int, default=1, show_default=True, help="GPU count (multinode if > per-node max).")
@@ -1743,6 +1784,8 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
1743
1784
  @click.option("--runtime", type=click.Path(exists=True, file_okay=False, resolve_path=True), default=None,
1744
1785
  help="Local directory to rsync to /workspace/submit-<id>/ on master node before run.")
1745
1786
  @click.option("--no-pull", is_flag=True, help="Skip syncing the remote workspace back to --runtime after the job finishes.")
1787
+ @click.option("--no-build", is_flag=True,
1788
+ help="With --ref, skip the incremental torch rebuild before the command (Python-only PRs / quick checks). Default: rebuild so `import torch` reflects the ref.")
1746
1789
  @click.option("--keep-alive", is_flag=True, help="Don't cancel the reservation when the job exits.")
1747
1790
  @click.option("--name", type=str, default=None, help="Reservation name.")
1748
1791
  @click.option("--timeout", type=int, default=24 * 60, show_default=True,
@@ -1750,7 +1793,7 @@ _SUBMIT_GPU_TYPES = ["b300", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g"
1750
1793
  @click.argument("command", nargs=-1, required=True)
1751
1794
  @click.pass_context
1752
1795
  def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dockerfile, dockerimage, preserve_entrypoint,
1753
- runtime, no_pull, keep_alive, name, timeout, command):
1796
+ runtime, no_pull, no_build, keep_alive, name, timeout, command):
1754
1797
  """Submit a job: reserve, sync code, run, sync results back, auto-cancel.
1755
1798
 
1756
1799
  \b
@@ -1961,11 +2004,15 @@ def submit(ctx, gpu_type, gpus, hours, disk, ref, no_persistent_disk, spot, dock
1961
2004
  else:
1962
2005
  workdir = "/home/dev"
1963
2006
 
1964
- # Run remote command via login shell so MULTINODE_* etc. are loaded
2007
+ # Run remote command via login shell so MULTINODE_* etc. are loaded. With
2008
+ # --ref, the script first waits for background pytorch staging + rebuilds
2009
+ # so `import torch` matches the checked-out ref (see helper docstring).
1965
2010
  remote_cmd = " ".join(shlex.quote(c) for c in command)
1966
2011
  rprint(f"[cyan]🚀 Running on {ssh_alias}: {remote_cmd}[/cyan]\n")
1967
- ssh_run = ssh_base + [ssh_alias,
1968
- f"cd {shlex.quote(workdir)} && bash -lc {shlex.quote(remote_cmd)}"]
2012
+ if ref and not no_build:
2013
+ rprint("[dim] (--ref: will wait for staging + rebuild torch first; pass --no-build to skip)[/dim]")
2014
+ remote_script = _build_submit_remote_script(workdir, remote_cmd, ref, no_build)
2015
+ ssh_run = ssh_base + [ssh_alias, f"bash -lc {shlex.quote(remote_script)}"]
1969
2016
  rc = subprocess.call(ssh_run)
1970
2017
  rprint(f"\n[dim]Job exited with code {rc}[/dim]")
1971
2018
 
@@ -55,11 +55,23 @@ async def tunnel_ssh(target_host: str, target_port: int):
55
55
  # WebSocket URL - wss:// for secure WebSocket
56
56
  ws_url = f"wss://{proxy_host}/tunnel/{target_host}"
57
57
 
58
+ # Verify TLS against certifi's CA bundle. The default SSL context uses the OS
59
+ # trust store, which on macOS python.org builds is often empty
60
+ # ("unable to get local issuer certificate" / CERTIFICATE_VERIFY_FAILED).
61
+ # certifi ships the Mozilla bundle, so this works without the manual
62
+ # "Install Certificates.command" step.
63
+ ssl_ctx = ssl_module.create_default_context()
64
+ try:
65
+ import certifi
66
+ ssl_ctx.load_verify_locations(certifi.where())
67
+ except Exception:
68
+ pass # fall back to the default trust store
69
+
58
70
  last_exc = None
59
71
  for attempt in range(MAX_RETRIES):
60
72
  try:
61
73
  async with websockets.connect(
62
- ws_url, open_timeout=20,
74
+ ws_url, ssl=ssl_ctx, open_timeout=20,
63
75
  ping_interval=30, ping_timeout=10,
64
76
  ) as websocket:
65
77
  # Set up stdin/stdout for SSH
@@ -0,0 +1,89 @@
1
+ # `gpu-dev submit` — guide & footguns
2
+
3
+ `gpu-dev submit` reserves a box, (optionally) rsyncs a local dir up, runs your
4
+ command over SSH, syncs results back, and auto-cancels. It's the non-interactive
5
+ sibling of `gpu-dev reserve` — good for CI-style validation, one-shot test runs,
6
+ and scripted repros.
7
+
8
+ ```bash
9
+ # run a script in a local dir on 1x H100, sync results back, auto-cancel
10
+ gpu-dev submit --runtime ./ --gpu-type h100 -- bash run.sh
11
+
12
+ # validate a PyTorch PR's tests on H100 (stages + builds the PR for you)
13
+ gpu-dev submit --gpu-type h100 --no-persistent-disk --ref pr/186015 -- \
14
+ python test/test_foo.py -k some_test
15
+
16
+ # keep the box after the job (debug a failure interactively)
17
+ gpu-dev submit --keep-alive --gpu-type h100 -- pytest test/test_x.py
18
+ ```
19
+
20
+ Exit code = your command's exit code (so it composes in scripts/CI). The one exception: with `--ref`, a failed torch rebuild exits **90** before your command runs.
21
+
22
+ ---
23
+
24
+ ## Footguns (read before your first `--ref` run)
25
+
26
+ ### 1. `--ref` stages PyTorch in the background — `submit` now waits for it
27
+ With `--ref`, the in-pod startup checks out your ref into `/home/dev/pytorch`
28
+ **in the background** and only chowns the tree to `dev` + finishes the checkout
29
+ at the very end. Historically `submit` could SSH in and run your command before
30
+ that finished, so you'd hit:
31
+ - a **root-owned** `/home/dev/pytorch` (git: *"detected dubious ownership"*), and
32
+ - a **source/installed-torch mismatch** → `import torch` fails (the ref source is
33
+ checked out but the importable `.so` is still the stale prebuilt base).
34
+
35
+ `submit` now **waits for staging to complete**, marks the tree a git
36
+ `safe.directory`, and (by default) **rebuilds incrementally** so the installed
37
+ torch matches the checked-out ref before your command runs. You don't need the
38
+ `sudo chown` / `safe.directory` workaround anymore.
39
+
40
+ ### 2. `--ref` rebuilds torch by default — use `--no-build` to skip
41
+ The dropped-in `build/` + `.so` come from the **base** tree, not your ref. To make
42
+ `import torch` reflect your ref's compiled (C++/CUDA) changes, `submit --ref`
43
+ runs `pip install -e . --no-build-isolation` (incremental, warm `build/` →
44
+ typically tens of seconds; a cold/cross-arch build is much longer).
45
+
46
+ - Pass **`--no-build`** for Python-only PRs or quick checks — skips the rebuild
47
+ (import still works; it just won't include compiled changes).
48
+ - A rebuild failure exits **90** *before* your command runs (so a broken build
49
+ doesn't masquerade as a test failure).
50
+
51
+ ### 3. Prebuilt fast path is **prod-arch only** (H100 / B200)
52
+ The by-SHA / viable-strict prebuilt trees are compiled for `sm_90;sm_100`
53
+ (H100/B200). On other GPU types (t4, a100, l4, …) or staging there's no matching
54
+ prebuilt, so `--ref` falls back to a **full from-scratch build** — slow. Validate
55
+ ref-based jobs on `--gpu-type h100` (or `b200`).
56
+
57
+ ### 4. `--ref` is ignored with `--disk`
58
+ A persistent disk brings its own `/home/dev/pytorch`; `--ref` does **not** stage
59
+ onto a `--disk` reservation (and `submit` won't rebuild it). Use
60
+ `--no-persistent-disk` (or omit `--disk`) when you want a ref staged.
61
+
62
+ ### 5. `--preserve-entrypoint` needs SSH
63
+ `submit` runs your command over SSH, so a custom image with
64
+ `--preserve-entrypoint` must still expose the SSH harness or `submit` can't reach
65
+ it. For pure entrypoint containers, use `reserve`, not `submit`.
66
+
67
+ ### 6. Results sync-back is best-effort
68
+ With `--runtime`, output is rsync'd back to your local dir when the job exits
69
+ (unless `--no-pull`). If the box dies mid-job (spot reclaim, expiry) the sync-back
70
+ may be partial — you'll see a warning. For long jobs prefer `--keep-alive` and
71
+ pull manually, or write important artifacts to `/shared-personal` (persists
72
+ across reservations).
73
+
74
+ ### 7. `--hours` is a ceiling, not the runtime
75
+ It's the reservation lifetime cap; the reservation auto-cancels as soon as your command
76
+ exits (unless `--keep-alive`). Set it high enough that queueing + build + run fit.
77
+
78
+ ---
79
+
80
+ ## Finding footguns early
81
+
82
+ - `gpu-dev submit --keep-alive … -- true` then `gpu-dev connect <id>` — get a
83
+ box in the exact submit state and poke around before committing a real run.
84
+ - With `--ref`, watch staging directly: `tail -f /home/dev/.pytorch-staging.log`
85
+ in the pod; `.pytorch-ready` (HEAD sha) is written when staging is done.
86
+ - `python -c "import torch; print(torch.__file__, torch.version.git_version)"`
87
+ confirms which torch you're actually importing vs. the ref you asked for.
88
+
89
+ Found a new one? Add it here and ping `oncall:pytorch_release_engineering`.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.7.10
3
+ Version: 0.7.12
4
4
  Summary: CLI + Python SDK for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -24,6 +24,7 @@ cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py
24
24
  cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py
25
25
  cli-tools/scripts/clear_stale_disk_locks.py
26
26
  docs/FAST_REPRO_DESIGN.md
27
+ docs/GPU_DEV_SUBMIT.md
27
28
  docs/SDK_REPRO.md
28
29
  docs/USER_GUIDE.md
29
30
  docs/devgpu-features.html
@@ -212,6 +213,7 @@ tests/unit/lambda_fn/__init__.py
212
213
  tests/unit/lambda_fn/test_availability.py
213
214
  tests/unit/lambda_fn/test_cancellation.py
214
215
  tests/unit/lambda_fn/test_claim.py
216
+ tests/unit/lambda_fn/test_finalize_no_ssh.py
215
217
  tests/unit/lambda_fn/test_mig_gpu_config.py
216
218
  tests/unit/lambda_fn/test_pod_resources.py
217
219
  tests/unit/lambda_fn/test_ref_staging.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.7.10"
7
+ version = "0.7.12"
8
8
  description = "CLI + Python SDK for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -148,12 +148,13 @@ COPY ssh_config /etc/ssh/sshd_config
148
148
  # Bump CLAUDE_CODE_BUILD to bust the layer cache and re-fetch the latest Claude Code
149
149
  # (the installer always grabs latest; without a bump Docker reuses the cached layer).
150
150
  USER root
151
- ARG CLAUDE_CODE_BUILD=2026-05-29
151
+ ARG CLAUDE_CODE_BUILD=2026-06-09
152
152
  RUN echo "Claude Code build marker: $CLAUDE_CODE_BUILD" && \
153
153
  curl -fsSL https://claude.ai/install.sh | HOME=/opt/claude bash || echo "Claude native install failed (non-fatal at build time)"
154
154
  RUN if [ -e /opt/claude/.local/bin/claude ]; then \
155
155
  ln -sf /opt/claude/.local/bin/claude /usr/local/bin/claude; \
156
156
  chmod -R a+rX /opt/claude; \
157
+ echo "Installed Claude Code (native): $(/usr/local/bin/claude --version 2>/dev/null || echo unknown)"; \
157
158
  fi
158
159
 
159
160
  # Set up npm global directory for dev user (kept for ad-hoc dev-installed CLIs).
@@ -176,7 +177,7 @@ RUN npm install -g --prefix /usr/local @openai/codex || echo "Codex CLI install
176
177
  # leaves /usr/local/bin/codex as a SYMLINK to that codex.js, so we must `rm -f` it first —
177
178
  # writing through the symlink would clobber codex.js itself, making the wrapper exec itself
178
179
  # (infinite recursion -> codex hangs on launch).
179
- RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IHdpcmVkIHRvIEdQVC01LjUgb24gQVdTIEJlZHJvY2sgKHVzLWVhc3QtMiBtYW50bGUgZW5kcG9pbnQpLgojIEF1dGggdmlhIHRoZSBwb2QgSVJTQSAtPiBzaG9ydC1saXZlZCAofjEyaCkgQmVkcm9jayBiZWFyZXIgdG9rZW47IG5vIHBlci11c2VyIGtleS4KIyBSZWFzb25pbmcgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgZW52IChkZWZhdWx0IGhpZ2gpLiBUaGUgY29uZmlnIGlzIChyZSl3cml0dGVuIG9uCiMgZXZlcnkgbGF1bmNoOiAvaG9tZS9kZXYgaXMgZXBoZW1lcmFsLCBhbmQgdGhpcyBhbHNvIHNlbGYtaGVhbHMgYSAvbW9kZWwgbWlzaGFwCiMgKHRoZSBwaWNrZXIgY2FuIGNvcnJ1cHQgdGhlIG1vZGVsIGlkOyByZXN0YXJ0aW5nIGNvZGV4IHJlc2V0cyBpdCkuCnNldCArZQpSRUFMPS91c3IvbG9jYWwvbGliL25vZGVfbW9kdWxlcy9Ab3BlbmFpL2NvZGV4L2Jpbi9jb2RleC5qcwpFRkZPUlQ9IiR7Q09ERVhfRUZGT1JUOi1oaWdofSIKbWtkaXIgLXAgIiRIT01FLy5jb2RleCIgMj4vZGV2L251bGwKY2F0ID4gIiRIT01FLy5jb2RleC9jb25maWcudG9tbCIgPDxDRkcKbW9kZWwgPSAib3BlbmFpLmdwdC01LjUiCm1vZGVsX3Byb3ZpZGVyID0gImJlZHJvY2siCndlYl9zZWFyY2ggPSAiZGlzYWJsZWQiCm1vZGVsX2NvbnRleHRfd2luZG93ID0gMjcyMDAwCm1vZGVsX21heF9vdXRwdXRfdG9rZW5zID0gMTI4MDAwCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKClttb2RlbF9wcm92aWRlcnMuYmVkcm9ja10KbmFtZSA9ICJBV1MgQmVkcm9jayAoR1BULTUuNSkiCmJhc2VfdXJsID0gImh0dHBzOi8vYmVkcm9jay1tYW50bGUudXMtZWFzdC0yLmFwaS5hd3Mvb3BlbmFpL3YxIgplbnZfa2V5ID0gIk9QRU5BSV9BUElfS0VZIgp3aXJlX2FwaSA9ICJyZXNwb25zZXMiCkNGRwpUT0s9IiQocHl0aG9uMyAtYyAnZnJvbSBhd3NfYmVkcm9ja190b2tlbl9nZW5lcmF0b3IgaW1wb3J0IHByb3ZpZGVfdG9rZW47IHByaW50KHByb3ZpZGVfdG9rZW4ocmVnaW9uPSJ1cy1lYXN0LTIiKSknIDI+L2Rldi9udWxsKSIKWyAtbiAiJFRPSyIgXSAmJiBleHBvcnQgT1BFTkFJX0FQSV9LRVk9IiRUT0siCmV4ZWMgIiRSRUFMIiAiJEAiCg==' | base64 -d > /usr/local/bin/codex && chmod 0755 /usr/local/bin/codex
180
+ RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IHdpcmVkIHRvIEdQVC01LjUgb24gQVdTIEJlZHJvY2sgKHVzLWVhc3QtMiBtYW50bGUgZW5kcG9pbnQpLgojIEF1dGggdmlhIHRoZSBwb2QgSVJTQSAtPiBzaG9ydC1saXZlZCAofjEyaCkgQmVkcm9jayBiZWFyZXIgdG9rZW47IG5vIHBlci11c2VyIGtleS4KIyBSZWFzb25pbmcgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgZW52IChkZWZhdWx0IGhpZ2gpLiBUaGUgY29uZmlnIGlzIChyZSl3cml0dGVuIG9uCiMgZXZlcnkgbGF1bmNoOiAvaG9tZS9kZXYgaXMgZXBoZW1lcmFsLCBhbmQgdGhpcyBhbHNvIHNlbGYtaGVhbHMgYSAvbW9kZWwgbWlzaGFwCiMgKHRoZSBwaWNrZXIgY2FuIGNvcnJ1cHQgdGhlIG1vZGVsIGlkOyByZXN0YXJ0aW5nIGNvZGV4IHJlc2V0cyBpdCkuCnNldCArZQpSRUFMPS91c3IvbG9jYWwvbGliL25vZGVfbW9kdWxlcy9Ab3BlbmFpL2NvZGV4L2Jpbi9jb2RleC5qcwpFRkZPUlQ9IiR7Q09ERVhfRUZGT1JUOi1oaWdofSIKbWtkaXIgLXAgIiRIT01FLy5jb2RleCIgMj4vZGV2L251bGwKY2F0ID4gIiRIT01FLy5jb2RleC9jb25maWcudG9tbCIgPDxDRkcKbW9kZWwgPSAib3BlbmFpLmdwdC01LjUiCm1vZGVsX3Byb3ZpZGVyID0gImJlZHJvY2siCndlYl9zZWFyY2ggPSAiZGlzYWJsZWQiCm1vZGVsX2NvbnRleHRfd2luZG93ID0gMjcyMDAwCm1vZGVsX21heF9vdXRwdXRfdG9rZW5zID0gMTI4MDAwCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKClttb2RlbF9wcm92aWRlcnMuYmVkcm9ja10KbmFtZSA9ICJBV1MgQmVkcm9jayAoR1BULTUuNSkiCmJhc2VfdXJsID0gImh0dHBzOi8vYmVkcm9jay1tYW50bGUudXMtZWFzdC0yLmFwaS5hd3Mvb3BlbmFpL3YxIgplbnZfa2V5ID0gIk9QRU5BSV9BUElfS0VZIgp3aXJlX2FwaSA9ICJyZXNwb25zZXMiCkNGRwpUT0s9IiQoL3Vzci9iaW4vcHl0aG9uMyAtYyAnZnJvbSBhd3NfYmVkcm9ja190b2tlbl9nZW5lcmF0b3IgaW1wb3J0IHByb3ZpZGVfdG9rZW47IHByaW50KHByb3ZpZGVfdG9rZW4ocmVnaW9uPSJ1cy1lYXN0LTIiKSknIDI+L2Rldi9udWxsKSIKWyAtbiAiJFRPSyIgXSAmJiBleHBvcnQgT1BFTkFJX0FQSV9LRVk9IiRUT0siCmV4ZWMgIiRSRUFMIiAiJEAiCg==' | base64 -d > /usr/local/bin/codex && chmod 0755 /usr/local/bin/codex
180
181
 
181
182
  USER dev
182
183
 
@@ -3832,40 +3832,73 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
3832
3832
  f"MAIN FLOW: Pod is ready, checking SSH daemon status from logs for {reservation_id}"
3833
3833
  )
3834
3834
 
3835
+ # Let the user know we're past pod creation and waiting on the service.
3836
+ # On persistent-disk reservations the entrypoint restores the disk before
3837
+ # sshd binds, so this can legitimately take a few minutes.
3838
+ update_reservation_status(
3839
+ reservation_id,
3840
+ "preparing",
3841
+ "Container running — restoring your environment and starting SSH…"
3842
+ if use_persistent_disk
3843
+ else "Container running — starting SSH service…",
3844
+ )
3845
+
3835
3846
  record_trace_event(trace_data, "ssh_ready_check_start")
3836
3847
  ssh_ready = False
3837
3848
  try:
3838
3849
  v1 = client.CoreV1Api(k8s_client)
3839
3850
 
3840
- # Poll for SSH daemon: 100ms for first 8s, then backoff to 5s
3841
- # Default image starts SSH in ~2-5s, so rapid polling catches it instantly
3842
- # Custom images may take longer, backoff keeps API load reasonable
3843
- max_attempts = 60
3851
+ # Poll pod logs for the sshd-ready marker. Fast (100ms) for the first
3852
+ # 8s to catch the common fast path instantly, then back off to 5s.
3853
+ # Slow-disk startups restore the disk *before* sshd binds, so allow up
3854
+ # to ~150s. If the marker never appears we finalize anyway below —
3855
+ # routing is already in place and the SSH proxy retries until sshd binds.
3856
+ deadline = time.time() + 150.0
3844
3857
  elapsed = 0.0
3845
-
3846
- for attempt in range(max_attempts):
3858
+ attempt = 0
3859
+ logs = ""
3860
+ while time.time() < deadline:
3847
3861
  logs = v1.read_namespaced_pod_log(
3848
- name=pod_name, namespace="gpu-dev", container="gpu-dev", tail_lines=100
3862
+ name=pod_name, namespace="gpu-dev", container="gpu-dev", tail_lines=200
3849
3863
  )
3850
3864
  if "SSH daemon starting on port 22" in logs or "Server listening on" in logs:
3851
3865
  logger.info(
3852
3866
  f"SSH daemon confirmed running in pod logs for {pod_name} (attempt {attempt + 1}, {elapsed:.1f}s elapsed)")
3853
3867
  ssh_ready = True
3854
3868
  break
3855
- else:
3856
- if attempt < max_attempts - 1:
3857
- delay = 0.1 if elapsed < 8.0 else min(1.0 + (elapsed - 8.0) * 0.3, 5.0)
3858
- time.sleep(delay)
3859
- elapsed += delay
3860
- else:
3861
- logger.warning(
3862
- f"SSH daemon not detected after {max_attempts} attempts, logs preview: {logs[-200:]}")
3869
+ delay = 0.1 if elapsed < 8.0 else min(1.0 + (elapsed - 8.0) * 0.3, 5.0)
3870
+ time.sleep(delay)
3871
+ elapsed += delay
3872
+ attempt += 1
3873
+ if not ssh_ready:
3874
+ logger.warning(
3875
+ f"SSH daemon marker not seen for {pod_name} after {elapsed:.1f}s, logs preview: {logs[-200:]}")
3863
3876
  except Exception as e:
3864
3877
  logger.warning(f"Could not check SSH daemon logs: {e}")
3865
3878
  # Assume ready if pod is running (NLB will handle routing)
3866
3879
  ssh_ready = True
3867
3880
  record_trace_event(trace_data, "ssh_ready_check_end")
3868
3881
 
3882
+ # If the sshd marker never showed, don't orphan the reservation in
3883
+ # 'preparing'. Only a genuinely broken pod should fail here; otherwise the
3884
+ # pod is just slow to bind sshd (disk restore) — routing is already stored,
3885
+ # so we finalize anyway and let the SSH proxy retry until sshd is up.
3886
+ if not ssh_ready:
3887
+ logger.warning(
3888
+ f"MAIN FLOW: SSH daemon not confirmed for reservation {reservation_id}, checking pod status for errors")
3889
+ pod_info = update_pod_status_and_events(k8s_client, pod_name, reservation_id)
3890
+ if not should_finalize_without_ssh_marker(pod_info):
3891
+ update_reservation_status(
3892
+ reservation_id,
3893
+ "failed",
3894
+ f"Pod failed to start properly: {pod_info['display_message']}",
3895
+ )
3896
+ raise RuntimeError(f"Pod failed: {pod_info['display_message']}")
3897
+ logger.warning(
3898
+ f"SSH daemon not confirmed for {pod_name}, but pod is healthy — "
3899
+ f"finalizing connection anyway (SSH proxy retries until sshd binds)")
3900
+ ssh_ready = True
3901
+
3869
3902
  if ssh_ready:
3870
3903
  # Update status: Finalizing connection
3871
3904
  update_reservation_status(
@@ -3985,28 +4018,6 @@ def allocate_gpu_resources(reservation_id: str, request: dict[str, Any], trace_d
3985
4018
  f"Failed to trigger availability update: {update_error}")
3986
4019
  # Don't fail the reservation for this
3987
4020
 
3988
- else:
3989
- logger.warning(
3990
- f"MAIN FLOW: SSH connectivity test FAILED for reservation {reservation_id}, checking pod status for errors")
3991
- # Check pod status using our consolidated monitoring function
3992
- pod_info = update_pod_status_and_events(
3993
- k8s_client, pod_name, reservation_id)
3994
- if pod_info["has_errors"]:
3995
- update_reservation_status(
3996
- reservation_id,
3997
- "failed",
3998
- f"Pod failed to start properly: {pod_info['display_message']}",
3999
- )
4000
- raise RuntimeError(
4001
- f"Pod failed: {pod_info['display_message']}")
4002
- else:
4003
- # Pod is running but SSH not ready yet - keep as preparing
4004
- # Status message already updated by update_pod_status_and_events
4005
- pass
4006
- logger.warning(
4007
- f"SSH not ready yet for {pod_name}, keeping reservation in preparing state"
4008
- )
4009
-
4010
4021
  # GPU allocation handled automatically by K8s scheduler
4011
4022
 
4012
4023
  # Store trace data in DynamoDB if tracing is enabled
@@ -4057,6 +4068,18 @@ def delete_sqs_message(record: dict[str, Any]) -> None:
4057
4068
  logger.error(f"Error deleting SQS message: {str(e)}")
4058
4069
 
4059
4070
 
4071
+ def should_finalize_without_ssh_marker(pod_info: dict) -> bool:
4072
+ """Return whether to finalize a reservation whose sshd-ready log marker never appeared.
4073
+
4074
+ The pod's routing (domain mapping) is stored before the readiness poll, so a
4075
+ slow sshd (e.g. a persistent-disk restore that runs before sshd binds) is not
4076
+ a failure — finalizing anyway lets the CLI's SSH proxy retry until sshd is up,
4077
+ instead of orphaning the reservation in 'preparing' forever. Only a pod that
4078
+ actually reports errors should fail.
4079
+ """
4080
+ return not pod_info.get("has_errors", False)
4081
+
4082
+
4060
4083
  def update_reservation_status(reservation_id: str, status: str, detailed_status: str = None, failure_reason: str = None) -> None:
4061
4084
  """
4062
4085
  Update reservation status with unified status tracking.
@@ -6328,7 +6351,7 @@ EOF
6328
6351
  # Only start Jupyter if enabled at creation time
6329
6352
  if [ "$JUPYTER_ENABLED" = "true" ]; then
6330
6353
  echo "[STARTUP] Starting Jupyter Lab in background..."
6331
- nohup su - dev -c "cd /workspace && /opt/conda/bin/jupyter-lab --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
6354
+ nohup su - dev -c "cd /workspace && $(command -v jupyter-lab || echo /usr/local/bin/jupyter-lab) --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
6332
6355
  echo "[STARTUP] Jupyter Lab started (check /tmp/jupyter.log for details)"
6333
6356
  else
6334
6357
  echo "[STARTUP] Jupyter Lab configured but not started (use 'gpu-dev edit --enable-jupyter' to enable)"
@@ -8487,7 +8510,7 @@ def update_pod_status_and_events(k8s_client, pod_name: str, reservation_id: str)
8487
8510
  if pod_phase == "Pending":
8488
8511
  display_message = "⏳ Pod pending"
8489
8512
  elif pod_phase == "Running":
8490
- display_message = "🚀 Container running"
8513
+ display_message = "🚀 Container running — starting SSH service…"
8491
8514
  else:
8492
8515
  display_message = f"Pod phase: {pod_phase}"
8493
8516
 
@@ -9296,7 +9319,7 @@ def enable_jupyter_in_pod(
9296
9319
 
9297
9320
  # Start Jupyter as dev user in background (config already exists)
9298
9321
  echo "Starting Jupyter Lab with existing config..."
9299
- nohup su - dev -c "cd /workspace && /opt/conda/bin/jupyter-lab --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
9322
+ nohup su - dev -c "cd /workspace && $(command -v jupyter-lab || echo /usr/local/bin/jupyter-lab) --config=/home/dev/.jupyter/jupyter_lab_config.py" > /tmp/jupyter.log 2>&1 &
9300
9323
 
9301
9324
  # Wait for startup
9302
9325
  sleep 3
@@ -19,12 +19,58 @@ from unittest.mock import MagicMock, patch
19
19
 
20
20
  import pytest
21
21
 
22
- from gpu_dev_cli.cli import main
22
+ from gpu_dev_cli.cli import main, _build_submit_remote_script
23
23
 
24
24
 
25
25
  USER_INFO = {"user_id": "u-123", "github_user": "octocat"}
26
26
 
27
27
 
28
+ # ---------------------------------------------------------------------------
29
+ # _build_submit_remote_script — the --ref staging-gate + rebuild preamble
30
+ # (regression for Driss's footguns: root-owned tree + source/installed mismatch)
31
+ # ---------------------------------------------------------------------------
32
+ def test_remote_script_no_ref_is_plain_cd_run():
33
+ s = _build_submit_remote_script("/workspace/x", "python a.py", ref=None, no_build=False)
34
+ assert s == "cd /workspace/x && python a.py"
35
+ assert "pytorch-staging" not in s
36
+ assert "no-build-isolation" not in s
37
+
38
+
39
+ def test_remote_script_with_ref_waits_and_rebuilds():
40
+ s = _build_submit_remote_script("/home/dev", "pytest q.py", ref="pr/123", no_build=False)
41
+ # waits for the background staging marker
42
+ assert "/home/dev/.pytorch-staging" in s
43
+ # only acts once staging actually completed
44
+ assert "/home/dev/.pytorch-ready" in s
45
+ # marks safe.directory for the dev user (fixes git "dubious ownership")
46
+ assert "safe.directory /home/dev/pytorch" in s
47
+ # rebuilds so installed torch matches the checked-out ref
48
+ assert "pip install -e . --no-build-isolation" in s
49
+ # user command still runs last, in the workdir
50
+ assert s.rstrip().endswith("cd /home/dev && pytest q.py")
51
+
52
+
53
+ def test_remote_script_ref_no_build_skips_rebuild():
54
+ s = _build_submit_remote_script("/home/dev", "pytest q.py", ref="pr/123", no_build=True)
55
+ assert "/home/dev/.pytorch-staging" in s # still waits for staging
56
+ assert "safe.directory /home/dev/pytorch" in s # still fixes ownership
57
+ assert "no-build-isolation" not in s # but no rebuild
58
+ assert s.rstrip().endswith("cd /home/dev && pytest q.py")
59
+
60
+
61
+ def test_remote_script_quotes_workdir():
62
+ s = _build_submit_remote_script("/work space/x", "echo hi", ref=None, no_build=False)
63
+ assert "'/work space/x'" in s
64
+
65
+
66
+ def test_no_build_flag_threaded_and_defaults_false(cli_runner):
67
+ # --no-build is accepted; with --ref it changes the rebuild preamble. Here we
68
+ # just assert the flag parses (reservation returns None -> exit 2).
69
+ res, rm = _run(cli_runner, ["--ref", "pr/1", "--no-build", "--", "x"])
70
+ assert res.exit_code == 2
71
+ rm.create_reservation.assert_called_once()
72
+
73
+
28
74
  # ---------------------------------------------------------------------------
29
75
  # patch harness
30
76
  # ---------------------------------------------------------------------------
@@ -0,0 +1,24 @@
1
+ """Unit tests for the slow-sshd finalize decision.
2
+
3
+ Regression for the orphaned-`preparing` bug: a persistent-disk reservation
4
+ restores its disk *before* sshd binds, so the readiness poll's log marker never
5
+ shows within the window. The main flow used to leave such reservations stuck in
6
+ `preparing` forever. It now finalizes anyway (routing is already stored, the SSH
7
+ proxy retries) and only fails when the pod itself reports errors.
8
+ """
9
+
10
+
11
+ def test_finalize_when_pod_healthy_but_no_ssh_marker(lambda_index):
12
+ # Running pod, no errors, sshd marker not seen -> finalize anyway.
13
+ info = {"has_errors": False, "display_message": "🚀 Container running — starting SSH service…"}
14
+ assert lambda_index.should_finalize_without_ssh_marker(info) is True
15
+
16
+
17
+ def test_do_not_finalize_when_pod_has_errors(lambda_index):
18
+ info = {"has_errors": True, "display_message": "❌ ImagePullBackOff"}
19
+ assert lambda_index.should_finalize_without_ssh_marker(info) is False
20
+
21
+
22
+ def test_missing_has_errors_key_defaults_to_finalize(lambda_index):
23
+ # Defensive: a partial pod_info dict shouldn't strand the reservation.
24
+ assert lambda_index.should_finalize_without_ssh_marker({}) is True
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes