gpu-dev 0.7.12__tar.gz → 0.7.14__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (235)
  1. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/PKG-INFO +1 -1
  2. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +254 -9
  3. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +26 -2
  4. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/gpu_dev.egg-info/PKG-INFO +1 -1
  5. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/gpu_dev.egg-info/SOURCES.txt +3 -0
  6. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/pyproject.toml +1 -1
  7. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/Dockerfile +24 -9
  8. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +148 -11
  9. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/reservation_processor/index.py +109 -4
  10. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda.tf +37 -0
  11. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/main.tf +2 -4
  12. gpu_dev-0.7.14/tests/unit/cli/test_debug.py +155 -0
  13. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_repro.py +88 -1
  14. gpu_dev-0.7.14/tests/unit/lambda_fn/test_dead_pod_cleanup.py +177 -0
  15. gpu_dev-0.7.14/tests/unit/lambda_fn/test_get_logs.py +59 -0
  16. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_mig_gpu_config.py +8 -7
  17. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_pod_resources.py +17 -8
  18. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/.github/workflows/no-gitlinks.yml +0 -0
  19. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/.github/workflows/publish.yml +0 -0
  20. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/.github/workflows/tests.yml +0 -0
  21. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/.gitignore +0 -0
  22. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/CLAUDE.md +0 -0
  23. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/README.md +0 -0
  24. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/admin/README.md +0 -0
  25. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/admin/generate_stats.py +0 -0
  26. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/admin/requirements.txt +0 -0
  27. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/README.md +0 -0
  28. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  29. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  30. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  31. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  32. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  33. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  34. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  35. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  36. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  37. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  38. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/conftest.py +0 -0
  39. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/docs/FAST_REPRO_DESIGN.md +0 -0
  40. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/docs/GPU_DEV_SUBMIT.md +0 -0
  41. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/docs/SDK_REPRO.md +0 -0
  42. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/docs/USER_GUIDE.md +0 -0
  43. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/docs/devgpu-features.html +0 -0
  44. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/docs/docker-mark-blue.svg +0 -0
  45. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/docs/icons8-cursor-ai.svg +0 -0
  46. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/gpu_dev.egg-info/dependency_links.txt +0 -0
  47. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/gpu_dev.egg-info/entry_points.txt +0 -0
  48. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/gpu_dev.egg-info/requires.txt +0 -0
  49. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/gpu_dev.egg-info/top_level.txt +0 -0
  50. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/post-may-2026.md +0 -0
  51. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/CLAUDE.md +0 -0
  52. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/architecture.html +0 -0
  53. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/cli-demo.html +0 -0
  54. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/devgpu-features.html +0 -0
  55. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/docker-mark-blue.svg +0 -0
  56. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/feedback.png +0 -0
  57. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/gpu-fleet.html +0 -0
  58. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/icons8-cursor-ai.svg +0 -0
  59. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/index.html +0 -0
  60. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/k8s-under-the-hood.html +0 -0
  61. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/multinode.html +0 -0
  62. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/osdc-future-plans.html +0 -0
  63. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/problem.png +0 -0
  64. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/pyproject.toml +0 -0
  65. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/sandbox.html +0 -0
  66. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/sdk-demo.html +0 -0
  67. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/teaser.html +0 -0
  68. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/thesis.html +0 -0
  69. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/title-vid.mp4 +0 -0
  70. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/weneedgpus.png +0 -0
  71. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/presentation/wow.html +0 -0
  72. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/README.md +0 -0
  73. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/examples/batch_multi_gpu.py +0 -0
  74. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/examples/interactive_debug.py +0 -0
  75. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/examples/parallel_experiments.ipynb +0 -0
  76. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/examples/quickstart.ipynb +0 -0
  77. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/examples/run_tests.py +0 -0
  78. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/examples/submit_job.py +0 -0
  79. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/__init__.py +0 -0
  80. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
  81. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  82. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/_backend/aws.py +0 -0
  83. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
  84. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  85. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/_sync/client.py +0 -0
  86. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
  87. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  88. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
  89. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  90. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/common/config.py +0 -0
  91. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/common/enums.py +0 -0
  92. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/common/errors.py +0 -0
  93. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/common/models.py +0 -0
  94. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/src/gpu_dev/py.typed +0 -0
  95. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/sdk/python/tests/test_models.py +0 -0
  96. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/setup.cfg +0 -0
  97. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-deck/backend.tf +0 -0
  98. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-deck/main.tf +0 -0
  99. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-deck/terraform.tfvars.example +0 -0
  100. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  101. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  102. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/README.md +0 -0
  103. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/alb.tf +0 -0
  104. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/ami-baker.tf +0 -0
  105. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/availability.tf +0 -0
  106. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/backend.tf +0 -0
  107. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/build-node.tf +0 -0
  108. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/check_b200.py +0 -0
  109. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  110. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  111. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  112. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  113. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/bash_profile +0 -0
  114. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/bashrc +0 -0
  115. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  116. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  117. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  118. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  119. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/motd_script +0 -0
  120. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  121. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/profile +0 -0
  122. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  123. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  124. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  125. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/shell_env +0 -0
  126. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/ssh_config +0 -0
  127. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/zprofile +0 -0
  128. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/zshrc +0 -0
  129. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  130. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker-build.tf +0 -0
  131. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  132. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  133. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/ecr.tf +0 -0
  134. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/efs.tf +0 -0
  135. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/eks.tf +0 -0
  136. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/expiry.tf +0 -0
  137. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/git-cache.tf +0 -0
  138. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  139. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/kubernetes.tf +0 -0
  140. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  141. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  142. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  143. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  144. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  145. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  146. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  147. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  148. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  149. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  150. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  151. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  152. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  153. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/list_b200.py +0 -0
  154. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/mig-config.tf +0 -0
  155. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  156. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  157. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  158. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  159. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  160. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  161. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/monitoring.tf +0 -0
  162. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  163. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/outputs.tf +0 -0
  164. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/pyproject.toml +0 -0
  165. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/pytorch-ondemand.tf +0 -0
  166. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/pytorch-prebuild.tf +0 -0
  167. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/queue.tf +0 -0
  168. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/route53.tf +0 -0
  169. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  170. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  171. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  172. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  173. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  174. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  175. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  176. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  177. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  178. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  179. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  180. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/switch-to.sh +0 -0
  181. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  182. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  183. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
  184. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  185. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  186. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/variables.tf +0 -0
  187. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/terraform-gpu-devservers/warm-pool.tf +0 -0
  188. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/__init__.py +0 -0
  189. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/integration/README.md +0 -0
  190. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/integration/__init__.py +0 -0
  191. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/integration/conftest.py +0 -0
  192. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/integration/test_claude.py +0 -0
  193. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/integration/test_cpu_lifecycle.py +0 -0
  194. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/integration/test_repro_known_failure.py +0 -0
  195. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/integration/test_t4_lifecycle.py +0 -0
  196. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/integration/test_warm_pool.py +0 -0
  197. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/submit/README.md +0 -0
  198. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/submit/fail/run.sh +0 -0
  199. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/submit/multinode/run.sh +0 -0
  200. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/submit/success/run.sh +0 -0
  201. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/__init__.py +0 -0
  202. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/__init__.py +0 -0
  203. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_auth.py +0 -0
  204. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_avail.py +0 -0
  205. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_cancel.py +0 -0
  206. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_config_cmd.py +0 -0
  207. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_config_module.py +0 -0
  208. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_connect.py +0 -0
  209. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_disks.py +0 -0
  210. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_edit.py +0 -0
  211. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_interactive.py +0 -0
  212. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_list_show.py +0 -0
  213. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_name_generator.py +0 -0
  214. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_reservations_mgr.py +0 -0
  215. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_reserve.py +0 -0
  216. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_smoke.py +0 -0
  217. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_ssh_alias.py +0 -0
  218. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/cli/test_submit.py +0 -0
  219. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/__init__.py +0 -0
  220. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_availability.py +0 -0
  221. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_cancellation.py +0 -0
  222. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_claim.py +0 -0
  223. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_finalize_no_ssh.py +0 -0
  224. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_ref_staging.py +0 -0
  225. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_smoke.py +0 -0
  226. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_version_gate.py +0 -0
  227. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/lambda_fn/test_warm_pool.py +0 -0
  228. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/sdk/__init__.py +0 -0
  229. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/sdk/test_backend_aws.py +0 -0
  230. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/sdk/test_client.py +0 -0
  231. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/sdk/test_errors_enums.py +0 -0
  232. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/sdk/test_models_extra.py +0 -0
  233. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/sdk/test_sandbox.py +0 -0
  234. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/sdk/test_sdk_config.py +0 -0
  235. {gpu_dev-0.7.12 → gpu_dev-0.7.14}/tests/unit/sdk/test_transport_ssh.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.7.12
3
+ Version: 0.7.14
4
4
  Summary: CLI + Python SDK for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1521,19 +1521,27 @@ def reserve(
1521
1521
 
1522
1522
 
1523
1523
  @main.command(context_settings={"ignore_unknown_options": True})
1524
- @click.argument("ref")
1525
- @click.argument("test_args", nargs=-1, required=True)
1526
- @click.option("--gpu-type", default="b200", show_default=True, help="GPU type for the repro box.")
1524
+ @click.argument("ref", required=False)
1525
+ @click.argument("test_args", nargs=-1, required=False)
1526
+ @click.option("--lint", is_flag=True, default=False,
1527
+ help="Run a PyTorch lint job (lintrunner) on a CPU box instead of a python test — "
1528
+ "mirrors CI's lint (.github/scripts/lintrunner.sh): regenerates version/type "
1529
+ "stubs then runs the python/general linters. Defaults to --gpu-type cpu-x86, "
1530
+ "no torch build. PR ref lints its diff; main lints all files; extra args override scope.")
1531
+ @click.option("--clang", is_flag=True, default=False,
1532
+ help="With --lint, also run the C++ linters (CLANGTIDY/CLANGFORMAT). CI runs these in a "
1533
+ "separate job — they generate clang build files and are heavy on a full tree.")
1534
+ @click.option("--gpu-type", default=None, help="GPU type for the repro box (default: b200; cpu-x86 with --lint).")
1527
1535
  @click.option("--gpus", type=int, default=1, show_default=True)
1528
1536
  @click.option("--hours", type=float, default=3.0, show_default=True,
1529
1537
  help="Lifetime ceiling for the box.")
1530
1538
  @click.option("--no-connect", is_flag=True, default=False,
1531
- help="CI mode: run the test, auto-cancel, exit code = test result. Default (on a TTY) drops you into the box to iterate.")
1539
+ help="CI mode: run the test/lint, auto-cancel, exit code = result. Default (on a TTY) drops you into the box to iterate.")
1532
1540
  @click.option("--keep", is_flag=True, default=False,
1533
1541
  help="Never cancel the box (skip the cancel prompt / auto-cancel).")
1534
1542
  @click.pass_context
1535
- def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
1536
- """Reserve a GPU, check out a PR/commit, run a test, then drop you into the box.
1543
+ def repro(ctx, ref, test_args, lint, clang, gpu_type, gpus, hours, no_connect, keep):
1544
+ """Reserve a box, check out a PR/commit, run a test (or lint), then drop you in.
1537
1545
 
1538
1546
  By default (in a terminal) repro runs the test and then **connects you into the
1539
1547
  box** at ~/pytorch — the ref is checked out, so you can fix and re-run. The box
@@ -1546,10 +1554,32 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
1546
1554
  TEST_ARGS are passed straight to `python` inside ~/pytorch, e.g.
1547
1555
 
1548
1556
  gpu-dev repro pr/185264 test/inductor/test_flex_attention.py TestFlexAttentionCUDA.test_large_kv_int64_pointer_math_cuda
1557
+
1558
+ --lint runs lintrunner on a CPU box instead (no GPU, no torch build), mirroring
1559
+ CI's lint (regenerate version/type stubs, then the python/general linters), e.g.
1560
+
1561
+ gpu-dev repro --lint # lint main (all files)
1562
+ gpu-dev repro --lint pr/185264 # lint the PR diff (CI-equivalent)
1563
+ gpu-dev repro --lint pr/185264 --all-files # lint everything
1564
+ gpu-dev repro --lint --clang pr/185264 # also run C++ clang-tidy/format
1565
+
1566
+ The box stays up after the run: on a TTY you're dropped in and prompted to
1567
+ cancel on exit (use --keep to leave it running; --no-connect auto-cancels).
1549
1568
  """
1550
1569
  import shlex
1551
1570
  import subprocess
1552
1571
  import sys
1572
+ if not ref:
1573
+ if not lint:
1574
+ rprint("[red]❌ Provide a REF (pr/N, branch, or commit) — or use --lint to lint main.[/red]")
1575
+ sys.exit(2)
1576
+ ref = "main" # bare `repro --lint` lints current main
1577
+ if not lint and not test_args:
1578
+ rprint("[red]❌ Provide a test, e.g. gpu-dev repro pr/123 test/foo.py — or pass --lint for a lint job.[/red]")
1579
+ sys.exit(2)
1580
+ gpu_type = (gpu_type or ("cpu-x86" if lint else "b200")).lower()
1581
+ if gpu_type.startswith("cpu"):
1582
+ gpus = 0 # CPU reservations must have gpu_count=0
1553
1583
  config = load_config()
1554
1584
  reservation_mgr = ReservationManager(config)
1555
1585
  try:
@@ -1637,9 +1667,58 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
1637
1667
  f"PYTHONPATH=/home/dev/pytorch python {testcmd}"
1638
1668
  )
1639
1669
 
1670
+ runlabel, rerun_hint = "test", f"python {testcmd}"
1671
+ if lint:
1672
+ # Mirror pytorch CI's lint (.github/scripts/lintrunner.sh): regenerate version +
1673
+ # type stubs (so mypy/pyrefly are accurate), then run the python/general linters.
1674
+ # CLANGTIDY/CLANGFORMAT are a separate CI job (need generated build files, very
1675
+ # heavy on a full tree) -> opt-in via --clang. No torch build. Source-only tree
1676
+ # (cloned if a CPU pod doesn't have one). Scope mirrors CI: a PR lints its diff
1677
+ # (merge-base), main lints all files; extra args override the scope.
1678
+ if test_args:
1679
+ scope = " ".join(test_args)
1680
+ elif prnum:
1681
+ scope = "--merge-base-with origin/main"
1682
+ else:
1683
+ scope = "--all-files"
1684
+ runlabel = "lint"
1685
+ rerun_hint = f"lintrunner --skip CLANGTIDY,CLANGTIDY_EXECUTORCH_COMPATIBILITY,CLANGFORMAT {scope}"
1686
+ clang_block = (
1687
+ "echo '[lint] === C++ linters (CLANGTIDY/CLANGFORMAT) — generating clang build files (heavy)… ==='; "
1688
+ "python -m tools.linter.clang_tidy.generate_build_files 2>/dev/null || true; "
1689
+ f"lintrunner --force-color --take CLANGTIDY,CLANGFORMAT {scope}; rr=$?; [ $rr -ne 0 ] && RC=$rr; "
1690
+ ) if clang else (
1691
+ "echo '[lint] C++ linters (CLANGTIDY/CLANGFORMAT) skipped — add --clang to run them'; "
1692
+ )
1693
+ remote = (
1694
+ "set +e; "
1695
+ "git config --global --add safe.directory /home/dev/pytorch 2>/dev/null || true; "
1696
+ "if [ ! -d /home/dev/pytorch/.git ]; then echo '[lint] no pytorch tree on this pod — cloning (partial)…'; "
1697
+ "rm -rf /home/dev/pytorch; git clone --filter=blob:none https://github.com/pytorch/pytorch.git /home/dev/pytorch; fi; "
1698
+ "cd /home/dev/pytorch; "
1699
+ + resolve +
1700
+ "echo \"[lint] target ${WANT:-?}\"; "
1701
+ "git fetch origin main 2>/dev/null || true; "
1702
+ "echo \"[lint] checking out $FREF\"; " + checkout + "; "
1703
+ "echo \"[lint] HEAD $(git rev-parse --short HEAD)\"; "
1704
+ "command -v lintrunner >/dev/null 2>&1 || pip install --break-system-packages -q lintrunner; "
1705
+ # CI codegen so mypy/pyrefly see generated files (version.py + type stubs)
1706
+ "echo '[lint] regenerating version + type stubs (CI parity)…'; "
1707
+ "python -m tools.generate_torch_version --is-debug=false 2>/dev/null || true; "
1708
+ "python -m tools.pyi.gen_pyi --native-functions-path aten/src/ATen/native/native_functions.yaml "
1709
+ "--tags-path aten/src/ATen/native/tags.yaml --deprecated-functions-path tools/autograd/deprecated.yaml 2>/dev/null || true; "
1710
+ "python torch/utils/data/datapipes/gen_pyi.py 2>/dev/null || true; "
1711
+ "echo '[lint] lintrunner init…'; lintrunner init; RC=0; "
1712
+ f"echo '[lint] === python/general linters: lintrunner {scope} ==='; "
1713
+ f"lintrunner --force-color --skip CLANGTIDY,CLANGTIDY_EXECUTORCH_COMPATIBILITY,CLANGFORMAT {scope}; rr=$?; [ $rr -ne 0 ] && RC=$rr; "
1714
+ + clang_block +
1715
+ "exit $RC"
1716
+ )
1717
+
1640
1718
  # Reserve — warm claim (instant) first, else cold ephemeral. Always no-persist
1641
1719
  # (so the prebuilt tree is staged; a default disk would skip staging).
1642
- rprint(f"[cyan]🔬 repro: reserving {gpus}x {gpu_type} (warm if available)…[/cyan]")
1720
+ desc = f"{gpus}x {gpu_type}" if gpus else gpu_type
1721
+ rprint(f"[cyan]🔬 repro: reserving {desc} (warm if available)…[/cyan]")
1643
1722
  rid = ssh_cmd = None
1644
1723
  try:
1645
1724
  res = reservation_mgr.claim_direct(
@@ -1675,14 +1754,14 @@ def repro(ctx, ref, test_args, gpu_type, gpus, hours, no_connect, keep):
1675
1754
  except KeyboardInterrupt:
1676
1755
  rprint("\n[yellow]interrupted[/yellow]"); rc = 130
1677
1756
 
1678
- verdict = "[green]✓ test passed[/green]" if rc == 0 else f"[red]✗ test failed (exit {rc})[/red]"
1757
+ verdict = f"[green]✓ {runlabel} passed[/green]" if rc == 0 else f"[red]✗ {runlabel} failed (exit {rc})[/red]"
1679
1758
 
1680
1759
  # Default (TTY): drop into the box so you can fix and re-run. --no-connect is the
1681
1760
  # CI path: auto-cancel and exit with the test's code.
1682
1761
  connect = (not no_connect) and sys.stdout.isatty()
1683
1762
  if connect:
1684
1763
  rprint(f"\n{verdict} — dropping you into the box at ~/pytorch ({ref} checked out).")
1685
- rprint(f"[dim] re-run: python {testcmd}[/dim]")
1764
+ rprint(f"[dim] re-run: {rerun_hint}[/dim]")
1686
1765
  rprint(f"[dim] finish: gpu-dev cancel (from inside) • or exit this shell[/dim]\n")
1687
1766
  shell_cmd = f"{ssh_cmd} -t {shlex.quote('cd /home/dev/pytorch 2>/dev/null; exec ${SHELL:-bash} -l')}"
1688
1767
  try:
@@ -3232,6 +3311,172 @@ def show(ctx: click.Context, reservation_id: Optional[str]) -> None:
3232
3311
  rprint(f"[red]❌ Error: {str(e)}[/red]")
3233
3312
 
3234
3313
 
3314
def _print_recovery_hints(connection_info: dict) -> None:
    """Print status-specific next steps for getting a reservation usable again.

    Reads ``status``, ``disk_name`` and ``reservation_id`` from
    *connection_info*; missing or empty values are tolerated. A "Recovery:"
    section is printed via ``rprint`` only when at least one hint applies
    (ended reservations and active ones); other statuses print nothing.
    """
    state = (connection_info.get("status") or "").lower()
    disk = connection_info.get("disk_name") or ""
    full_id = connection_info.get("reservation_id", "") or ""
    # Only the first 8 characters of the id are shown; a placeholder stands in
    # when the id is missing.
    id_prefix = full_id[:8] if full_id else "<id>"

    if state in ("failed", "expired", "cancelled"):
        if disk:
            steps = [
                f"Your data on disk '{disk}' is preserved — re-reserve with: "
                f"gpu-dev reserve --disk {disk}",
                f"If that disk is stuck locked: gpu-dev disk unlock {disk}",
            ]
        else:
            steps = ["Re-reserve a new box with: gpu-dev reserve"]
    elif state == "active":
        steps = [
            "If status is 'active' but you can't SSH, the pod likely died (e.g. OOM). "
            f"Free it (and your disk) with: gpu-dev cancel {id_prefix} — then re-reserve."
        ]
        if disk:
            steps.append(f"If the disk stays locked after cancel: gpu-dev disk unlock {disk}")
    else:
        steps = []

    if not steps:
        return
    rprint("\n[bold]Recovery:[/bold]")
    for step in steps:
        rprint(f" • {step}")
3339
+
3340
+
3341
def _show_diagnostics(connection_info: dict) -> None:
    """Print the extra diagnostics that `gpu-dev debug` layers on the status panel.

    In order: the failure reason (or latest detailed status), an OOM summary,
    the status-history timeline, a snapshot of captured pod logs, and finally
    the recovery hints. Everything is read from *connection_info*; this
    function makes no backend call of its own.
    """
    from rich.text import Text

    state = (connection_info.get("status") or "").lower()

    # Unlike the plain status view, the failure reason is shown whatever the
    # status is, so an "active" reservation whose pod died still explains itself.
    reason = (connection_info.get("failure_reason") or "").strip()
    latest = (connection_info.get("current_detailed_status") or "").strip()
    if reason:
        rprint(f"\n[bold red]Why it ended:[/bold red] {reason}")
    elif latest and state != "active":
        rprint(f"\n[bold]Latest status:[/bold] {latest}")

    # OOM summary — only when at least one event was recorded.
    oom_events = int(connection_info.get("oom_count", 0) or 0)
    if oom_events > 0:
        last_seen = connection_info.get("last_oom_at") or "unknown"
        container = connection_info.get("oom_container") or "?"
        rprint(f"[red]⚠️ OOM:[/red] {oom_events} event(s) — last {last_seen} (container: {container})")

    # Status-history timeline, capped at the 40 most recent entries.
    timeline = connection_info.get("status_history") or []
    if not timeline:
        rprint("\n[dim]No status history recorded for this reservation.[/dim]")
    else:
        table = Table(
            title="Status timeline (most recent last)",
            show_header=True,
            header_style="bold",
            box=None,
            pad_edge=False,
        )
        table.add_column("Time", style="dim", no_wrap=True)
        table.add_column("Event")
        # Entries that are not dicts are skipped rather than rendered.
        usable = [item for item in timeline[-40:] if isinstance(item, dict)]
        for item in usable:
            table.add_row(str(item.get("timestamp", "")), str(item.get("message", "")))
        console.print("")
        console.print(table)

    # Captured pod logs: only the trailing 4000 characters are displayed.
    snapshot = (connection_info.get("pod_logs") or "").strip()
    if snapshot:
        log_panel = Panel(
            Text(snapshot[-4000:]),
            title="Captured pod logs (snapshot)",
            border_style="yellow",
        )
        console.print(log_panel)

    _print_recovery_hints(connection_info)
3389
+
3390
+
3391
def _show_lambda_logs(reservation_mgr, reservation_id: str, user_id: str) -> None:
    """Fetch a reservation's raw lambda (CloudWatch) logs and print them in a panel.

    Calls ``reservation_mgr.get_reservation_logs(reservation_id, user_id)``.
    A ``None`` result is reported as an unreachable backend; an ``"error"``
    entry in the result is printed as a warning; an empty ``"lines"`` list
    yields a short note instead of a panel.
    """
    from rich.text import Text

    rprint("\n[bold]Fetching lambda logs from CloudWatch…[/bold] [dim](a few seconds)[/dim]")
    response = reservation_mgr.get_reservation_logs(reservation_id, user_id)
    if response is None:
        rprint("[yellow]Could not reach the log backend (it may not be deployed yet, "
               "or you lack lambda:InvokeFunctionUrl access).[/yellow]")
        return

    # An error does not stop rendering: any lines returned with it are still shown.
    problem = response.get("error")
    if problem:
        rprint(f"[yellow]Log query: {problem}[/yellow]")

    entries = response.get("lines") or []
    if not entries:
        rprint("[dim]No lambda log lines found for this reservation (outside the "
               "retention window, or none recorded).[/dim]")
        return

    rendered = []
    for record in entries:
        # "<timestamp> <message>", with trailing whitespace trimmed per line.
        rendered.append(f"{record.get('timestamp', '')} {record.get('message', '')}".rstrip())
    text = "\n".join(rendered)

    # Only the trailing 16000 characters are displayed; the title counts all lines.
    log_panel = Panel(
        Text(text[-16000:]),
        title=f"Lambda logs · {len(entries)} line(s)",
        border_style="cyan",
    )
    console.print(log_panel)
3411
+
3412
+
3413
@main.command()
@click.argument("reservation_id", required=False)
@click.option(
    "--logs",
    "show_logs",
    is_flag=True,
    help="Also fetch the raw lambda logs for this reservation from CloudWatch.",
)
@click.pass_context
def debug(ctx: click.Context, reservation_id: Optional[str], show_logs: bool) -> None:
    """Diagnose your own reservation — why a box died or won't connect.

    Shows the status timeline, failure reason, OOM events, and captured pod logs,
    plus recovery steps — all without needing cluster or lambda access. Add --logs
    to also pull the raw reservation/expiry lambda logs from CloudWatch.

    \b
    Examples:
    gpu-dev debug # pick from your active reservations
    gpu-dev debug abc12345 # a specific reservation (id prefix ok)
    gpu-dev debug abc12345 --logs # + raw lambda logs from CloudWatch

    For a recently failed/expired box, find its id with 'gpu-dev list' then
    'gpu-dev debug <id>'.
    """
    try:
        config = load_config()
        user_info = authenticate_user(config)
        reservation_mgr = ReservationManager(config)

        # Inside a pod the reservation id is exported in the environment, so no
        # lookup or prompt is needed when the argument is omitted.
        if reservation_id is None:
            reservation_id = os.environ.get("GPU_DEV_RESERVATION_ID") or None

        # Still nothing: fall back to the user's live reservations.
        if reservation_id is None:
            candidates = _fetch_reservations_cross_region(
                reservation_mgr,
                user_info["user_id"],
                ["active", "preparing", "queued", "pending"],
                config,
            )
            if not candidates:
                rprint("[yellow]📋 No active reservations.[/yellow] To debug a recent "
                       "failed/expired one, find its id with [bold]gpu-dev list[/bold] "
                       "then run [bold]gpu-dev debug <id>[/bold].")
                return
            if len(candidates) != 1:
                # Several candidates: let the user choose; the quit/all sentinels
                # (or no choice at all) abort the command.
                choice = select_reservation_interactive(candidates, "debug")
                if not choice or choice in ("__QUIT__", "__ALL__"):
                    rprint("[yellow]Cancelled.[/yellow]")
                    return
                reservation_id = choice
            else:
                reservation_id = candidates[0].get("reservation_id")

        details = reservation_mgr.get_connection_info(reservation_id, user_info["user_id"])
        if not details:
            rprint(f"[red]❌ No reservation found matching '{reservation_id}'[/red] "
                   "(try a longer id prefix, or check 'gpu-dev list').")
            return

        _show_single_reservation(details)
        _show_diagnostics(details)
        if show_logs:
            # Use the id from the fetched record, not the argument, which may
            # have been only a prefix.
            _show_lambda_logs(reservation_mgr, details["reservation_id"], user_info["user_id"])

    except RuntimeError as exc:
        rprint(f"[red]❌ {exc}[/red]")
    except Exception as exc:
        # Top-level CLI boundary: report the error instead of dumping a traceback.
        rprint(f"[red]❌ Error: {exc}[/red]")
3478
+
3479
+
3235
3480
 
3236
3481
  def _maybe_show_sdk_tip() -> None:
3237
3482
  """For a user's first few reservations, nudge them toward the Python SDK +
@@ -613,7 +613,7 @@ class ReservationManager:
613
613
  pass
614
614
  return self._direct_url or None
615
615
 
616
- def _signed_post(self, url: str, payload: dict) -> Optional[dict]:
616
+ def _signed_post(self, url: str, payload: dict, timeout: int = 20) -> Optional[dict]:
617
617
  """SigV4-signed POST to the Function URL. Returns parsed JSON or None."""
618
618
  try:
619
619
  creds = self.config.session.get_credentials()
@@ -623,13 +623,29 @@ class ReservationManager:
623
623
  aws_req = AWSRequest(method="POST", url=url, data=data,
624
624
  headers={"Content-Type": "application/json"})
625
625
  SigV4Auth(creds, "lambda", self.config.aws_region).add_auth(aws_req)
626
- resp = requests.post(url, data=data, headers=dict(aws_req.headers), timeout=20)
626
+ resp = requests.post(url, data=data, headers=dict(aws_req.headers), timeout=timeout)
627
627
  if resp.status_code != 200:
628
628
  return None
629
629
  return resp.json()
630
630
  except Exception:
631
631
  return None
632
632
 
633
def get_reservation_logs(self, reservation_id: str, user_id: str) -> Optional[Dict[str, Any]]:
    """Request a reservation's lambda logs through the processor Function URL.

    Sends a signed ``get_logs`` action carrying the reservation id, user id
    and CLI version. Returns whatever ``_signed_post`` yields (documented by
    the caller-facing contract as ``{"lines": [...]}`` or ``{"error": ...}``),
    or ``None`` when no Function URL is available or the request fails.
    Used by `gpu-dev debug --logs`.
    """
    endpoint = self._get_direct_url()
    if endpoint:
        request_body = dict(
            action="get_logs",
            reservation_id=reservation_id,
            user_id=user_id,
            version=get_version(),
        )
        # Log queries run longer than a claim, so allow ~70s instead of the
        # default timeout.
        return self._signed_post(endpoint, request_body, timeout=70)
    return None
648
+
633
649
  def claim_direct(self, *, user_id: str, gpu_count: int, gpu_type: str,
634
650
  duration_hours: Union[int, float], name: Optional[str] = None,
635
651
  github_user: Optional[str] = None, ref: Optional[str] = None) -> Optional[Dict[str, Any]]:
@@ -999,11 +1015,19 @@ class ReservationManager:
999
1015
  "jupyter_enabled": reservation.get("jupyter_enabled", False),
1000
1016
  "jupyter_error": reservation.get("jupyter_error", ""),
1001
1017
  "ebs_volume_id": reservation.get("ebs_volume_id", ""),
1018
+ "disk_name": reservation.get("disk_name", ""),
1002
1019
  "secondary_users": reservation.get("secondary_users", []),
1003
1020
  "warning": reservation.get("warning", ""),
1004
1021
  "is_multinode": is_multinode,
1005
1022
  "pod_ip": reservation.get("pod_ip", ""),
1023
+ "node_ip": reservation.get("node_ip", ""),
1024
+ "node_name": reservation.get("node_name", ""),
1006
1025
  "fqdn": reservation.get("fqdn", ""),
1026
+ # Health/diagnostics (surfaced by `gpu-dev debug`); written by the
1027
+ # reservation + expiry lambdas. Present off the raw item, not always set.
1028
+ "oom_count": int(reservation.get("oom_count", 0) or 0),
1029
+ "last_oom_at": reservation.get("last_oom_at", ""),
1030
+ "oom_container": reservation.get("oom_container", ""),
1007
1031
  }
1008
1032
 
1009
1033
  # If multi-node, fetch all nodes in the group
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.7.12
3
+ Version: 0.7.14
4
4
  Summary: CLI + Python SDK for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -198,6 +198,7 @@ tests/unit/cli/test_cancel.py
198
198
  tests/unit/cli/test_config_cmd.py
199
199
  tests/unit/cli/test_config_module.py
200
200
  tests/unit/cli/test_connect.py
201
+ tests/unit/cli/test_debug.py
201
202
  tests/unit/cli/test_disks.py
202
203
  tests/unit/cli/test_edit.py
203
204
  tests/unit/cli/test_interactive.py
@@ -213,7 +214,9 @@ tests/unit/lambda_fn/__init__.py
213
214
  tests/unit/lambda_fn/test_availability.py
214
215
  tests/unit/lambda_fn/test_cancellation.py
215
216
  tests/unit/lambda_fn/test_claim.py
217
+ tests/unit/lambda_fn/test_dead_pod_cleanup.py
216
218
  tests/unit/lambda_fn/test_finalize_no_ssh.py
219
+ tests/unit/lambda_fn/test_get_logs.py
217
220
  tests/unit/lambda_fn/test_mig_gpu_config.py
218
221
  tests/unit/lambda_fn/test_pod_resources.py
219
222
  tests/unit/lambda_fn/test_ref_staging.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.7.12"
7
+ version = "0.7.14"
8
8
  description = "CLI + Python SDK for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -46,8 +46,12 @@ RUN for attempt in 1 2 3; do \
46
46
  RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
47
47
  apt-get install -y nodejs
48
48
 
49
- # Install older CUDA toolkits alongside base CUDA 13.2
49
+ # Install additional CUDA toolkits alongside base CUDA 13.2
50
50
  # Base image already has NVIDIA repo configured, no need for cuda-keyring
51
+ # NOTE: cuda-toolkit-13-3 is intentionally NOT here. CUDA 13.3 ships a unified
52
+ # `cccl-13-3` package that `Breaks` `cuda-cccl-12-8`/`-12-9`, so 13.3 cannot coexist
53
+ # with the 12.8/12.9 toolkits in one image. To add 13.3 we'd have to drop 12.8/12.9
54
+ # (or hand-curate 13.3 sub-packages that exclude cccl). Kept 12.8-13.2 for now.
51
55
  RUN apt-get update && apt-get install -y --no-install-recommends \
52
56
  cuda-toolkit-12-8 \
53
57
  cuda-toolkit-12-9 \
@@ -163,21 +167,32 @@ WORKDIR /home/dev
163
167
  RUN mkdir -p ~/.npm-global && \
164
168
  npm config set prefix ~/.npm-global
165
169
 
166
- # OpenAI Codex CLI on GPT-5.5 via AWS Bedrock (GA 2026-06-01). Installed system-wide
167
- # (parallels Claude above), then /usr/local/bin/codex is replaced with a thin wrapper that
168
- # auths via the pod IRSA — it mints a short-lived Bedrock bearer token (no per-user OpenAI
169
- # key) and pins the bedrock-mantle provider + GPT-5.5 metadata. Reasoning effort is set with
170
- # the CODEX_EFFORT env var (default high); the wrapper rewrites ~/.codex/config.toml each
171
- # launch (home is ephemeral) so a /model picker mishap self-heals on restart. IAM is already
172
- # in place (pod IRSA: bedrock-mantle:* + aws-marketplace:Subscribe).
170
+ # OpenAI Codex CLI on OpenAI gpt-5.x via AWS Bedrock. Installed system-wide (parallels
171
+ # Claude above), then /usr/local/bin/codex is replaced with a thin wrapper that auths via
172
+ # the pod IRSA — it mints a short-lived Bedrock bearer token (AWS_BEARER_TOKEN_BEDROCK), no
173
+ # per-user key. The wrapper uses codex's NATIVE `amazon-bedrock` model provider (the Bedrock
174
+ # Mantle path serves the OpenAI Responses API for supported OpenAI models per the official
175
+ # Codex/Bedrock docs), so NO custom endpoint/wire_api config is needed. Model via CODEX_MODEL
176
+ # (default openai.gpt-5.4), effort via CODEX_EFFORT (default high). The wrapper forces
177
+ # AWS_REGION=us-east-1.
178
+ #
179
+ # Why gpt-5.4 default (2026-06-16): gpt-5.5 is mid-rollout on Bedrock us-east-1 — it works
180
+ # intermittently but ~30% of calls still 404 "Engine not found" (us-east-2 fails outright).
181
+ # gpt-5.4 is rock-solid in us-east-1. To switch to 5.5 once AWS's rollout stabilizes, change
182
+ # the default above to openai.gpt-5.5 (one line) — region is already us-east-1. Users can opt
183
+ # in early with CODEX_MODEL=openai.gpt-5.5. The wrapper rewrites ~/.codex/config.toml each
184
+ # launch. IAM already in place (pod IRSA: bedrock-mantle:* — native Mantle path does NOT need
185
+ # bedrock:CallWithBearerToken).
173
186
  USER root
187
+ # Always install the latest codex (the native amazon-bedrock provider is stable across
188
+ # releases, so no need to pin — each image rebuild tracks latest). Validated on 0.140.0.
174
189
  RUN npm install -g --prefix /usr/local @openai/codex || echo "Codex CLI install failed (non-fatal at build time)"
175
190
  # Bedrock wrapper, base64-embedded to avoid heredoc/quoting fragility. It execs the real
176
191
  # launcher at /usr/local/lib/node_modules/@openai/codex/bin/codex.js. CRITICAL: `npm install`
177
192
  # leaves /usr/local/bin/codex as a SYMLINK to that codex.js, so we must `rm -f` it first —
178
193
  # writing through the symlink would clobber codex.js itself, making the wrapper exec itself
179
194
  # (infinite recursion -> codex hangs on launch).
180
- RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IHdpcmVkIHRvIEdQVC01LjUgb24gQVdTIEJlZHJvY2sgKHVzLWVhc3QtMiBtYW50bGUgZW5kcG9pbnQpLgojIEF1dGggdmlhIHRoZSBwb2QgSVJTQSAtPiBzaG9ydC1saXZlZCAofjEyaCkgQmVkcm9jayBiZWFyZXIgdG9rZW47IG5vIHBlci11c2VyIGtleS4KIyBSZWFzb25pbmcgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgZW52IChkZWZhdWx0IGhpZ2gpLiBUaGUgY29uZmlnIGlzIChyZSl3cml0dGVuIG9uCiMgZXZlcnkgbGF1bmNoOiAvaG9tZS9kZXYgaXMgZXBoZW1lcmFsLCBhbmQgdGhpcyBhbHNvIHNlbGYtaGVhbHMgYSAvbW9kZWwgbWlzaGFwCiMgKHRoZSBwaWNrZXIgY2FuIGNvcnJ1cHQgdGhlIG1vZGVsIGlkOyByZXN0YXJ0aW5nIGNvZGV4IHJlc2V0cyBpdCkuCnNldCArZQpSRUFMPS91c3IvbG9jYWwvbGliL25vZGVfbW9kdWxlcy9Ab3BlbmFpL2NvZGV4L2Jpbi9jb2RleC5qcwpFRkZPUlQ9IiR7Q09ERVhfRUZGT1JUOi1oaWdofSIKbWtkaXIgLXAgIiRIT01FLy5jb2RleCIgMj4vZGV2L251bGwKY2F0ID4gIiRIT01FLy5jb2RleC9jb25maWcudG9tbCIgPDxDRkcKbW9kZWwgPSAib3BlbmFpLmdwdC01LjUiCm1vZGVsX3Byb3ZpZGVyID0gImJlZHJvY2siCndlYl9zZWFyY2ggPSAiZGlzYWJsZWQiCm1vZGVsX2NvbnRleHRfd2luZG93ID0gMjcyMDAwCm1vZGVsX21heF9vdXRwdXRfdG9rZW5zID0gMTI4MDAwCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKClttb2RlbF9wcm92aWRlcnMuYmVkcm9ja10KbmFtZSA9ICJBV1MgQmVkcm9jayAoR1BULTUuNSkiCmJhc2VfdXJsID0gImh0dHBzOi8vYmVkcm9jay1tYW50bGUudXMtZWFzdC0yLmFwaS5hd3Mvb3BlbmFpL3YxIgplbnZfa2V5ID0gIk9QRU5BSV9BUElfS0VZIgp3aXJlX2FwaSA9ICJyZXNwb25zZXMiCkNGRwpUT0s9IiQoL3Vzci9iaW4vcHl0aG9uMyAtYyAnZnJvbSBhd3NfYmVkcm9ja190b2tlbl9nZW5lcmF0b3IgaW1wb3J0IHByb3ZpZGVfdG9rZW47IHByaW50KHByb3ZpZGVfdG9rZW4ocmVnaW9uPSJ1cy1lYXN0LTIiKSknIDI+L2Rldi9udWxsKSIKWyAtbiAiJFRPSyIgXSAmJiBleHBvcnQgT1BFTkFJX0FQSV9LRVk9IiRUT0siCmV4ZWMgIiRSRUFMIiAiJEAiCg==' | base64 -d > /usr/local/bin/codex && chmod 0755 /usr/local/bin/codex
195
+ RUN rm -f /usr/local/bin/codex && echo 'IyEvdXNyL2Jpbi9lbnYgYmFzaAojIENvZGV4IG9uIE9wZW5BSSBncHQtNS54IHZpYSBBV1MgQmVkcm9jayB1c2luZyBjb2RleCdzIE5BVElWRSBgYW1hem9uLWJlZHJvY2tgCiMgcHJvdmlkZXIuIFJlZ2lvbiB1cy1lYXN0LTEgKGdwdC01LnggTWFudGxlIHJlZ2lvbikuIEF1dGg6IGEgc2hvcnQtbGl2ZWQgQmVkcm9jawojIGJlYXJlciB0b2tlbiBtaW50ZWQgZnJvbSB0aGUgcG9kIElSU0EgKG5vIHBlci11c2VyIGtleSkuIE1vZGVsIHZpYSBDT0RFWF9NT0RFTAojIChkZWZhdWx0IG9wZW5haS5ncHQtNS40KSwgZWZmb3J0IHZpYSBDT0RFWF9FRkZPUlQgKGhpZ2gpLgojCiMgbW9kZWxfY29udGV4dF93aW5kb3cgaXMgc2V0IGV4cGxpY2l0bHkgYmVjYXVzZSBjb2RleCdzIGNhdGFsb2cgZG9lc24ndCBrbm93IHRoZQojIEJlZHJvY2stcHJlZml4ZWQgaWQgIm9wZW5haS5ncHQtNS54IiBhbmQgb3RoZXJ3aXNlIHdhcm5zICJNb2RlbCBtZXRhZGF0YSBub3QgZm91bmQsCiMgZGVmYXVsdGluZyB0byBmYWxsYmFjayBtZXRhZGF0YSIuIDI3MjAwMCBpcyBncHQtNS41J3MgYnVuZGxlZCBjb250ZXh0IHdpbmRvdy4KIwojIGdwdC01LjUgbm90ZSAoMjAyNi0wNi0xNik6IHByb3Zpc2lvbmVkIGluIHVzLWVhc3QtMSBidXQgbWlkLXJvbGxvdXQg4oCUIH4zMCUgb2YgY2FsbHMKIyBzdGlsbCA0MDQgIkVuZ2luZSBub3QgZm91bmQiLiBEZWZhdWx0IHN0YXlzIGdwdC01LjQgKHNvbGlkKTsgc3dpdGNoIHRoZSBkZWZhdWx0IHRvCiMgb3BlbmFpLmdwdC01LjUgb25jZSBBV1Mgc3RhYmlsaXplcywgb3Igb3B0IGluIG5vdyB3aXRoIENPREVYX01PREVMPW9wZW5haS5ncHQtNS41LgpzZXQgK2UKTU9ERUw9IiR7Q09ERVhfTU9ERUw6LW9wZW5haS5ncHQtNS40fSIKRUZGT1JUPSIke0NPREVYX0VGRk9SVDotaGlnaH0iCmV4cG9ydCBBV1NfUkVHSU9OPXVzLWVhc3QtMSBBV1NfREVGQVVMVF9SRUdJT049dXMtZWFzdC0xCm1rZGlyIC1wICIkSE9NRS8uY29kZXgiCmNhdCA+ICIkSE9NRS8uY29kZXgvY29uZmlnLnRvbWwiIDw8Q0ZHCm1vZGVsX3Byb3ZpZGVyID0gImFtYXpvbi1iZWRyb2NrIgptb2RlbCA9ICIkTU9ERUwiCm1vZGVsX3JlYXNvbmluZ19lZmZvcnQgPSAiJEVGRk9SVCIKbW9kZWxfY29udGV4dF93aW5kb3cgPSAyNzIwMDAKd2ViX3NlYXJjaCA9ICJkaXNhYmxlZCIKQ0ZHClRPSz0iJCgvdXNyL2Jpbi9weXRob24zIC1jICdmcm9tIGF3c19iZWRyb2NrX3Rva2VuX2dlbmVyYXRvciBpbXBvcnQgcHJvdmlkZV90b2tlbjsgcHJpbnQocHJvdmlkZV90b2tlbihyZWdpb249InVzLWVhc3QtMSIpKScgMj4vZGV2L251bGwpIgpbIC1uICIkVE9LIiBdICYmIGV4cG9ydCBBV1NfQkVBUkVSX1RPS0VOX0JFRFJPQ0s9IiRUT0siCmV4ZWMgL3Vzci9sb2NhbC9saWIvbm9kZV9tb2R1bGVzL0BvcGVuYWkvY29kZXgvYmluL2NvZGV4LmpzICIkQCIK' | base64 -d > /usr/local/bin/codex && chmod 
0755 /usr/local/bin/codex
181
196
 
182
197
  USER dev
183
198