gpu-dev 0.6.6__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/.github/workflows/publish.yml +6 -1
  2. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/CLAUDE.md +40 -0
  3. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/PKG-INFO +11 -15
  4. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/README.md +9 -13
  5. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +301 -35
  6. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +101 -0
  7. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +12 -0
  8. gpu_dev-0.7.1/docs/SDK_REPRO.md +73 -0
  9. {gpu_dev-0.6.6/cli-tools/gpu-dev-cli → gpu_dev-0.7.1}/gpu_dev.egg-info/PKG-INFO +11 -15
  10. {gpu_dev-0.6.6/cli-tools/gpu-dev-cli → gpu_dev-0.7.1}/gpu_dev.egg-info/SOURCES.txt +31 -7
  11. {gpu_dev-0.6.6/cli-tools/gpu-dev-cli → gpu_dev-0.7.1}/gpu_dev.egg-info/top_level.txt +1 -0
  12. gpu_dev-0.7.1/presentation/architecture.html +401 -0
  13. gpu_dev-0.7.1/presentation/cli-demo.html +381 -0
  14. gpu_dev-0.7.1/presentation/devgpu-features.html +292 -0
  15. gpu_dev-0.7.1/presentation/docker-mark-blue.svg +12 -0
  16. gpu_dev-0.7.1/presentation/feedback.png +0 -0
  17. gpu_dev-0.7.1/presentation/gpu-fleet.html +355 -0
  18. gpu_dev-0.7.1/presentation/icons8-cursor-ai.svg +1 -0
  19. gpu_dev-0.7.1/presentation/index.html +563 -0
  20. gpu_dev-0.7.1/presentation/k8s-under-the-hood.html +369 -0
  21. gpu_dev-0.7.1/presentation/multinode.html +421 -0
  22. gpu_dev-0.7.1/presentation/osdc-future-plans.html +191 -0
  23. gpu_dev-0.7.1/presentation/problem.png +0 -0
  24. gpu_dev-0.7.1/presentation/sandbox.html +745 -0
  25. gpu_dev-0.7.1/presentation/sdk-demo.html +518 -0
  26. gpu_dev-0.7.1/presentation/thesis.html +70 -0
  27. gpu_dev-0.7.1/presentation/title-vid.mp4 +0 -0
  28. gpu_dev-0.7.1/presentation/weneedgpus.png +0 -0
  29. gpu_dev-0.7.1/presentation/wow.html +166 -0
  30. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/pyproject.toml +15 -5
  31. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/examples/parallel_experiments.ipynb +18 -16
  32. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/__init__.py +7 -1
  33. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/_backend/aws.py +90 -1
  34. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/_sync/client.py +44 -1
  35. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/common/models.py +1 -0
  36. gpu_dev-0.7.1/terraform-deck/backend.tf +8 -0
  37. gpu_dev-0.7.1/terraform-deck/main.tf +223 -0
  38. gpu_dev-0.7.1/terraform-deck/terraform.tfvars.example +4 -0
  39. gpu_dev-0.7.1/terraform-gpu-devservers/build-node.tf +99 -0
  40. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/Dockerfile +8 -2
  41. gpu_dev-0.7.1/terraform-gpu-devservers/git-cache.tf +539 -0
  42. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +26 -2
  43. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/availability_updater/index.py +5 -0
  44. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/reservation_processor/index.py +818 -40
  45. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/shared/dns_utils.py +5 -0
  46. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda.tf +46 -10
  47. gpu_dev-0.7.1/terraform-gpu-devservers/pytorch-prebuild.tf +194 -0
  48. gpu_dev-0.7.1/terraform-gpu-devservers/warm-pool.tf +31 -0
  49. gpu_dev-0.6.6/sdk/python/pyproject.toml +0 -27
  50. gpu_dev-0.6.6/terraform-gpu-devservers/git-cache.tf +0 -313
  51. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/.github/workflows/no-gitlinks.yml +0 -0
  52. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/.gitignore +0 -0
  53. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/README.md +0 -0
  54. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/admin/README.md +0 -0
  55. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/admin/generate_stats.py +0 -0
  56. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/admin/requirements.txt +0 -0
  57. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  58. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  59. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  60. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  61. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  62. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  63. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  64. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  65. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  66. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/docs/USER_GUIDE.md +0 -0
  67. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/docs/devgpu-features.html +0 -0
  68. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/docs/docker-mark-blue.svg +0 -0
  69. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/docs/icons8-cursor-ai.svg +0 -0
  70. {gpu_dev-0.6.6/cli-tools/gpu-dev-cli → gpu_dev-0.7.1}/gpu_dev.egg-info/dependency_links.txt +0 -0
  71. {gpu_dev-0.6.6/cli-tools/gpu-dev-cli → gpu_dev-0.7.1}/gpu_dev.egg-info/entry_points.txt +0 -0
  72. {gpu_dev-0.6.6/cli-tools/gpu-dev-cli → gpu_dev-0.7.1}/gpu_dev.egg-info/requires.txt +0 -0
  73. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/README.md +0 -0
  74. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/examples/batch_multi_gpu.py +0 -0
  75. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/examples/interactive_debug.py +0 -0
  76. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/examples/quickstart.ipynb +0 -0
  77. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/examples/run_tests.py +0 -0
  78. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/examples/submit_job.py +0 -0
  79. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
  80. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  81. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
  82. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  83. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
  84. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  85. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
  86. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  87. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/common/config.py +0 -0
  88. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/common/enums.py +0 -0
  89. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/common/errors.py +0 -0
  90. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/src/gpu_dev/py.typed +0 -0
  91. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/tests/__init__.py +0 -0
  92. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/sdk/python/tests/test_models.py +0 -0
  93. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/setup.cfg +0 -0
  94. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  95. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  96. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/README.md +0 -0
  97. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/alb.tf +0 -0
  98. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/ami-baker.tf +0 -0
  99. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/availability.tf +0 -0
  100. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/backend.tf +0 -0
  101. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/check_b200.py +0 -0
  102. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  103. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  104. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  105. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  106. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/bash_profile +0 -0
  107. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/bashrc +0 -0
  108. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  109. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  110. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  111. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  112. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/motd_script +0 -0
  113. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  114. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/profile +0 -0
  115. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  116. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  117. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  118. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/shell_env +0 -0
  119. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/ssh_config +0 -0
  120. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/zprofile +0 -0
  121. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/zshrc +0 -0
  122. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  123. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker-build.tf +0 -0
  124. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  125. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  126. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/ecr.tf +0 -0
  127. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/efs.tf +0 -0
  128. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/eks.tf +0 -0
  129. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/expiry.tf +0 -0
  130. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/kubernetes.tf +0 -0
  131. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  132. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  133. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  134. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  135. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  136. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  137. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  138. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  139. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  140. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  141. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  142. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  143. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/list_b200.py +0 -0
  144. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/main.tf +0 -0
  145. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/mig-config.tf +0 -0
  146. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  147. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  148. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  149. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  150. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  151. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  152. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/monitoring.tf +0 -0
  153. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  154. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/outputs.tf +0 -0
  155. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/pyproject.toml +0 -0
  156. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/queue.tf +0 -0
  157. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/route53.tf +0 -0
  158. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  159. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  160. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  161. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  162. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  163. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  164. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  165. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  166. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  167. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  168. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  169. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/switch-to.sh +0 -0
  170. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  171. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  172. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
  173. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  174. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  175. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/terraform-gpu-devservers/variables.tf +0 -0
  176. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/tests/submit/README.md +0 -0
  177. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/tests/submit/fail/run.sh +0 -0
  178. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/tests/submit/multinode/run.sh +0 -0
  179. {gpu_dev-0.6.6 → gpu_dev-0.7.1}/tests/submit/success/run.sh +0 -0
@@ -28,7 +28,7 @@ jobs:
28
28
  echo "::error::Tag version ($TAG_VERSION) does not match package version ($PKG_VERSION)"
29
29
  exit 1
30
30
  fi
31
- - name: Build package
31
+ - name: "Build package (gpu-dev = CLI + SDK)"
32
32
  run: uv build
33
33
  - name: Generate attestations
34
34
  uses: actions/attest-build-provenance@v2
@@ -36,3 +36,8 @@ jobs:
36
36
  subject-path: dist/*
37
37
  - name: Publish to PyPI
38
38
  uses: pypa/gh-action-pypi-publish@release/v1
39
+ with:
40
+ # Multi-package + re-run safety: skip any file already on PyPI (e.g. a
41
+ # gpu-dev version that published before a sibling failed) instead of
42
+ # erroring on the duplicate.
43
+ skip-existing: true
@@ -51,6 +51,41 @@ Currently we're working on a developer servers with GPUs in AWS. This means we'l
51
51
 
52
52
  # AGENT SECTION
53
53
 
54
+ ## Instant-sandboxes branch — WIP & things to fix (2026-05-29)
55
+
56
+ Big push on warm pools + instant claims + prebuilt pytorch. Tracking state here so it's not lost.
57
+
58
+ **Committed, needs deploy/activation:**
59
+ - `tf apply` (branch `instant-sandboxes`): warm-pool reconciler + fail-open claim hook, async hot-refill on claim, async per-user EFS mount, processor self-invoke IAM, Bedrock marketplace perms on pod IRSA, pytorch `ref` staging, availability counts warm-ready as available, git-cache worktree snapshot + `pytorch-snapshot` DaemonSet, processor Function URL.
60
+ - Reinstall **CLI + SDK**: `--direct` (default on) synchronous claim, `--ref` (pr/commit/branch), `--no-persist`+`--disk` conflict guard, Function-URL cache (`~/.config/gpu-dev/direct-url.json`).
61
+ - Rebuild **gpu-dev image**: Claude Code cache-bust (latest), `~/.local/bin` on PATH (bash+zsh, all disks).
62
+ - **Meta/fbcode**: grant the user IAM role `lambda:InvokeFunctionUrl` + `lambda:GetFunctionUrlConfig` (scoped to reservation-processor) so `--direct` works; otherwise it falls back to SQS silently.
63
+
64
+ **Prebuilt viable/strict + warm ccache (importable torch + marginal C++ build) — COMMITTED on `instant-sandboxes`, needs `tf apply`:**
65
+ - [x] Dedicated `m7i.48xlarge` build node group (always-on). `build-node.tf`, node `ip-10-0-26-237` up.
66
+ - [x] Hourly **stateful incremental** build CronJob (`pytorch-prebuild.tf`): `concurrencyPolicy=Forbid` + flock (the "build queue"), **CUDA 13.2** (matches the cu13 nvshmem ABI in the image — 12.8 fails at nvlink), `TORCH_CUDA_ARCH_LIST=9.0;10.0` (see arch note below), `BUILD_TEST=0`, builds at **`/home/dev/pytorch`** on a hostPath (path-match for relocatable incremental), `CCACHE_DIR=/ccache_shared/build-node`, only when viable/strict SHA bumps. Publishes via rsync to `/ccache_shared/prebuilt/pytorch-<arch>`.
67
+ - [x] `pytorch-snapshot` DaemonSet (in `git-cache.tf`) arch-aware: rsyncs the built tree from the shared EFS to each node's `/mnt/nvme/pytorch-built` (arch via `uname -m`; arm skips gracefully). Existing master worktree HTTP pull unchanged.
68
+ - [x] `stage-pytorch` (lambda) reflink-copies the built tree into `/home/dev/pytorch` + sets `PYTHONPATH` (`/etc/profile.d/zz-pytorch.sh` + `*_ext`) so `import torch` works with no pod-side build. With `--ref`: same tree (warm `build/`), checkout the ref, rebuild is incremental. Applies to warm pods too.
69
+ - **Publish/cache decision:** reuse the existing `ccache_shared` EFS (everyone already mounts it) under `/prebuilt`; no new EFS/S3. EFS here = plain NFS volume mounts, not CSI. ccache is shared by build node + ALL dev pods (incl persistent-disk) so a user's own build benefits from the build node's compiles.
70
+ - **Validated build numbers** (m7i, 128 jobs, CUDA 13.2, `9.0;10.0`, BUILD_TEST=0): cold (build/ gone, ccache 86% warm = node-replacement case) **~21m**; incremental (1 cutlass kernel + 386MB relink) **~42s**; ninja no-op **~22s**; ccache **86.5%** hit. Result: `torch 2.13.0a0`, imports, `get_arch_list()=['sm_90','sm_100']`.
71
+ - [ ] **Cleanup:** delete the manual test pod `gpu-dev-buildtest` (gpu-dev ns) — done with empirical measurement (kept for now in case more measurements needed). It holds a warm `/root/pt` build tree.
72
+ - [ ] **Reflink caveat:** stage-pytorch uses `cp -a --reflink=auto || cp -a`. For the drop-in to be *instant* (not a 20-40GB copy), the pod's `/home/dev` (dev-home emptyDir) and the node's `/mnt/nvme` must be the **same filesystem**. Verify node bootstrap puts kubelet emptyDir on `/mnt/nvme`; else it falls back to a full copy (correct, slower).
73
+
74
+ **To fix / todo:**
75
+ - [ ] **Direct/warm claim path drops `--ref` and `--no-persist`:** a `reserve --ref X --no-persist` (no `--disk`) still satisfies the line-1388 `claim_direct` condition (it doesn't exclude `ref`), so it goes the warm/direct path which doesn't carry `ref`/`no_persistent_disk` → the user got their **default persistent disk** + no PR staged (reservation `5e83bb5b`: `no_persistent_disk=false, disk_name=default, pytorch_ref=null, version=null`). Fix: exclude `ref` (and honor `no_persistent_disk`) from the direct fast-path, OR thread `ref`/`no_persistent_disk` through `claim_direct`+`handle_direct_claim`. Workaround for now: `--no-direct --no-persist --ref`.
76
+ - [x] **Warm full-GPU (1-GPU) pods + evict-on-demand** (DONE, commit c1211e3): `_evict_warm_for_capacity` deletes the minimum warm-ready pods on a single node when no node has enough free GPUs (gated in `get_target_az_for_reservation` before the Pending fallback; reconciler tops the pool back up). Also covers full **MIG** nodes filling up (not just full-GPU) — warm pods no longer block 2/4/8-GPU or full-node requests. Added `WARM_POOL_TARGETS` `h100:1, b200:1` (safe now that they're evictable). `get_available_gpus_on_node` counts warm pods as used, so placement avoids them until eviction frees them. Needs `tf apply`.
77
+ - [ ] **CLI install hygiene:** user's `~/.venv` has BOTH `gpu-dev 0.6.6` (editable→repo) and a stale duplicate `gpu-dev-cli 0.3.5` (also editable, same dir, different dist name). `pip uninstall gpu-dev-cli` to remove the confusing duplicate; the real package is `gpu-dev`.
78
+ - [ ] **Publish via tarball, not rsync-to-EFS:** rsync of the raw tree (.git + build/ = 100k+ small files) to EFS stalled at 0 files in 13min (NFS per-file round-trips). Switched publish + DaemonSet to a single `zstd` tarball (sequential I/O). (committed)
79
+ - [ ] **Prebuilt built WITHOUT cuDNN** — `import torch` warns "compiled without cuDNN/MIOpen". CI/nightly build with cudnn9. Add libcudnn to the gpu-dev image + `USE_CUDNN=1` to the build recipe for fidelity (conv/cudnn-dependent ops + tests). Irrelevant for flex-attention int64 test; matters generally.
80
+ - [ ] **`--ref pr/N` uses `pull/N/head`, not `/merge`** — `/head` is the PR author's raw branch tip (often based on old trunk, missing trunk-added tests); CI tests `/merge` (PR merged onto current trunk). For CI-repro fidelity, `pr/N` should fetch `pull/N/merge` (fall back to `/head` if no merge ref). `stage-pytorch` REF case in `index.py`. (This is why `pull/185479/head` lacked `test_large_kv_int64_pointer_math_cuda`.)
81
+ - [ ] **Misleading disconnect/expiry message** — on `gpu-dev connect` connection loss OR reservation expiry, the CLI prints "❌ Authentication failed. You don't have SSH access... ask the primary user to add you" even for the PRIMARY user's own expired/cancelled reservation. Distinguish: (a) reservation expired -> "Reservation <id> expired at <time>"; (b) cancelled -> "Reservation was cancelled"; (c) connection dropped but still active -> "Connection lost, reconnect with gpu-dev connect <id>"; (d) genuine auth failure -> the current add-user message. Check reservation status before assuming auth failure.
82
+ - [ ] **`gpu-dev cancel` from inside the pod** — show "Shutting down this reservation..." (graceful message) instead of an abrupt SSH drop, so the user knows the disconnect was intentional.
83
+ - [ ] SSH CA certs to drop the ~0.33s `kubectl exec` key injection on warm claim (auth-model change).
84
+ - [ ] AMI baker re-bakes on every base-EKS-AMI roll (5 baked AMIs in 2 days): pin the base AMI version + clean up old `gpu-dev-baked-*`.
85
+ - [ ] **Warm pods: gate `warm-state=ready` on staging completion** (NOW MORE IMPORTANT — the built tree is ~30GB, and on GPU nodes it's a `cp` not reflink, so staging takes ~1-3min; a claim in that window hands over a half-copied tree). Two options: (a) claim-time check — exec `[ -f /home/dev/.pytorch-staging ]` in `try_claim_warm_pod`, skip pods still staging (simple, but adds ~0.5s exec to every warm claim); (b) label-flip — create with `warm-state=provisioning`, reconciler exec-checks staging + flips to `ready` (no claim latency, but 4 interacting changes: create label + reconciler flip + eviction must also target `provisioning` + claim already filters `ready`). Prefer (b). Marker: `.pytorch-staging` present during, removed when done; `.pytorch-ready` written at end.
86
+ - [ ] **Image-rebuild propagation gap:** pods use `imagePullPolicy=IfNotPresent` + `:latest`, so a rebuilt image does NOT reach pods until the node re-pulls. After every image rebuild you must `kubectl rollout restart daemonset gpu-dev-image-prepuller -n kube-system` (re-pull on all GPU nodes, ~5min) **and** recycle warm pods, else pods run the stale cached image (this is why claude/PATH looked unfixed). Automate later: reconciler recycles warm pods when the `:latest` digest changes (and/or trigger the prepuller restart from the image-build step).
87
+ - [x] **Prebuilt build archs (CORRECTED):** use plain `TORCH_CUDA_ARCH_LIST=9.0;10.0` — **NOT** `9.0a;10.0a`. You never put the `a` in the list yourself. PyTorch's `cmake/Codegen.cmake` (`_BUILD_FOR_ADDITIONAL_ARCHS`, gated on `compute_90`/`compute_100` being present) auto-adds `sm_90a`/`sm_100a` to exactly the cutlass kernels that need Hopper wgmma/TMA (`RowwiseScaledMM.cu`, `ScaledGroupMM.cu`, `GroupMM.cu`). Verified in `compile_commands.json`: the RowwiseScaledMM line shows all four (sm_90, sm_90a, sm_100, sm_100a). Forcing `9.0a` for the whole build is non-CI and would drop the plain SASS / other archs. Per-commit **trunk** CI builds narrow per-runner arch (`9.0` alone for H100 jobs, `10.0` for B200) — nightly builds the fat `7.5;8.0;8.6;9.0;10.0;12.0+PTX`; we match trunk + "9+" for our H100/B200 fleet. To add A100/T4/L4 later, widen to `8.0;8.9;9.0;10.0` (still one build). CUDA 13.2 (image default), not 12.8.
88
+
54
89
  ## Issues I found with the description above
55
90
 
56
91
  - I am not sure terraform-aws-github-runner is correctly described. Next time I go over this code for maintenance or adding something, I'll inform the user of what I think should change. This is not an active goal though, just a sidequest.
@@ -329,6 +364,11 @@ module "us_east_1" {
329
364
  - **Scale up T4 instances** - Add 3 more T4 nodes (g4dn.12xlarge) to cluster
330
365
  - **Scale up L4 instances** - Add 3 more L4 nodes (g6.12xlarge) to cluster
331
366
  - **Add on-demand H100/H200/B200 capacity** - Add at least 2 nodes each of H100 (p5.48xlarge), H200 (p5e.48xlarge), and B200 (p6-b200.48xlarge) as on-demand capacity in addition to existing reserved instances
367
+ - **Run pytorch tests via gpu-dev** - Add a way to run a specific test / set of tests in ../pytorch (see `python run.py` in pytorch for how tests are normally invoked). Short term: `gpu-dev test <paths/test ids>` that reserves, stages pytorch (via --ref), and runs the test command. Long term (stretch, "magic TD"): an agent does target determination from the repo diff, picks the affected tests, kicks off a gpu-dev run, and streams test output back. Builds on the warm-pool + pytorch-snapshot work (instant-sandboxes branch).
368
+ - **Warm pool follow-ups** (from instant-sandboxes branch):
369
+ - Claim-with-ref: today an explicit `--ref` skips the warm pool (cold path). Could instead claim a warm pod and incrementally `git fetch`+checkout the ref in-place.
370
+ - Availability display: warm-ready pods count as "used" in the availability table, so `gpu-dev avail` under-reports free MIG/CPU even though a claim is instant. Reconcile the display with warm claimability.
371
+ - CPU/MIG node disk: the pytorch-snapshot DaemonSet writes ~5-10GB to /mnt/nvme (root disk on nodes without instance NVMe); confirm CPU dev node root volumes are sized for it.
332
372
  - **Future features**:
333
373
  - Multi-server (16 GPU) reservations
334
374
  - GitHub organization/team verification
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.6.6
4
- Summary: CLI tool for PyTorch GPU developer server reservations
3
+ Version: 0.7.1
4
+ Summary: CLI + Python SDK for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
7
7
  Description-Content-Type: text/markdown
@@ -34,7 +34,7 @@ print(result.stdout)
34
34
  sandbox.cancel()
35
35
  ```
36
36
 
37
- Install: `pip install -e sdk/python/` see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
37
+ The SDK ships inside the `gpu-dev` package: `pip install gpu-dev`, then `from gpu_dev import GpuDev`. See [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
38
38
 
39
39
  ---
40
40
 
@@ -701,23 +701,19 @@ gpu-dev disk list-content <disk-name>
701
701
  ### Getting Help
702
702
 
703
703
  - Use `gpu-dev help` or `gpu-dev <command> --help`
704
- - Report issues: https://github.com/anthropics/claude-code/issues
704
+ - Report issues: https://github.com/wdvr/osdc/issues
705
705
 
706
706
  ---
707
707
 
708
708
  ## Development
709
709
 
710
710
  ```bash
711
- # Install development dependencies
712
- poetry install --with dev
713
-
714
- # Run tests
715
- poetry run pytest
716
-
717
- # Format code
718
- poetry run black .
719
- poetry run isort .
711
+ # Editable install from the repo (one package: CLI + SDK)
712
+ pip install -e .
720
713
 
721
- # Type checking
722
- poetry run mypy .
714
+ # Build the distribution the way CI does (uv)
715
+ uv build # gpu-dev (CLI + SDK)
723
716
  ```
717
+
718
+ Releases are tag-driven: pushing a `v*` tag runs `.github/workflows/publish.yml`,
719
+ which builds and publishes the single `gpu-dev` package (CLI + SDK) to PyPI.
@@ -16,7 +16,7 @@ print(result.stdout)
16
16
  sandbox.cancel()
17
17
  ```
18
18
 
19
- Install: `pip install -e sdk/python/` see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
19
+ The SDK ships inside the `gpu-dev` package: `pip install gpu-dev`, then `from gpu_dev import GpuDev`. See [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
20
20
 
21
21
  ---
22
22
 
@@ -683,23 +683,19 @@ gpu-dev disk list-content <disk-name>
683
683
  ### Getting Help
684
684
 
685
685
  - Use `gpu-dev help` or `gpu-dev <command> --help`
686
- - Report issues: https://github.com/anthropics/claude-code/issues
686
+ - Report issues: https://github.com/wdvr/osdc/issues
687
687
 
688
688
  ---
689
689
 
690
690
  ## Development
691
691
 
692
692
  ```bash
693
- # Install development dependencies
694
- poetry install --with dev
695
-
696
- # Run tests
697
- poetry run pytest
698
-
699
- # Format code
700
- poetry run black .
701
- poetry run isort .
693
+ # Editable install from the repo (one package: CLI + SDK)
694
+ pip install -e .
702
695
 
703
- # Type checking
704
- poetry run mypy .
696
+ # Build the distribution the way CI does (uv)
697
+ uv build # gpu-dev (CLI + SDK)
705
698
  ```
699
+
700
+ Releases are tag-driven: pushing a `v*` tag runs `.github/workflows/publish.yml`,
701
+ which builds and publishes the single `gpu-dev` package (CLI + SDK) to PyPI.