gpu-dev 0.5.32__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/PKG-INFO +23 -3
  2. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/README.md +22 -2
  3. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +23 -3
  4. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +21 -0
  5. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +1 -1
  6. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +134 -68
  7. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +63 -15
  8. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +15 -2
  9. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +45 -14
  10. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/pyproject.toml +1 -1
  11. gpu_dev-0.6.2/sdk/python/README.md +187 -0
  12. gpu_dev-0.6.2/sdk/python/examples/quickstart.ipynb +365 -0
  13. gpu_dev-0.6.2/sdk/python/pyproject.toml +27 -0
  14. gpu_dev-0.6.2/sdk/python/src/gpu_dev/__init__.py +60 -0
  15. gpu_dev-0.6.2/sdk/python/src/gpu_dev/_async/__init__.py +2 -0
  16. gpu_dev-0.6.2/sdk/python/src/gpu_dev/_backend/aws.py +315 -0
  17. gpu_dev-0.6.2/sdk/python/src/gpu_dev/_backend/protocol.py +53 -0
  18. gpu_dev-0.6.2/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  19. gpu_dev-0.6.2/sdk/python/src/gpu_dev/_sync/client.py +245 -0
  20. gpu_dev-0.6.2/sdk/python/src/gpu_dev/_sync/sandbox.py +243 -0
  21. gpu_dev-0.6.2/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  22. gpu_dev-0.6.2/sdk/python/src/gpu_dev/_transport/ssh.py +121 -0
  23. gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  24. gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/config.py +45 -0
  25. gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/enums.py +44 -0
  26. gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/errors.py +33 -0
  27. gpu_dev-0.6.2/sdk/python/src/gpu_dev/common/models.py +73 -0
  28. gpu_dev-0.6.2/sdk/python/src/gpu_dev/py.typed +0 -0
  29. gpu_dev-0.6.2/sdk/python/tests/__init__.py +0 -0
  30. gpu_dev-0.6.2/sdk/python/tests/test_models.py +69 -0
  31. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/kubernetes.tf +66 -70
  32. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_processor/index.py +308 -171
  33. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda.tf +29 -5
  34. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/main.tf +6 -4
  35. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/mig-config.tf +2 -1
  36. gpu_dev-0.6.2/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  37. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/al2023-user-data.sh +91 -4
  38. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +3 -0
  39. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/.github/workflows/no-gitlinks.yml +0 -0
  40. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/.github/workflows/publish.yml +0 -0
  41. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/.gitignore +0 -0
  42. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/CLAUDE.md +0 -0
  43. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/README.md +0 -0
  44. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/admin/README.md +0 -0
  45. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/admin/generate_stats.py +0 -0
  46. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/admin/requirements.txt +0 -0
  47. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  48. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  49. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  50. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  51. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  52. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  53. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  54. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  55. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  56. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  57. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  58. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/docs/USER_GUIDE.md +0 -0
  59. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/docs/devgpu-features.html +0 -0
  60. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/docs/docker-mark-blue.svg +0 -0
  61. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/docs/icons8-cursor-ai.svg +0 -0
  62. /gpu_dev-0.5.32/terraform-gpu-devservers/subnet-0fe3a2c45570091ad → /gpu_dev-0.6.2/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  63. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/setup.cfg +0 -0
  64. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  65. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  66. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/README.md +0 -0
  67. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/alb.tf +0 -0
  68. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ami-baker.tf +0 -0
  69. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/availability.tf +0 -0
  70. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/backend.tf +0 -0
  71. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/check_b200.py +0 -0
  72. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  73. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  74. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  75. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  76. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  77. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/bash_profile +0 -0
  78. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/bashrc +0 -0
  79. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  80. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  81. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  82. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  83. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/motd_script +0 -0
  84. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  85. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/profile +0 -0
  86. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  87. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  88. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  89. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/shell_env +0 -0
  90. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/ssh_config +0 -0
  91. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/zprofile +0 -0
  92. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/zshrc +0 -0
  93. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  94. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker-build.tf +0 -0
  95. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  96. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  97. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ecr.tf +0 -0
  98. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/efs.tf +0 -0
  99. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/eks.tf +0 -0
  100. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/expiry.tf +0 -0
  101. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/git-cache.tf +0 -0
  102. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  103. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  104. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  105. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  106. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  107. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  108. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  109. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  110. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  111. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  112. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  113. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  114. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  115. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  116. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  117. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/list_b200.py +0 -0
  118. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  119. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  120. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  121. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  122. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  123. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  124. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/monitoring.tf +0 -0
  125. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  126. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/outputs.tf +0 -0
  127. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/pyproject.toml +0 -0
  128. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/queue.tf +0 -0
  129. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/route53.tf +0 -0
  130. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  131. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  132. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  133. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  134. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  135. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  136. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  137. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  138. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  139. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  140. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/switch-to.sh +0 -0
  141. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  142. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  143. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  144. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/terraform-gpu-devservers/variables.tf +0 -0
  145. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/tests/submit/README.md +0 -0
  146. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/tests/submit/fail/run.sh +0 -0
  147. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/tests/submit/multinode/run.sh +0 -0
  148. {gpu_dev-0.5.32 → gpu_dev-0.6.2}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.32
3
+ Version: 0.6.2
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -16,9 +16,29 @@ Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
18
 
19
- # GPU Developer CLI
19
+ # GPU Developer CLI & SDK
20
20
 
21
- A command-line tool for reserving and managing GPU development servers on AWS EKS.
21
+ A command-line tool and Python SDK for reserving and managing GPU development servers.
22
+
23
+ ## Python SDK
24
+
25
+ For programmatic access, use the [Python SDK](../../sdk/python/README.md):
26
+
27
+ ```python
28
+ from gpu_dev import GpuDev
29
+
30
+ client = GpuDev()
31
+ sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
32
+ result = sandbox.exec("nvidia-smi")
33
+ print(result.stdout)
34
+ sandbox.cancel()
35
+ ```
36
+
37
+ Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
38
+
39
+ ---
40
+
41
+ ## CLI
22
42
 
23
43
  ## Table of Contents
24
44
 
@@ -1,6 +1,26 @@
1
- # GPU Developer CLI
1
+ # GPU Developer CLI & SDK
2
2
 
3
- A command-line tool for reserving and managing GPU development servers on AWS EKS.
3
+ A command-line tool and Python SDK for reserving and managing GPU development servers.
4
+
5
+ ## Python SDK
6
+
7
+ For programmatic access, use the [Python SDK](../../sdk/python/README.md):
8
+
9
+ ```python
10
+ from gpu_dev import GpuDev
11
+
12
+ client = GpuDev()
13
+ sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
14
+ result = sandbox.exec("nvidia-smi")
15
+ print(result.stdout)
16
+ sandbox.cancel()
17
+ ```
18
+
19
+ Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
20
+
21
+ ---
22
+
23
+ ## CLI
4
24
 
5
25
  ## Table of Contents
6
26
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.32
3
+ Version: 0.6.2
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -16,9 +16,29 @@ Requires-Dist: websockets>=12.0
16
16
  Requires-Dist: certifi>=2023.7.22
17
17
  Requires-Dist: mcp>=1.0.0
18
18
 
19
- # GPU Developer CLI
19
+ # GPU Developer CLI & SDK
20
20
 
21
- A command-line tool for reserving and managing GPU development servers on AWS EKS.
21
+ A command-line tool and Python SDK for reserving and managing GPU development servers.
22
+
23
+ ## Python SDK
24
+
25
+ For programmatic access, use the [Python SDK](../../sdk/python/README.md):
26
+
27
+ ```python
28
+ from gpu_dev import GpuDev
29
+
30
+ client = GpuDev()
31
+ sandbox = client.reserve(gpu_type="h100", gpu_count=2, hours=4)
32
+ result = sandbox.exec("nvidia-smi")
33
+ print(result.stdout)
34
+ sandbox.cancel()
35
+ ```
36
+
37
+ Install: `pip install -e sdk/python/` — see [SDK docs](../../sdk/python/README.md) and [quickstart notebook](../../sdk/python/examples/quickstart.ipynb).
38
+
39
+ ---
40
+
41
+ ## CLI
22
42
 
23
43
  ## Table of Contents
24
44
 
@@ -30,6 +30,27 @@ docs/USER_GUIDE.md
30
30
  docs/devgpu-features.html
31
31
  docs/docker-mark-blue.svg
32
32
  docs/icons8-cursor-ai.svg
33
+ sdk/python/README.md
34
+ sdk/python/pyproject.toml
35
+ sdk/python/examples/quickstart.ipynb
36
+ sdk/python/src/gpu_dev/__init__.py
37
+ sdk/python/src/gpu_dev/py.typed
38
+ sdk/python/src/gpu_dev/_async/__init__.py
39
+ sdk/python/src/gpu_dev/_backend/__init__.py
40
+ sdk/python/src/gpu_dev/_backend/aws.py
41
+ sdk/python/src/gpu_dev/_backend/protocol.py
42
+ sdk/python/src/gpu_dev/_sync/__init__.py
43
+ sdk/python/src/gpu_dev/_sync/client.py
44
+ sdk/python/src/gpu_dev/_sync/sandbox.py
45
+ sdk/python/src/gpu_dev/_transport/__init__.py
46
+ sdk/python/src/gpu_dev/_transport/ssh.py
47
+ sdk/python/src/gpu_dev/common/__init__.py
48
+ sdk/python/src/gpu_dev/common/config.py
49
+ sdk/python/src/gpu_dev/common/enums.py
50
+ sdk/python/src/gpu_dev/common/errors.py
51
+ sdk/python/src/gpu_dev/common/models.py
52
+ sdk/python/tests/__init__.py
53
+ sdk/python/tests/test_models.py
33
54
  terraform-gpu-devservers/.terraform.lock.hcl
34
55
  terraform-gpu-devservers/README.md
35
56
  terraform-gpu-devservers/alb.tf
@@ -13,7 +13,7 @@ from rich.spinner import Spinner
13
13
  # SSH validation result is cached locally for _SSH_CACHE_TTL_SECONDS (defined below). New keys pushed to GitHub still take effect
14
14
  # at reservation time (pods fetch live keys via init container) — caching only skips the
15
15
  # pre-flight "are you who you say you are" check.
16
- _SSH_CACHE_TTL_SECONDS = 24 * 60 * 60
16
+ _SSH_CACHE_TTL_SECONDS = 14 * 24 * 60 * 60
17
17
  _SSH_CACHE_PATH = Path(os.path.expanduser("~/.config/gpu-dev/ssh-validation-cache.json"))
18
18
 
19
19
  # Cache for authenticate_user. STS GetCallerIdentity is stable per AWS profile and slow under SSO
@@ -41,33 +41,51 @@ from .interactive import (
41
41
  console = Console()
42
42
 
43
43
 
44
+ _east1_table = None
45
+
44
46
  def _fetch_reservations_cross_region(reservation_mgr, user_filter, statuses, config=None):
45
47
  """Fetch reservations from current region + prod-east1 if on prod."""
46
- reservations = reservation_mgr.list_reservations(
47
- user_filter=user_filter, statuses_to_include=statuses)
48
- # Cross-region fetch
49
- try:
48
+ global _east1_table
49
+
50
+ from concurrent.futures import ThreadPoolExecutor
51
+
52
+ def _fetch_primary():
53
+ return reservation_mgr.list_reservations(
54
+ user_filter=user_filter, statuses_to_include=statuses)
55
+
56
+ def _fetch_east1():
57
+ global _east1_table
50
58
  cfg = config or load_config()
51
- if cfg.user_config.get("environment") == "prod":
52
- east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
53
- if east1_env:
54
- import boto3 as _b3
55
- east1_ddb = _b3.resource("dynamodb", region_name=east1_env["region"])
56
- east1_table = east1_ddb.Table("pytorch-gpu-dev-reservations")
57
- for st in (statuses or ["active"]):
58
- resp = east1_table.query(
59
- IndexName="StatusIndex",
60
- KeyConditionExpression="#s = :status",
61
- ExpressionAttributeNames={"#s": "status"},
62
- ExpressionAttributeValues={":status": st},
63
- )
64
- for item in resp.get("Items", []):
65
- if user_filter and item.get("user_id") != user_filter:
66
- continue
67
- item["_region"] = "us-east-1"
68
- reservations.append(item)
59
+ if cfg.user_config.get("environment") != "prod":
60
+ return []
61
+ east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
62
+ if not east1_env or not user_filter:
63
+ return []
64
+ if _east1_table is None:
65
+ _east1_table = cfg.session.resource(
66
+ "dynamodb", region_name=east1_env["region"]
67
+ ).Table("pytorch-gpu-dev-reservations")
68
+ results = []
69
+ for st in (statuses or ["active"]):
70
+ resp = _east1_table.query(
71
+ IndexName="UserStatusIndex",
72
+ KeyConditionExpression="user_id = :uid AND #s = :status",
73
+ ExpressionAttributeNames={"#s": "status"},
74
+ ExpressionAttributeValues={":uid": user_filter, ":status": st},
75
+ )
76
+ for item in resp.get("Items", []):
77
+ item["_region"] = "us-east-1"
78
+ results.append(item)
79
+ return results
80
+
81
+ try:
82
+ with ThreadPoolExecutor(max_workers=2) as ex:
83
+ f1 = ex.submit(_fetch_primary)
84
+ f2 = ex.submit(_fetch_east1)
85
+ reservations = f1.result()
86
+ reservations.extend(f2.result())
69
87
  except Exception:
70
- pass
88
+ reservations = _fetch_primary()
71
89
  return reservations
72
90
 
73
91
 
@@ -608,6 +626,8 @@ def main(ctx: click.Context) -> None:
608
626
  )
609
627
  @click.option("--spot", is_flag=True, default=False,
610
628
  help="Acknowledge spot instance (~1/3 cost, may be preempted with 2-min notice). Required for spot-only types.")
629
+ @click.option("--fast-cache", is_flag=True, default=False, hidden=True,
630
+ help="Use NVMe local cache for faster session restore (experimental).")
611
631
  @click.pass_context
612
632
  def reserve(
613
633
  ctx: click.Context,
@@ -629,6 +649,7 @@ def reserve(
629
649
  disk: Optional[str],
630
650
  node_label: tuple,
631
651
  spot: bool = False,
652
+ fast_cache: bool = False,
632
653
  ) -> None:
633
654
  """Reserve GPU development server(s)
634
655
 
@@ -746,7 +767,10 @@ def reserve(
746
767
  else:
747
768
  f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
748
769
  ssh_result = None
749
- f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
770
+ # Only fetch availability if we need the interactive picker
771
+ need_interactive = gpu_type is None
772
+ if need_interactive:
773
+ f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
750
774
 
751
775
  # Surface auth failure first (most actionable).
752
776
  try:
@@ -758,7 +782,7 @@ def reserve(
758
782
 
759
783
  if ssh_result is None:
760
784
  ssh_result = f_ssh.result()
761
- availability_info = f_avail.result()
785
+ availability_info = f_avail.result() if need_interactive else None
762
786
 
763
787
  # Surface SSH validation failure with the same UX as before.
764
788
  if not ssh_result.get("valid"):
@@ -897,6 +921,13 @@ def reserve(
897
921
 
898
922
  else:
899
923
  # Non-interactive mode - use defaults and validate
924
+ # Route --spot to east1 when on prod (env vars override config region)
925
+ if spot and load_config().user_config.get("environment") == "prod":
926
+ east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
927
+ if east1_cfg:
928
+ import os as _os
929
+ _os.environ["AWS_REGION"] = east1_cfg["region"]
930
+
900
931
  if gpu_type is None:
901
932
  gpu_type = "a100"
902
933
  if hours is None:
@@ -1101,11 +1132,13 @@ def reserve(
1101
1132
  rprint(f"[red]❌ {str(e)}[/red]")
1102
1133
  return
1103
1134
 
1104
- # Validate SSH key matches configured GitHub username
1105
- live.update(Spinner("dots", text="🔐 Validating SSH key..."))
1135
+ # Validate SSH key matches configured GitHub username (cached, ~0ms)
1106
1136
  if not _validate_ssh_key_or_exit(config, live):
1107
1137
  return
1108
1138
 
1139
+ live.update(Spinner("dots", text="📡 Preparing reservation..."))
1140
+ reservation_mgr = ReservationManager(config)
1141
+
1109
1142
  # Track if user explicitly requests no persistent disk
1110
1143
  explicit_no_disk = explicit_no_disk_from_param
1111
1144
 
@@ -1217,11 +1250,6 @@ def reserve(
1217
1250
  rprint(f"[yellow]Use a different disk or wait for the reservation to end[/yellow]")
1218
1251
  return
1219
1252
 
1220
- live.update(
1221
- Spinner("dots", text="📡 Setting up reservation manager...")
1222
- )
1223
- reservation_mgr = ReservationManager(config)
1224
-
1225
1253
  # Submit reservation request
1226
1254
  live.update(
1227
1255
  Spinner("dots", text="📡 Submitting reservation request...")
@@ -1357,6 +1385,7 @@ def reserve(
1357
1385
  spot=spot,
1358
1386
  node_labels=node_labels if node_labels else None,
1359
1387
  trace=trace,
1388
+ fast_cache=fast_cache,
1360
1389
  )
1361
1390
  reservation_ids = [reservation_id] if reservation_id else None
1362
1391
 
@@ -2568,10 +2597,21 @@ def cancel(
2568
2597
  with Live(
2569
2598
  Spinner("dots", text="📡 Cancelling reservations..."), console=console
2570
2599
  ) as live:
2600
+ # Build east1 reservation manager for cross-region cancellations
2601
+ east1_mgr = None
2602
+ east1_env = Config.ENVIRONMENTS.get("prod-east1", {})
2603
+ if east1_env:
2604
+ import os as _os
2605
+ _east1_config = Config()
2606
+ _east1_config.aws_region = east1_env["region"]
2607
+ east1_mgr = ReservationManager(_east1_config)
2608
+
2571
2609
  for reservation in reservations:
2572
2610
  res_id = reservation.get("reservation_id", "")
2573
2611
  if res_id:
2574
- success = reservation_mgr.cancel_reservation(
2612
+ # Use east1 manager for east1 reservations
2613
+ mgr = east1_mgr if reservation.get("_region") in ("east1", "us-east-1") and east1_mgr else reservation_mgr
2614
+ success = mgr.cancel_reservation(
2575
2615
  res_id, user_info["user_id"]
2576
2616
  )
2577
2617
  if success:
@@ -2869,36 +2909,42 @@ def _show_availability() -> None:
2869
2909
  ) as live:
2870
2910
  config = load_config()
2871
2911
 
2872
- # Authenticate using AWS credentials
2912
+ # Authenticate and fetch availability (both regions in parallel)
2873
2913
  try:
2874
2914
  user_info = authenticate_user(config)
2875
2915
  reservation_mgr = ReservationManager(config)
2876
- availability_info = reservation_mgr.get_gpu_availability_by_type()
2916
+
2917
+ from concurrent.futures import ThreadPoolExecutor
2918
+ _env_name = config.user_config.get("environment", "prod")
2919
+ _east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
2920
+
2921
+ def _fetch_east1_spot():
2922
+ if _env_name != "prod" or not _east1_spot_types:
2923
+ return {}
2924
+ east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
2925
+ east1_table = config.session.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability")
2926
+ result = {}
2927
+ for item in east1_table.scan().get("Items", []):
2928
+ gt = item.get("gpu_type", "")
2929
+ if gt in _east1_spot_types:
2930
+ result[gt] = {
2931
+ "available": int(item.get("available_gpus", 0)),
2932
+ "total": int(item.get("total_gpus", 0)),
2933
+ "max_reservable": int(item.get("max_reservable", 0)),
2934
+ "spot_info": item.get("spot_info", {}),
2935
+ }
2936
+ return result
2937
+
2938
+ with ThreadPoolExecutor(max_workers=2) as ex:
2939
+ f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
2940
+ f_spot = ex.submit(_fetch_east1_spot)
2941
+ availability_info = f_avail.result()
2942
+ spot_region_info = f_spot.result()
2877
2943
  except RuntimeError as e:
2878
2944
  live.stop()
2879
2945
  rprint(f"[red]❌ {str(e)}[/red]")
2880
2946
  return
2881
2947
 
2882
- # Cross-region: fetch spot availability from prod-east1
2883
- spot_region_info = {}
2884
- _env_name = config.user_config.get("environment", "prod")
2885
- _east1_spot_types = frozenset(Config.ENVIRONMENTS.get("prod-east1", {}).get("spot_types", []))
2886
- if _env_name == "prod" and _east1_spot_types:
2887
- try:
2888
- import boto3 as _b3
2889
- east1_r = Config.ENVIRONMENTS["prod-east1"]["region"]
2890
- for item in _b3.resource("dynamodb", region_name=east1_r).Table("pytorch-gpu-dev-gpu-availability").scan().get("Items", []):
2891
- gt = item.get("gpu_type", "")
2892
- if gt in _east1_spot_types:
2893
- spot_region_info[gt] = {
2894
- "available": int(item.get("available_gpus", 0)),
2895
- "total": int(item.get("total_gpus", 0)),
2896
- "max_reservable": int(item.get("max_reservable", 0)),
2897
- "spot_info": item.get("spot_info", {}),
2898
- }
2899
- except Exception:
2900
- pass
2901
-
2902
2948
  if availability_info:
2903
2949
  # GPU architecture mapping (for display)
2904
2950
  gpu_architectures = {
@@ -3255,8 +3301,19 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3255
3301
  For VS Code Remote or manual SSH, use 'gpu-dev show' to see full SSH command.
3256
3302
  """
3257
3303
  import subprocess
3304
+ from pathlib import Path
3258
3305
 
3259
3306
  try:
3307
+ # Fast path: if reservation ID given, check local SSH config first (no network)
3308
+ if reservation_id:
3309
+ ssh_config_dir = Path.home() / ".gpu-dev"
3310
+ matches = list(ssh_config_dir.glob(f"{reservation_id}*-sshconfig")) if ssh_config_dir.exists() else []
3311
+ if matches:
3312
+ pod_name = f"gpu-dev-{reservation_id[:8]}"
3313
+ rprint(f"[cyan]Connecting to {pod_name}...[/cyan]\n")
3314
+ os.execvp("ssh", ["ssh", pod_name])
3315
+ return
3316
+
3260
3317
  with Live(
3261
3318
  Spinner("dots", text="📡 Fetching reservation details..."), console=console
3262
3319
  ) as live:
@@ -3301,21 +3358,30 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3301
3358
 
3302
3359
  live.start()
3303
3360
 
3304
- # If the selected reservation is from east1, switch to east1 reservation_mgr
3305
- _sel = next((r for r in (locals().get("reservations") or []) if r.get("reservation_id", "").startswith(reservation_id)), None)
3306
- if _sel and _sel.get("_region") == "us-east-1":
3307
- import os as _os
3308
- east1_cfg = Config.ENVIRONMENTS.get("prod-east1", {})
3309
- _os.environ["AWS_DEFAULT_REGION"] = east1_cfg["region"]
3310
- _east1_config = Config()
3311
- _east1_config.aws_region = east1_cfg["region"]
3312
- reservation_mgr = ReservationManager(_east1_config)
3313
-
3314
- # Get connection info
3361
+ # Try current region first, then cross-region if not found
3315
3362
  connection_info = reservation_mgr.get_connection_info(
3316
3363
  reservation_id, user_info["user_id"]
3317
3364
  )
3318
3365
 
3366
+ # If not found, try the other region
3367
+ if not connection_info:
3368
+ import os as _os
3369
+ current_env = config.user_config.get("environment", "prod")
3370
+ other_envs = {"prod": "prod-east1", "prod-east1": "prod"}
3371
+ other_env_name = other_envs.get(current_env)
3372
+ if other_env_name:
3373
+ other_env = Config.ENVIRONMENTS.get(other_env_name, {})
3374
+ if other_env:
3375
+ _os.environ["AWS_DEFAULT_REGION"] = other_env["region"]
3376
+ _other_config = Config()
3377
+ _other_config.aws_region = other_env["region"]
3378
+ other_mgr = ReservationManager(_other_config)
3379
+ connection_info = other_mgr.get_connection_info(
3380
+ reservation_id, user_info["user_id"]
3381
+ )
3382
+ if connection_info:
3383
+ reservation_mgr = other_mgr
3384
+
3319
3385
  live.stop()
3320
3386
 
3321
3387
  if not connection_info:
@@ -3864,7 +3930,7 @@ def set(key: str, value: str) -> None:
3864
3930
 
3865
3931
 
3866
3932
  @config.command()
3867
- @click.argument("env_name", type=click.Choice(["test", "prod", "prod-east1"]))
3933
+ @click.argument("env_name", type=click.Choice(["test", "prod"]))
3868
3934
  def environment(env_name: str) -> None:
3869
3935
  """Set the environment
3870
3936
 
@@ -3876,7 +3942,7 @@ def environment(env_name: str) -> None:
3876
3942
  \b
3877
3943
  Examples:
3878
3944
  gpu-dev config environment prod # Production (us-east-2)
3879
- gpu-dev config environment prod-east1 # Spot-only us-east-1
3945
+ gpu-dev config environment prod # Production (spot accessible via interactive picker)
3880
3946
  gpu-dev config environment test # Test (us-west-1)
3881
3947
 
3882
3948
  Environment configurations:
@@ -3,6 +3,7 @@
3
3
  import os
4
4
  import json
5
5
  import boto3
6
+ import botocore.exceptions
6
7
  from pathlib import Path
7
8
  from typing import Dict, Any, Optional
8
9
 
@@ -42,13 +43,14 @@ class Config:
42
43
  # Load unified config (handles migration from legacy files)
43
44
  self.user_config = self._load_config()
44
45
 
45
- # Get region from config, then AWS env vars, or default
46
- if self.user_config.get("region"):
46
+ # Get region: env vars take priority (for spot routing), then config, then default
47
+ env_region = os.getenv("AWS_REGION") or os.getenv("AWS_DEFAULT_REGION")
48
+ if env_region and env_region != self.user_config.get("region"):
49
+ self.aws_region = env_region
50
+ elif self.user_config.get("region"):
47
51
  self.aws_region = self.user_config["region"]
48
52
  else:
49
- self.aws_region = os.getenv(
50
- "AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-2")
51
- )
53
+ self.aws_region = "us-east-2"
52
54
 
53
55
  os.environ["AWS_DEFAULT_REGION"] = self.aws_region
54
56
 
@@ -71,17 +73,63 @@ class Config:
71
73
  self._sqs_client = None
72
74
  self._dynamodb = None
73
75
 
76
+ _CRED_CACHE = Path.home() / ".config" / "gpu-dev" / "aws-cred-cache.json"
77
+
74
78
  def _create_aws_session(self):
75
- """Create AWS session with profile support"""
76
- available_profiles = boto3.Session().available_profiles
77
- if "gpu-dev" in available_profiles:
78
- try:
79
- session = boto3.Session(profile_name="gpu-dev")
80
- session.get_credentials()
81
- return session
82
- except Exception:
83
- pass
84
- return boto3.Session()
79
+ """Create AWS session, caching resolved credentials to skip SSO resolution (~900ms)."""
80
+ import time as _time
81
+
82
+ # Try cached credentials first (avoids 900ms SSO resolution)
83
+ try:
84
+ if self._CRED_CACHE.exists():
85
+ cached = json.loads(self._CRED_CACHE.read_text())
86
+ if _time.time() < cached.get("expires", 0):
87
+ return boto3.Session(
88
+ aws_access_key_id=cached["access_key"],
89
+ aws_secret_access_key=cached["secret_key"],
90
+ aws_session_token=cached["token"],
91
+ region_name=self.aws_region,
92
+ )
93
+ except Exception:
94
+ pass
95
+
96
+ # Resolve credentials from SSO/profile (slow path, ~900ms)
97
+ try:
98
+ session = boto3.Session(profile_name="gpu-dev")
99
+ creds = session.get_credentials()
100
+ if not creds:
101
+ raise Exception("no credentials")
102
+ except Exception:
103
+ session = boto3.Session()
104
+ creds = session.get_credentials()
105
+
106
+ # Cache resolved credentials (safe — they're short-lived STS tokens)
107
+ try:
108
+ frozen = creds.get_frozen_credentials()
109
+ if frozen.token:
110
+ self._CRED_CACHE.parent.mkdir(parents=True, exist_ok=True)
111
+ self._CRED_CACHE.write_text(json.dumps({
112
+ "access_key": frozen.access_key,
113
+ "secret_key": frozen.secret_key,
114
+ "token": frozen.token,
115
+ "expires": _time.time() + 2700, # cache 45min (SSO tokens last ~1h)
116
+ }))
117
+ self._CRED_CACHE.chmod(0o600)
118
+ except Exception:
119
+ pass
120
+
121
+ return session
122
+
123
def refresh_session(self):
    """Discard the on-disk credential cache and rebuild the AWS session.

    Meant to be called after an ExpiredTokenException: the cache file is
    removed so that ``_create_aws_session`` cannot hand back the stale
    entry, ``self.session`` is replaced with a freshly created session, and
    the memoized STS/SQS/DynamoDB handles are reset to ``None``
    (presumably so the corresponding properties recreate them lazily from
    the new session -- confirm against the property implementations).
    """
    try:
        # Best effort: failing to delete the cache must not prevent the
        # session from being re-created.
        self._CRED_CACHE.unlink(missing_ok=True)
    except Exception:
        pass

    self.session = self._create_aws_session()

    # Drop handles that were built from the previous session.
    for handle_attr in ("_sts_client", "_sqs_client", "_dynamodb"):
        setattr(self, handle_attr, None)
85
133
 
86
134
  @property
87
135
  def sts_client(self):
@@ -355,8 +355,21 @@ def unlock_disk(disk_name: str, user_id: str, config: Config) -> bool:
355
355
  return False
356
356
 
357
357
  if not disk['in_use']:
358
- print(f"Disk '{disk_name}' is not locked")
359
- return False
358
+ # DDB says not locked — but check if EBS volume is still physically attached
359
+ try:
360
+ ec2 = config.session.client('ec2', region_name=config.aws_region)
361
+ vols = ec2.describe_volumes(Filters=[
362
+ {"Name": "tag:gpu-dev-user", "Values": [user_id]},
363
+ {"Name": "tag:disk_name", "Values": [disk_name]},
364
+ {"Name": "status", "Values": ["in-use"]},
365
+ ]).get("Volumes", [])
366
+ if not vols:
367
+ print(f"Disk '{disk_name}' is not locked")
368
+ return False
369
+ print(f"Disk '{disk_name}' DDB lock is clear but EBS volume is still attached — sending force-detach request")
370
+ except Exception:
371
+ print(f"Disk '{disk_name}' is not locked")
372
+ return False
360
373
 
361
374
  operation_id = str(uuid.uuid4())
362
375