gpu-dev 0.6.2__tar.gz → 0.6.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (152) hide show
  1. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/PKG-INFO +1 -1
  2. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +4 -0
  4. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +32 -8
  5. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/pyproject.toml +1 -1
  6. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/README.md +54 -1
  7. gpu_dev-0.6.4/sdk/python/examples/batch_multi_gpu.py +66 -0
  8. gpu_dev-0.6.4/sdk/python/examples/interactive_debug.py +54 -0
  9. gpu_dev-0.6.4/sdk/python/examples/run_tests.py +64 -0
  10. gpu_dev-0.6.4/sdk/python/examples/submit_job.py +38 -0
  11. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_backend/aws.py +21 -10
  12. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_sync/client.py +60 -0
  13. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_sync/sandbox.py +163 -0
  14. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/reservation_processor/index.py +46 -41
  15. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda.tf +2 -2
  16. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/.github/workflows/no-gitlinks.yml +0 -0
  17. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/.github/workflows/publish.yml +0 -0
  18. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/.gitignore +0 -0
  19. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/CLAUDE.md +0 -0
  20. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/README.md +0 -0
  21. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/admin/README.md +0 -0
  22. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/admin/generate_stats.py +0 -0
  23. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/admin/requirements.txt +0 -0
  24. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/README.md +0 -0
  25. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  26. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  27. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  28. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  29. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  30. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  31. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  32. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  33. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  34. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  35. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  36. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  37. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  38. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  39. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  40. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/docs/USER_GUIDE.md +0 -0
  41. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/docs/devgpu-features.html +0 -0
  42. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/docs/docker-mark-blue.svg +0 -0
  43. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/docs/icons8-cursor-ai.svg +0 -0
  44. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/examples/quickstart.ipynb +0 -0
  45. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/pyproject.toml +0 -0
  46. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/__init__.py +0 -0
  47. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
  48. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  49. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_backend/protocol.py +0 -0
  50. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  51. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  52. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
  53. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  54. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/common/config.py +0 -0
  55. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/common/enums.py +0 -0
  56. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/common/errors.py +0 -0
  57. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/common/models.py +0 -0
  58. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/src/gpu_dev/py.typed +0 -0
  59. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/tests/__init__.py +0 -0
  60. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/sdk/python/tests/test_models.py +0 -0
  61. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/setup.cfg +0 -0
  62. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  63. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  64. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/README.md +0 -0
  65. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/alb.tf +0 -0
  66. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/ami-baker.tf +0 -0
  67. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/availability.tf +0 -0
  68. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/backend.tf +0 -0
  69. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/check_b200.py +0 -0
  70. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  71. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  72. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  73. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  74. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  75. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/bash_profile +0 -0
  76. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/bashrc +0 -0
  77. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  78. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  79. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  80. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  81. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/motd_script +0 -0
  82. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  83. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/profile +0 -0
  84. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  85. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  86. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  87. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/shell_env +0 -0
  88. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/ssh_config +0 -0
  89. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/zprofile +0 -0
  90. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/zshrc +0 -0
  91. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  92. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker-build.tf +0 -0
  93. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  94. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  95. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/ecr.tf +0 -0
  96. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/efs.tf +0 -0
  97. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/eks.tf +0 -0
  98. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/expiry.tf +0 -0
  99. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/git-cache.tf +0 -0
  100. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  101. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/kubernetes.tf +0 -0
  102. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  103. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  104. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  105. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  106. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  107. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  108. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  109. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  110. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  111. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  112. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  113. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  114. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  115. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  116. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/list_b200.py +0 -0
  117. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/main.tf +0 -0
  118. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/mig-config.tf +0 -0
  119. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  120. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  121. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  122. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  123. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  124. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  125. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/monitoring.tf +0 -0
  126. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  127. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/outputs.tf +0 -0
  128. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/pyproject.toml +0 -0
  129. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/queue.tf +0 -0
  130. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/route53.tf +0 -0
  131. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  132. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  133. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  134. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  135. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  136. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  137. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  138. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  139. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  140. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  141. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  142. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/switch-to.sh +0 -0
  143. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  144. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  145. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
  146. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  147. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  148. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/terraform-gpu-devservers/variables.tf +0 -0
  149. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/tests/submit/README.md +0 -0
  150. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/tests/submit/fail/run.sh +0 -0
  151. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/tests/submit/multinode/run.sh +0 -0
  152. {gpu_dev-0.6.2 → gpu_dev-0.6.4}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.6.2
3
+ Version: 0.6.4
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.6.2
3
+ Version: 0.6.4
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -32,7 +32,11 @@ docs/docker-mark-blue.svg
32
32
  docs/icons8-cursor-ai.svg
33
33
  sdk/python/README.md
34
34
  sdk/python/pyproject.toml
35
+ sdk/python/examples/batch_multi_gpu.py
36
+ sdk/python/examples/interactive_debug.py
35
37
  sdk/python/examples/quickstart.ipynb
38
+ sdk/python/examples/run_tests.py
39
+ sdk/python/examples/submit_job.py
36
40
  sdk/python/src/gpu_dev/__init__.py
37
41
  sdk/python/src/gpu_dev/py.typed
38
42
  sdk/python/src/gpu_dev/_async/__init__.py
@@ -1192,8 +1192,10 @@ def reserve(
1192
1192
  # Build choices
1193
1193
  choices = []
1194
1194
 
1195
- # Get available disks (exclude in-use and deleted disks)
1196
- available_disks = [d for d in existing_disks if not d['in_use'] and not d.get('is_deleted', False)]
1195
+ # Show all non-deleted disks, marking in-use ones as disabled
1196
+ all_disks = [d for d in existing_disks if not d.get('is_deleted', False)]
1197
+ available_disks = [d for d in all_disks if not d['in_use']]
1198
+ in_use_disks = [d for d in all_disks if d['in_use']]
1197
1199
 
1198
1200
  if available_disks:
1199
1201
  choices.append(questionary.Separator("=== Available Disks ==="))
@@ -1204,6 +1206,17 @@ def reserve(
1204
1206
  value=("select", d['name'])
1205
1207
  ))
1206
1208
 
1209
+ if in_use_disks:
1210
+ choices.append(questionary.Separator("=== In Use ==="))
1211
+ for d in in_use_disks:
1212
+ res_id = d.get('reservation_id', '?')[:8]
1213
+ display = f"{d['name']} ({d['size_gb']}GB) — in use by {res_id}"
1214
+ choices.append(questionary.Choice(
1215
+ title=display,
1216
+ value=("in_use", d['name']),
1217
+ disabled="currently in use",
1218
+ ))
1219
+
1207
1220
  choices.append(questionary.Separator("=== Options ==="))
1208
1221
  choices.append(questionary.Choice(
1209
1222
  title="Create a new disk",
@@ -3307,12 +3320,21 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3307
3320
  # Fast path: if reservation ID given, check local SSH config first (no network)
3308
3321
  if reservation_id:
3309
3322
  ssh_config_dir = Path.home() / ".gpu-dev"
3310
- matches = list(ssh_config_dir.glob(f"{reservation_id}*-sshconfig")) if ssh_config_dir.exists() else []
3311
- if matches:
3312
- pod_name = f"gpu-dev-{reservation_id[:8]}"
3313
- rprint(f"[cyan]Connecting to {pod_name}...[/cyan]\n")
3314
- os.execvp("ssh", ["ssh", pod_name])
3315
- return
3323
+ config_file = ssh_config_dir / f"{reservation_id[:8]}-sshconfig"
3324
+ if config_file.exists():
3325
+ config_text = config_file.read_text()
3326
+ fqdn_line = [l.strip() for l in config_text.splitlines() if l.strip().startswith("HostName")]
3327
+ if fqdn_line:
3328
+ fqdn = fqdn_line[0].split(None, 1)[1]
3329
+ pod_name = f"gpu-dev-{reservation_id[:8]}"
3330
+ rprint(f"[cyan]Connecting to {pod_name}...[/cyan]\n")
3331
+ import subprocess, sys
3332
+ sys.exit(subprocess.call([
3333
+ "ssh", "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null",
3334
+ "-o", "ProxyCommand=gpu-dev-ssh-proxy %h %p",
3335
+ "-o", "ForwardAgent=yes",
3336
+ f"dev@{fqdn}",
3337
+ ]))
3316
3338
 
3317
3339
  with Live(
3318
3340
  Spinner("dots", text="📡 Fetching reservation details..."), console=console
@@ -3543,7 +3565,9 @@ def connect(ctx: click.Context, reservation_id: Optional[str]) -> None:
3543
3565
  except KeyboardInterrupt:
3544
3566
  rprint("\n[yellow]Connection cancelled by user[/yellow]")
3545
3567
  except Exception as e:
3568
+ import traceback
3546
3569
  rprint(f"[red]❌ Error: {str(e)}[/red]")
3570
+ traceback.print_exc()
3547
3571
 
3548
3572
 
3549
3573
  @main.command(name="get-ssh-config")
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.6.2"
7
+ version = "0.6.4"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -44,6 +44,23 @@ with client.reserve(gpu_type="t4") as sb:
44
44
  # reservation cancelled automatically
45
45
  ```
46
46
 
47
+ ## Progress Tracking
48
+
49
+ ```python
50
+ # Built-in progress logging
51
+ sandbox = client.reserve(gpu_type="h100", on_progress=True)
52
+ # [ 1.5s] pending
53
+ # [ 3.2s] preparing
54
+ # [ 8.1s] 🚀 Container running
55
+ # [ 22.4s] Ready
56
+
57
+ # Custom callback
58
+ sandbox = client.reserve(
59
+ gpu_type="h100",
60
+ on_progress=lambda msg, t: print(f"⏳ [{t:.0f}s] {msg}")
61
+ )
62
+ ```
63
+
47
64
  ## Available GPU Types
48
65
 
49
66
  | Type | GPUs/node | Architecture |
@@ -75,6 +92,7 @@ client = GpuDev(GpuDevConfig(github_user="octocat")) # Explicit config
75
92
  | `list(status=[...])` | List reservations as `Sandbox` objects |
76
93
  | `availability()` | GPU availability by type |
77
94
  | `disks()` | List persistent disks |
95
+ | `search_logs(reservation_id)` | Get processing logs for any reservation |
78
96
 
79
97
  ### `Sandbox` — Reserved Environment
80
98
 
@@ -82,6 +100,8 @@ client = GpuDev(GpuDevConfig(github_user="octocat")) # Explicit config
82
100
  sandbox = client.reserve(gpu_type="h100")
83
101
  ```
84
102
 
103
+ **Methods:**
104
+
85
105
  | Method | Description |
86
106
  |--------|-------------|
87
107
  | `exec(command, timeout=None)` | Run shell command, returns `ExecResult` |
@@ -91,7 +111,11 @@ sandbox = client.reserve(gpu_type="h100")
91
111
  | `extend(hours)` | Extend duration |
92
112
  | `refresh()` | Refresh status from server |
93
113
  | `add_user(github_username)` | Grant SSH access to another user |
94
- | `wait_until_ready(timeout_minutes)` | Block until active |
114
+ | `wait_until_ready(timeout, on_progress)` | Block until active |
115
+ | `logs()` | Get reservation processing log |
116
+ | `pod_logs(lines=50)` | Get container stdout via SSH |
117
+
118
+ **Properties:**
95
119
 
96
120
  | Property | Description |
97
121
  |----------|-------------|
@@ -101,8 +125,15 @@ sandbox = client.reserve(gpu_type="h100")
101
125
  | `gpu_count` | Number of GPUs |
102
126
  | `ssh_command` | SSH command string |
103
127
  | `pod_name` | SSH hostname |
128
+ | `fqdn` | Fully-qualified domain name |
104
129
  | `is_active` | Whether ready for commands |
105
130
  | `expires_at` | Expiration time |
131
+ | `disk_name` | Attached persistent disk |
132
+ | `instance_type` | EC2 instance type |
133
+ | `created_at` | Creation timestamp |
134
+ | `node_ip` | Node public IP |
135
+ | `detailed_status` | Detailed status message |
136
+ | `user_id` | Owner's user ID |
106
137
 
107
138
  ### `ExecResult`
108
139
 
@@ -113,6 +144,21 @@ result.stdout # "hello\n"
113
144
  result.stderr # ""
114
145
  ```
115
146
 
147
+ ## Logs & Debugging
148
+
149
+ ```python
150
+ # Reservation processing log (what happened during setup)
151
+ for entry in sandbox.logs():
152
+ print(f"[{entry['timestamp'][11:23]}] {entry['message']}")
153
+
154
+ # Look up logs for any reservation by ID prefix
155
+ for entry in client.search_logs("abc12345"):
156
+ print(f"[{entry['timestamp'][11:23]}] {entry['message']}")
157
+
158
+ # Container stdout (via SSH)
159
+ print(sandbox.pod_logs(lines=20))
160
+ ```
161
+
116
162
  ## Spot Instances
117
163
 
118
164
  Use spot instances for lower cost (may be preempted):
@@ -129,6 +175,7 @@ Data persists across reservations when using named disks:
129
175
  # First session
130
176
  sb = client.reserve(gpu_type="h100", disk_name="my-project")
131
177
  sb.exec("pip install torch && echo done")
178
+ sb.cancel()
132
179
 
133
180
  # Later session — packages still installed
134
181
  sb = client.reserve(gpu_type="h100", disk_name="my-project")
@@ -185,3 +232,9 @@ except GpuDevValidationError as e:
185
232
  except GpuDevTimeoutError:
186
233
  print("Reservation timed out — GPUs may be busy")
187
234
  ```
235
+
236
+ Credentials are cached to disk (45-min TTL) and auto-refreshed on expiry — no manual re-auth needed in long-running notebooks.
237
+
238
+ ## Interactive Notebook
239
+
240
+ See [examples/quickstart.ipynb](examples/quickstart.ipynb) for a hands-on walkthrough.
@@ -0,0 +1,66 @@
1
+ """Run the same job across multiple GPU types and compare results.
2
+
3
+ Useful for benchmarking or testing compatibility across hardware.
4
+
5
+ Usage:
6
+ python batch_multi_gpu.py
7
+ """
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+ from gpu_dev import GpuDev, GpuDevError
11
+
12
+ client = GpuDev()
13
+
14
+ BENCHMARK_CMD = """
15
+ python3 -c '
16
+ import torch, time
17
+ gpu = torch.cuda.get_device_name(0)
18
+ x = torch.randn(4096, 4096, device="cuda")
19
+ torch.cuda.synchronize()
20
+ t0 = time.time()
21
+ for _ in range(100):
22
+ y = x @ x
23
+ torch.cuda.synchronize()
24
+ ms = (time.time() - t0) * 1000
25
+ print(f"{gpu}|{ms:.0f}")
26
+ '
27
+ """
28
+
29
+ GPU_TYPES = ["t4", "l4", "rtxpro6000"]
30
+
31
+
32
+ def run_benchmark(gpu_type: str) -> dict:
33
+ try:
34
+ sb = client.reserve(
35
+ gpu_type=gpu_type,
36
+ gpu_count=1,
37
+ hours=0.25,
38
+ name=f"bench-{gpu_type}",
39
+ )
40
+ result = sb.exec(BENCHMARK_CMD.strip(), timeout=30)
41
+ sb.cancel()
42
+
43
+ if result.exit_code == 0 and "|" in result.stdout:
44
+ gpu_name, ms = result.stdout.strip().split("|")
45
+ return {"gpu_type": gpu_type, "gpu_name": gpu_name, "ms": float(ms), "ok": True}
46
+ return {"gpu_type": gpu_type, "error": result.stderr or result.stdout, "ok": False}
47
+ except GpuDevError as e:
48
+ return {"gpu_type": gpu_type, "error": str(e), "ok": False}
49
+
50
+
51
+ print(f"Benchmarking matmul 4096x4096 x100 across {len(GPU_TYPES)} GPU types...\n")
52
+
53
+ # Run in parallel
54
+ with ThreadPoolExecutor(max_workers=len(GPU_TYPES)) as ex:
55
+ futures = {ex.submit(run_benchmark, gt): gt for gt in GPU_TYPES}
56
+
57
+ print(f"{'GPU Type':15s} {'GPU Name':30s} {'Time':>8s}")
58
+ print("-" * 55)
59
+ for future in as_completed(futures):
60
+ r = future.result()
61
+ if r["ok"]:
62
+ print(f"{r['gpu_type']:15s} {r['gpu_name']:30s} {r['ms']:>7.0f}ms")
63
+ else:
64
+ print(f"{r['gpu_type']:15s} FAILED: {r['error'][:40]}")
65
+
66
+ print("\nDone")
@@ -0,0 +1,54 @@
1
+ """Interactive debugging: reserve a GPU, poke around, inspect logs.
2
+
3
+ Use this in a Python REPL or Jupyter notebook for ad-hoc debugging.
4
+
5
+ from gpu_dev import GpuDev
6
+ client = GpuDev()
7
+ exec(open("examples/interactive_debug.py").read())
8
+ """
9
+ from gpu_dev import GpuDev
10
+
11
+ client = GpuDev()
12
+
13
+ # Show what's available
14
+ print("GPU Availability:")
15
+ for gpu, info in sorted(client.availability().items()):
16
+ if info.total > 0:
17
+ print(f" {gpu:15s} {info.available:>3d}/{info.total} free")
18
+
19
+ # Show active reservations
20
+ print("\nActive reservations:")
21
+ for sb in client.list():
22
+ print(f" {sb.id[:8]} {sb.gpu_count}x {sb.gpu_type:10s} {sb.status.value:10s} disk={sb.disk_name or '-'}")
23
+
24
+ # Show disks
25
+ print("\nDisks:")
26
+ for d in client.disks():
27
+ status = "IN USE" if d.in_use else "free"
28
+ print(f" {d.name:20s} {d.snapshot_count:>3d} snapshots {status}")
29
+
30
+ # Reconnect to most recent active reservation
31
+ active = client.list(status=["active"])
32
+ if active:
33
+ sb = active[0]
34
+ print(f"\nReconnected to {sb.id[:8]} ({sb.gpu_count}x {sb.gpu_type})")
35
+ print(f" SSH: ssh {sb.pod_name}")
36
+ print(f" Disk: {sb.disk_name}")
37
+ print(f" Expires: {sb.expires_at}")
38
+
39
+ # Quick health check
40
+ result = sb.exec("nvidia-smi -L 2>&1 | head -4", timeout=5)
41
+ if result.exit_code == 0:
42
+ print(f" GPU: {result.stdout.strip()}")
43
+ else:
44
+ print(f" GPU check failed (exit {result.exit_code})")
45
+
46
+ # Show setup logs
47
+ print(f"\n Setup log:")
48
+ for entry in sb.logs():
49
+ print(f" [{entry['timestamp'][11:19]}] {entry['message'][:70]}")
50
+ else:
51
+ print("\nNo active reservations")
52
+
53
+ # Look up a past reservation's logs
54
+ # client.search_logs("abc12345")
@@ -0,0 +1,64 @@
1
+ """Run tests on a GPU server with a persistent disk snapshot.
2
+
3
+ Loads a pre-configured environment from a named disk and runs
4
+ a test suite — useful for CI or interactive debugging.
5
+
6
+ Usage:
7
+ python run_tests.py
8
+ python run_tests.py --branch feature/my-fix
9
+ """
10
+ import sys
11
+
12
+ from gpu_dev import GpuDev, GpuDevTimeoutError
13
+
14
+ branch = sys.argv[1] if len(sys.argv) > 1 else "main"
15
+ client = GpuDev()
16
+
17
+ print(f"Reserving H100 with 'pytorch-dev' disk (branch: {branch})...")
18
+
19
+ try:
20
+ sb = client.reserve(
21
+ gpu_type="h100",
22
+ gpu_count=1,
23
+ hours=2,
24
+ disk_name="pytorch-dev", # pre-compiled PyTorch environment
25
+ name=f"test-{branch[:20]}",
26
+ on_progress=True,
27
+ )
28
+ except GpuDevTimeoutError:
29
+ print("No GPU capacity available — try again later or use spot")
30
+ sys.exit(1)
31
+
32
+ print(f"\nRunning on {sb.pod_name} ({sb.instance_type})")
33
+
34
+ # Pull latest code
35
+ result = sb.exec(f"""
36
+ cd /home/dev/pytorch && \
37
+ git fetch origin && \
38
+ git checkout {branch} && \
39
+ git pull origin {branch}
40
+ """, timeout=120)
41
+ print(result.stdout[-200:] if result.stdout else "(no output)")
42
+
43
+ if result.exit_code != 0:
44
+ print(f"Git checkout failed: {result.stderr}")
45
+ sb.cancel()
46
+ sys.exit(1)
47
+
48
+ # Run tests
49
+ print(f"\nRunning tests on {branch}...")
50
+ result = sb.exec(
51
+ "cd /home/dev/pytorch && python test/run_test.py test_torch 2>&1 | tail -30",
52
+ timeout=1800,
53
+ )
54
+ print(result.stdout)
55
+
56
+ # Show timing from reservation logs
57
+ print("\nReservation timeline:")
58
+ for entry in sb.logs():
59
+ print(f" [{entry['timestamp'][11:23]}] {entry['message'][:80]}")
60
+
61
+ exit_code = result.exit_code
62
+ sb.cancel()
63
+ print(f"\nTests {'PASSED' if exit_code == 0 else 'FAILED'} (exit {exit_code})")
64
+ sys.exit(exit_code)
@@ -0,0 +1,38 @@
1
+ """Submit a training job to a GPU server and wait for results.
2
+
3
+ Usage:
4
+ python submit_job.py
5
+ """
6
+ from gpu_dev import GpuDev
7
+
8
+ client = GpuDev()
9
+
10
+ # Reserve a T4 GPU, auto-cancel when done
11
+ with client.reserve(gpu_type="t4", hours=1, name="training-job", on_progress=True) as sb:
12
+ print(f"\nReserved: {sb.id[:8]} on {sb.instance_type}")
13
+ print(f"SSH: {sb.ssh_command}\n")
14
+
15
+ # Upload training script
16
+ sb.upload("./train.py", "/home/dev/train.py")
17
+
18
+ # Run training
19
+ print("Starting training...")
20
+ result = sb.exec("cd /home/dev && python train.py 2>&1", timeout=600)
21
+ print(result.stdout)
22
+
23
+ if result.exit_code != 0:
24
+ print(f"Training failed (exit {result.exit_code})")
25
+ print(result.stderr)
26
+ else:
27
+ # Download results
28
+ sb.download("/home/dev/output/", "./results/")
29
+ print("Results downloaded to ./results/")
30
+
31
+ # Check logs if something went wrong
32
+ if result.exit_code != 0:
33
+ print("\nReservation logs:")
34
+ for entry in sb.logs("error"):
35
+ print(f" [{entry['timestamp'][11:23]}] {entry['message']}")
36
+
37
+ # Reservation auto-cancelled
38
+ print("Done — reservation cleaned up")
@@ -30,16 +30,19 @@ _PREFIX = "pytorch-gpu-dev"
30
30
  _CRED_CACHE_PATH = Path.home() / ".config" / "gpu-dev" / "aws-cred-cache.json"
31
31
  _CRED_CACHE_TTL = 2700 # 45 min (SSO session tokens typically last 1h)
32
32
 
33
- # Module-level session cache reused across AwsBackend instances in the same process
33
+ # Module-level session cache with expiry tracking
34
34
  _cached_session: boto3.Session | None = None
35
+ _cached_session_expires: float = 0
35
36
 
36
37
 
37
38
  def _get_session() -> boto3.Session:
38
39
  """Get a boto3 session with disk-cached credentials (saves ~900ms SSO resolution)."""
39
- global _cached_session
40
- if _cached_session is not None:
40
+ global _cached_session, _cached_session_expires
41
+ if _cached_session is not None and time.time() < _cached_session_expires:
41
42
  return _cached_session
42
43
 
44
+ _cached_session = None
45
+
43
46
  # Try disk-cached credentials
44
47
  try:
45
48
  if _CRED_CACHE_PATH.exists():
@@ -50,6 +53,7 @@ def _get_session() -> boto3.Session:
50
53
  aws_secret_access_key=cached["secret_key"],
51
54
  aws_session_token=cached["token"],
52
55
  )
56
+ _cached_session_expires = cached["expires"]
53
57
  return _cached_session
54
58
  except Exception:
55
59
  pass
@@ -80,6 +84,7 @@ def _get_session() -> boto3.Session:
80
84
  pass
81
85
 
82
86
  _cached_session = session
87
+ _cached_session_expires = time.time() + _CRED_CACHE_TTL
83
88
  return session
84
89
 
85
90
 
@@ -100,11 +105,13 @@ class AwsBackend:
100
105
  self._reservations = self._ddb.Table(f"{_PREFIX}-reservations")
101
106
  self._availability = self._ddb.Table(f"{_PREFIX}-gpu-availability")
102
107
  self._disks = self._ddb.Table(f"{_PREFIX}-disks")
108
+ self._queue_url: str | None = None
103
109
 
104
110
  def _refresh_on_expired(self) -> None:
105
111
  """Clear cached session and reinitialize clients."""
106
- global _cached_session
112
+ global _cached_session, _cached_session_expires
107
113
  _cached_session = None
114
+ _cached_session_expires = 0
108
115
  try:
109
116
  _CRED_CACHE_PATH.unlink(missing_ok=True)
110
117
  except Exception:
@@ -182,13 +189,17 @@ class AwsBackend:
182
189
  return self._item_to_info(item)
183
190
  return None
184
191
 
185
- resp = self._reservations.query(
186
- IndexName="UserIndex",
187
- KeyConditionExpression="user_id = :uid",
188
- FilterExpression="begins_with(reservation_id, :rid)",
189
- ExpressionAttributeValues={":uid": user_id, ":rid": reservation_id},
190
- )
192
+ query_kwargs = {
193
+ "IndexName": "UserIndex",
194
+ "KeyConditionExpression": "user_id = :uid",
195
+ "FilterExpression": "begins_with(reservation_id, :rid)",
196
+ "ExpressionAttributeValues": {":uid": user_id, ":rid": reservation_id},
197
+ }
198
+ resp = self._reservations.query(**query_kwargs)
191
199
  items = resp.get("Items", [])
200
+ while not items and "LastEvaluatedKey" in resp:
201
+ resp = self._reservations.query(**query_kwargs, ExclusiveStartKey=resp["LastEvaluatedKey"])
202
+ items = resp.get("Items", [])
192
203
  if len(items) == 1:
193
204
  return self._item_to_info(items[0])
194
205
  return None
@@ -243,3 +243,63 @@ class GpuDev:
243
243
  """
244
244
  user_info = self._auth()
245
245
  return self._backend.list_disks(user_info["user_id"])
246
+
247
+ def search_logs(
248
+ self,
249
+ reservation_id: str,
250
+ ) -> list[dict[str, str]]:
251
+ """Get status history for any reservation by ID.
252
+
253
+ Args:
254
+ reservation_id: Full or prefix (8+ chars) reservation ID.
255
+
256
+ Returns:
257
+ List of ``{"timestamp": "...", "message": "..."}`` dicts.
258
+
259
+ Example::
260
+
261
+ for entry in client.search_logs("abc12345"):
262
+ print(f"[{entry['timestamp']}] {entry['message']}")
263
+ """
264
+ from .._backend.aws import _get_session, _PREFIX
265
+
266
+ session = _get_session()
267
+ region = getattr(self._backend, "_region", "us-east-2")
268
+ ddb = session.resource("dynamodb", region_name=region)
269
+ table = ddb.Table(f"{_PREFIX}-reservations")
270
+
271
+ # Try direct lookup first, then query UserIndex by prefix
272
+ try:
273
+ user_info = self._auth()
274
+ if len(reservation_id) >= 32:
275
+ resp = table.get_item(Key={"reservation_id": reservation_id})
276
+ item = resp.get("Item")
277
+ else:
278
+ query_kwargs = {
279
+ "IndexName": "UserIndex",
280
+ "KeyConditionExpression": "user_id = :uid",
281
+ "FilterExpression": "begins_with(reservation_id, :rid)",
282
+ "ExpressionAttributeValues": {
283
+ ":uid": user_info["user_id"],
284
+ ":rid": reservation_id,
285
+ },
286
+ }
287
+ item = None
288
+ resp = table.query(**query_kwargs)
289
+ if resp.get("Items"):
290
+ item = resp["Items"][0]
291
+ else:
292
+ while "LastEvaluatedKey" in resp and not item:
293
+ resp = table.query(**query_kwargs, ExclusiveStartKey=resp["LastEvaluatedKey"])
294
+ if resp.get("Items"):
295
+ item = resp["Items"][0]
296
+
297
+ if not item:
298
+ return []
299
+ history = item.get("status_history", [])
300
+ return [
301
+ {"timestamp": str(e.get("timestamp", "")), "message": str(e.get("message", ""))}
302
+ for e in history
303
+ ]
304
+ except Exception:
305
+ return []