gpu-dev 0.6.4__tar.gz → 0.6.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/PKG-INFO +1 -1
  2. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
  4. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +35 -11
  5. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/pyproject.toml +1 -1
  6. gpu_dev-0.6.6/sdk/python/examples/parallel_experiments.ipynb +360 -0
  7. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/pyproject.toml +1 -1
  8. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/__init__.py +1 -1
  9. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_backend/aws.py +34 -1
  10. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_backend/protocol.py +8 -0
  11. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_sync/client.py +36 -0
  12. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/availability.tf +1 -1
  13. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/expiry.tf +1 -0
  14. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/availability_updater/index.py +8 -7
  15. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +1 -0
  16. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/reservation_processor/index.py +27 -13
  17. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/dns_utils.py +1 -2
  18. gpu_dev-0.6.6/terraform-gpu-devservers/lambda/shared/requirements.txt +3 -0
  19. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda.tf +2 -2
  20. gpu_dev-0.6.4/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -2
  21. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/.github/workflows/no-gitlinks.yml +0 -0
  22. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/.github/workflows/publish.yml +0 -0
  23. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/.gitignore +0 -0
  24. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/CLAUDE.md +0 -0
  25. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/README.md +0 -0
  26. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/admin/README.md +0 -0
  27. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/admin/generate_stats.py +0 -0
  28. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/admin/requirements.txt +0 -0
  29. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/README.md +0 -0
  30. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  31. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  32. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  33. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  34. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  35. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  36. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  37. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
  38. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  39. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
  40. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  41. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  42. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  43. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  44. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  45. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/docs/USER_GUIDE.md +0 -0
  46. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/docs/devgpu-features.html +0 -0
  47. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/docs/docker-mark-blue.svg +0 -0
  48. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/docs/icons8-cursor-ai.svg +0 -0
  49. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/README.md +0 -0
  50. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/examples/batch_multi_gpu.py +0 -0
  51. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/examples/interactive_debug.py +0 -0
  52. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/examples/quickstart.ipynb +0 -0
  53. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/examples/run_tests.py +0 -0
  54. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/examples/submit_job.py +0 -0
  55. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_async/__init__.py +0 -0
  56. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_backend/__init__.py +0 -0
  57. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_sync/__init__.py +0 -0
  58. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_sync/sandbox.py +0 -0
  59. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_transport/__init__.py +0 -0
  60. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/_transport/ssh.py +0 -0
  61. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/__init__.py +0 -0
  62. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/config.py +0 -0
  63. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/enums.py +0 -0
  64. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/errors.py +0 -0
  65. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/common/models.py +0 -0
  66. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/src/gpu_dev/py.typed +0 -0
  67. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/tests/__init__.py +0 -0
  68. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/sdk/python/tests/test_models.py +0 -0
  69. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/setup.cfg +0 -0
  70. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  71. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  72. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/README.md +0 -0
  73. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/alb.tf +0 -0
  74. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/ami-baker.tf +0 -0
  75. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/backend.tf +0 -0
  76. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/check_b200.py +0 -0
  77. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/cluster-autoscaler.tf +0 -0
  78. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/cmd_proxy.py +0 -0
  79. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  80. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  81. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  82. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/bash_profile +0 -0
  83. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/bashrc +0 -0
  84. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  85. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  86. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  87. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  88. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/motd_script +0 -0
  89. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  90. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/profile +0 -0
  91. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  92. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  93. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  94. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/shell_env +0 -0
  95. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/ssh_config +0 -0
  96. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/zprofile +0 -0
  97. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/zshrc +0 -0
  98. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  99. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker-build.tf +0 -0
  100. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  101. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  102. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/ecr.tf +0 -0
  103. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/efs.tf +0 -0
  104. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/eks.tf +0 -0
  105. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/git-cache.tf +0 -0
  106. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/gpu-dev-pod-irsa.tf +0 -0
  107. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/kubernetes.tf +0 -0
  108. {gpu_dev-0.6.4/terraform-gpu-devservers/lambda/reservation_expiry → gpu_dev-0.6.6/terraform-gpu-devservers/lambda/availability_updater}/requirements.txt +0 -0
  109. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  110. {gpu_dev-0.6.4/terraform-gpu-devservers/lambda/reservation_processor → gpu_dev-0.6.6/terraform-gpu-devservers/lambda/reservation_expiry}/requirements.txt +0 -0
  111. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  112. {gpu_dev-0.6.4/terraform-gpu-devservers/lambda/shared → gpu_dev-0.6.6/terraform-gpu-devservers/lambda/reservation_processor}/requirements.txt +0 -0
  113. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  114. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  115. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  116. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  117. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  118. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/list_b200.py +0 -0
  119. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/main.tf +0 -0
  120. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/mig-config.tf +0 -0
  121. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/mig-parted-config.yaml +0 -0
  122. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  123. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  124. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  125. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  126. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  127. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/monitoring.tf +0 -0
  128. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/node-termination-handler.tf +0 -0
  129. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/outputs.tf +0 -0
  130. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/pyproject.toml +0 -0
  131. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/queue.tf +0 -0
  132. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/route53.tf +0 -0
  133. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  134. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  135. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  136. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  137. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  138. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  139. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  140. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  141. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  142. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  143. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/subnet-0fe3a2c45570091ad +0 -0
  144. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/switch-to.sh +0 -0
  145. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  146. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  147. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/ami-baker-user-data.sh +0 -0
  148. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  149. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  150. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/terraform-gpu-devservers/variables.tf +0 -0
  151. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/tests/submit/README.md +0 -0
  152. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/tests/submit/fail/run.sh +0 -0
  153. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/tests/submit/multinode/run.sh +0 -0
  154. {gpu_dev-0.6.4 → gpu_dev-0.6.6}/tests/submit/success/run.sh +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.6.4
3
+ Version: 0.6.6
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.6.4
3
+ Version: 0.6.6
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -34,6 +34,7 @@ sdk/python/README.md
34
34
  sdk/python/pyproject.toml
35
35
  sdk/python/examples/batch_multi_gpu.py
36
36
  sdk/python/examples/interactive_debug.py
37
+ sdk/python/examples/parallel_experiments.ipynb
37
38
  sdk/python/examples/quickstart.ipynb
38
39
  sdk/python/examples/run_tests.py
39
40
  sdk/python/examples/submit_job.py
@@ -154,12 +154,8 @@ def list_disks(user_id: str, config: Config) -> List[Dict]:
154
154
  List all disks for a user.
155
155
  Returns list of disk info dicts with: name, size, last_used, created_at, snapshot_count, in_use, reservation_id
156
156
  """
157
- ec2_client = get_ec2_client(config)
158
- dynamodb = get_dynamodb_resource(config)
159
-
160
- # Query DynamoDB disks table for this user's disks (with pagination)
161
- disks_table_name = config.disks_table if hasattr(config, 'disks_table') else f"{config.queue_name.rsplit('-', 1)[0]}-disks"
162
- disks_table = dynamodb.Table(disks_table_name)
157
+ dynamodb = config.dynamodb
158
+ disks_table = dynamodb.Table(config.disks_table)
163
159
 
164
160
  dynamodb_disks = []
165
161
  response = disks_table.query(
@@ -208,9 +204,6 @@ def list_disks(user_id: str, config: Config) -> List[Dict]:
208
204
  is_deleted = disk_item.get('is_deleted', False)
209
205
  delete_date = disk_item.get('delete_date')
210
206
 
211
- # Check current in_use status (check dynamically from reservations table)
212
- is_in_use, reservation_id = get_disk_in_use_status(disk_name, user_id, config)
213
-
214
207
  disks.append({
215
208
  'name': disk_name,
216
209
  'size_gb': size_gb,
@@ -219,13 +212,44 @@ def list_disks(user_id: str, config: Config) -> List[Dict]:
219
212
  'last_used': last_used,
220
213
  'snapshot_count': snapshot_count,
221
214
  'pending_snapshot_count': pending_snapshot_count,
222
- 'in_use': is_in_use,
215
+ 'in_use': bool(disk_item.get('in_use', False)),
223
216
  'is_backing_up': is_backing_up,
224
- 'reservation_id': reservation_id,
217
+ 'reservation_id': str(disk_item.get('attached_to_reservation', '')) or None,
225
218
  'is_deleted': is_deleted,
226
219
  'delete_date': delete_date,
227
220
  })
228
221
 
222
+ # Batch check: find all active reservations with disk_name set (single query)
223
+ try:
224
+ reservations_table = dynamodb.Table(config.reservations_table)
225
+ active_disks = {}
226
+ for status in ["active", "preparing", "queued", "pending"]:
227
+ resp = reservations_table.query(
228
+ IndexName="UserStatusIndex",
229
+ KeyConditionExpression="user_id = :uid AND #s = :status",
230
+ ExpressionAttributeNames={"#s": "status"},
231
+ ExpressionAttributeValues={":uid": user_id, ":status": status},
232
+ ProjectionExpression="reservation_id, disk_name",
233
+ )
234
+ for item in resp.get("Items", []):
235
+ dn = item.get("disk_name")
236
+ if dn:
237
+ active_disks[dn] = str(item.get("reservation_id", ""))[:8]
238
+
239
+ for disk in disks:
240
+ if disk["name"] in active_disks:
241
+ disk["in_use"] = True
242
+ disk["reservation_id"] = active_disks[disk["name"]]
243
+ except Exception:
244
+ pass
245
+
246
+ # Filter out expired deleted disks (delete_date has passed)
247
+ today = datetime.now(timezone.utc).strftime('%Y-%m-%d')
248
+ disks = [
249
+ d for d in disks
250
+ if not (d.get('is_deleted') and d.get('delete_date') and str(d['delete_date']) <= today)
251
+ ]
252
+
229
253
  # Sort by last_used (most recent first)
230
254
  disks.sort(key=lambda d: d['last_used'] or datetime.min.replace(tzinfo=timezone.utc), reverse=True)
231
255
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.6.4"
7
+ version = "0.6.6"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -0,0 +1,360 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Parallel Experiments with Persistent Disks\n",
8
+ "\n",
9
+ "This notebook demonstrates:\n",
10
+ "1. Creating a base environment on a persistent disk\n",
11
+ "2. Making changes (installing packages, modifying code)\n",
12
+ "3. Cloning the disk for parallel experiments\n",
13
+ "4. Running two experiments simultaneously on different GPUs\n",
14
+ "5. Comparing results and measuring timings"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "%pip install -e .. -q"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": null,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "import time\n",
33
+ "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
34
+ "from gpu_dev import GpuDev\n",
35
+ "\n",
36
+ "client = GpuDev()\n",
37
+ "print(f\"SDK v{__import__('gpu_dev').__version__}\")"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "markdown",
42
+ "metadata": {},
43
+ "source": [
44
+ "## Step 1: Create Base Environment on Persistent Disk\n",
45
+ "\n",
46
+ "Reserve a GPU with a persistent disk and set up the base experiment."
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "t0 = time.time()\n",
56
+ "\n",
57
+ "base = client.reserve(\n",
58
+ " gpu_type=\"t4\",\n",
59
+ " gpu_count=1,\n",
60
+ " hours=1,\n",
61
+ " disk_name=\"experiment-base\",\n",
62
+ " name=\"base-setup\",\n",
63
+ ")\n",
64
+ "\n",
65
+ "reserve_time = time.time() - t0\n",
66
+ "print(f\"Reserved in {reserve_time:.1f}s\")\n",
67
+ "print(f\"Disk: {base.disk_name}\")\n",
68
+ "print(f\"GPU: {base.gpu_type} x{base.gpu_count}\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "# Set up the base experiment: install packages + write training script\n",
78
+ "base.exec(\"pip install -q wandb timm\")\n",
79
+ "\n",
80
+ "# Write a parameterized training script\n",
81
+ "base.exec(r\"\"\"\n",
82
+ "cat > /home/dev/train.py << 'SCRIPT'\n",
83
+ "import torch\n",
84
+ "import torch.nn as nn\n",
85
+ "import time\n",
86
+ "import json\n",
87
+ "import os\n",
88
+ "import sys\n",
89
+ "\n",
90
+ "# Read experiment config from env\n",
91
+ "LR = float(os.environ.get('LR', '0.001'))\n",
92
+ "BATCH_SIZE = int(os.environ.get('BATCH_SIZE', '64'))\n",
93
+ "EPOCHS = int(os.environ.get('EPOCHS', '5'))\n",
94
+ "EXP_NAME = os.environ.get('EXP_NAME', 'default')\n",
95
+ "\n",
96
+ "print(f\"Experiment: {EXP_NAME}\")\n",
97
+ "print(f\"Config: lr={LR}, batch_size={BATCH_SIZE}, epochs={EPOCHS}\")\n",
98
+ "print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
99
+ "print(f\"PyTorch: {torch.__version__}\")\n",
100
+ "\n",
101
+ "# Simple CNN on synthetic data\n",
102
+ "model = nn.Sequential(\n",
103
+ " nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),\n",
104
+ " nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),\n",
105
+ " nn.Flatten(), nn.Linear(64 * 8 * 8, 10)\n",
106
+ ").cuda()\n",
107
+ "\n",
108
+ "optimizer = torch.optim.Adam(model.parameters(), lr=LR)\n",
109
+ "criterion = nn.CrossEntropyLoss()\n",
110
+ "\n",
111
+ "results = {'experiment': EXP_NAME, 'lr': LR, 'batch_size': BATCH_SIZE, 'losses': [], 'epoch_times': []}\n",
112
+ "\n",
113
+ "for epoch in range(EPOCHS):\n",
114
+ " t_start = time.time()\n",
115
+ " epoch_loss = 0\n",
116
+ " for step in range(50):\n",
117
+ " x = torch.randn(BATCH_SIZE, 3, 32, 32, device='cuda')\n",
118
+ " y = torch.randint(0, 10, (BATCH_SIZE,), device='cuda')\n",
119
+ " loss = criterion(model(x), y)\n",
120
+ " optimizer.zero_grad()\n",
121
+ " loss.backward()\n",
122
+ " optimizer.step()\n",
123
+ " epoch_loss += loss.item()\n",
124
+ " avg_loss = epoch_loss / 50\n",
125
+ " epoch_time = time.time() - t_start\n",
126
+ " results['losses'].append(avg_loss)\n",
127
+ " results['epoch_times'].append(epoch_time)\n",
128
+ " print(f\" Epoch {epoch+1}/{EPOCHS}: loss={avg_loss:.4f} ({epoch_time:.2f}s)\")\n",
129
+ "\n",
130
+ "results['final_loss'] = results['losses'][-1]\n",
131
+ "results['avg_epoch_time'] = sum(results['epoch_times']) / len(results['epoch_times'])\n",
132
+ "\n",
133
+ "with open(f'/home/dev/results_{EXP_NAME}.json', 'w') as f:\n",
134
+ " json.dump(results, f)\n",
135
+ "print(f\"Results saved to /home/dev/results_{EXP_NAME}.json\")\n",
136
+ "SCRIPT\n",
137
+ "\"\"\")\n",
138
+ "\n",
139
+ "# Verify\n",
140
+ "result = base.exec(\"ls -la /home/dev/train.py && python3 -c 'import wandb, timm; print(\\\"packages OK\\\")'\")\n",
141
+ "print(result.stdout.strip())"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "markdown",
146
+ "metadata": {},
147
+ "source": [
148
+ "## Step 2: Shut Down and Clone the Disk\n",
149
+ "\n",
150
+ "Cancel the base reservation (disk is snapshotted automatically),\n",
151
+ "then clone it for a parallel experiment."
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "t0 = time.time()\n",
161
+ "base.cancel()\n",
162
+ "cancel_time = time.time() - t0\n",
163
+ "print(f\"Base cancelled in {cancel_time:.1f}s (disk snapshotted)\")"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": null,
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": [
172
+ "# Clone the disk for the second experiment\n",
173
+ "t0 = time.time()\n",
174
+ "client.clone_disk(\"experiment-base\", \"experiment-variant\")\n",
175
+ "clone_time = time.time() - t0\n",
176
+ "print(f\"Disk cloned in {clone_time:.1f}s\")\n",
177
+ "\n",
178
+ "# Show both disks\n",
179
+ "for disk in client.disks():\n",
180
+ " if 'experiment' in disk.name:\n",
181
+ " print(f\" {disk.name:25s} {disk.size_gb}GB {disk.snapshot_count} snapshots\")"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "markdown",
186
+ "metadata": {},
187
+ "source": [
188
+ "## Step 3: Run Parallel Experiments\n",
189
+ "\n",
190
+ "Launch two reservations simultaneously \u2014 one on the original disk (high LR),\n",
191
+ "one on the cloned disk (low LR). Both have the same training script pre-installed."
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": null,
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "experiments = [\n",
201
+ " {\"name\": \"high-lr\", \"disk\": \"experiment-base\", \"env\": \"LR=0.01 BATCH_SIZE=128 EPOCHS=5 EXP_NAME=high_lr\"},\n",
202
+ " {\"name\": \"low-lr\", \"disk\": \"experiment-variant\", \"env\": \"LR=0.0001 BATCH_SIZE=32 EPOCHS=5 EXP_NAME=low_lr\"},\n",
203
+ "]\n",
204
+ "\n",
205
+ "def run_experiment(exp):\n",
206
+ " \"\"\"Reserve GPU, run training, collect results, cancel.\"\"\"\n",
207
+ " timings = {}\n",
208
+ " \n",
209
+ " # Reserve\n",
210
+ " t0 = time.time()\n",
211
+ " sb = client.reserve(\n",
212
+ " gpu_type=\"t4\",\n",
213
+ " gpu_count=1,\n",
214
+ " hours=0.5,\n",
215
+ " disk_name=exp[\"disk\"],\n",
216
+ " name=exp[\"name\"],\n",
217
+ " )\n",
218
+ " timings['reserve'] = time.time() - t0\n",
219
+ " \n",
220
+ " # Run training\n",
221
+ " t0 = time.time()\n",
222
+ " result = sb.exec(f\"{exp['env']} python3 /home/dev/train.py\", timeout=120)\n",
223
+ " timings['train'] = time.time() - t0\n",
224
+ " train_output = result.stdout.strip()\n",
225
+ " \n",
226
+ " # Collect results\n",
227
+ " exp_name = exp['env'].split('EXP_NAME=')[1].split()[0]\n",
228
+ " result = sb.exec(f\"cat /home/dev/results_{exp_name}.json\")\n",
229
+ " import json\n",
230
+ " results = json.loads(result.stdout.strip())\n",
231
+ " \n",
232
+ " # Cancel\n",
233
+ " t0 = time.time()\n",
234
+ " sb.cancel()\n",
235
+ " timings['cancel'] = time.time() - t0\n",
236
+ " \n",
237
+ " return {\n",
238
+ " 'experiment': exp['name'],\n",
239
+ " 'timings': timings,\n",
240
+ " 'results': results,\n",
241
+ " 'train_output': train_output,\n",
242
+ " }\n",
243
+ "\n",
244
+ "# Run both experiments in parallel\n",
245
+ "t_total = time.time()\n",
246
+ "with ThreadPoolExecutor(max_workers=2) as pool:\n",
247
+ " futures = {pool.submit(run_experiment, exp): exp['name'] for exp in experiments}\n",
248
+ " outputs = {}\n",
249
+ " for future in as_completed(futures):\n",
250
+ " name = futures[future]\n",
251
+ " outputs[name] = future.result()\n",
252
+ " print(f\"\u2705 {name} completed\")\n",
253
+ "\n",
254
+ "total_time = time.time() - t_total\n",
255
+ "print(f\"\\nBoth experiments completed in {total_time:.1f}s (parallel)\")"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "markdown",
260
+ "metadata": {},
261
+ "source": [
262
+ "## Step 4: Compare Results"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": null,
268
+ "metadata": {},
269
+ "outputs": [],
270
+ "source": [
271
+ "print(\"=\" * 60)\n",
272
+ "print(f\"{'Metric':<25s} {'High LR':>15s} {'Low LR':>15s}\")\n",
273
+ "print(\"=\" * 60)\n",
274
+ "\n",
275
+ "high = outputs['high-lr']['results']\n",
276
+ "low = outputs['low-lr']['results']\n",
277
+ "\n",
278
+ "print(f\"{'Learning Rate':<25s} {high['lr']:>15.4f} {low['lr']:>15.4f}\")\n",
279
+ "print(f\"{'Batch Size':<25s} {high['batch_size']:>15d} {low['batch_size']:>15d}\")\n",
280
+ "print(f\"{'Final Loss':<25s} {high['final_loss']:>15.4f} {low['final_loss']:>15.4f}\")\n",
281
+ "print(f\"{'Avg Epoch Time (s)':<25s} {high['avg_epoch_time']:>15.2f} {low['avg_epoch_time']:>15.2f}\")\n",
282
+ "print()\n",
283
+ "\n",
284
+ "# Loss progression\n",
285
+ "print(\"Loss progression:\")\n",
286
+ "for i in range(len(high['losses'])):\n",
287
+ " print(f\" Epoch {i+1}: high_lr={high['losses'][i]:.4f} low_lr={low['losses'][i]:.4f}\")"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "markdown",
292
+ "metadata": {},
293
+ "source": [
294
+ "## Step 5: Timing Breakdown"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": null,
300
+ "metadata": {},
301
+ "outputs": [],
302
+ "source": [
303
+ "print(\"\\n\u23f1\ufe0f Timing Breakdown\")\n",
304
+ "print(\"=\" * 60)\n",
305
+ "print(f\"{'Phase':<25s} {'High LR':>15s} {'Low LR':>15s}\")\n",
306
+ "print(\"-\" * 60)\n",
307
+ "\n",
308
+ "for phase in ['reserve', 'train', 'cancel']:\n",
309
+ " h = outputs['high-lr']['timings'][phase]\n",
310
+ " l = outputs['low-lr']['timings'][phase]\n",
311
+ " print(f\"{phase.capitalize():<25s} {h:>14.1f}s {l:>14.1f}s\")\n",
312
+ "\n",
313
+ "print(\"-\" * 60)\n",
314
+ "h_total = sum(outputs['high-lr']['timings'].values())\n",
315
+ "l_total = sum(outputs['low-lr']['timings'].values())\n",
316
+ "print(f\"{'Total (sequential)':<25s} {h_total:>14.1f}s {l_total:>14.1f}s\")\n",
317
+ "print(f\"{'Total (parallel)':<25s} {total_time:>14.1f}s {'\u2014':>15s}\")\n",
318
+ "print(f\"{'Speedup':<25s} {(h_total + l_total) / total_time:>14.1f}x {'':>15s}\")\n",
319
+ "print()\n",
320
+ "print(f\"Disk clone time: {clone_time:.1f}s\")\n",
321
+ "print(f\"Base setup + cancel: {reserve_time + cancel_time:.1f}s\")"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "markdown",
326
+ "metadata": {},
327
+ "source": [
328
+ "## Cleanup\n",
329
+ "\n",
330
+ "Remove the experiment disks if you don't need them."
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": null,
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": [
339
+ "# Uncomment to delete experiment disks:\n",
340
+ "# client.delete_disk(\"experiment-base\")\n",
341
+ "# client.delete_disk(\"experiment-variant\")\n",
342
+ "print(\"Done! Disks preserved for inspection.\")\n",
343
+ "print(\"Delete with: client.delete_disk('experiment-base')\")"
344
+ ]
345
+ }
346
+ ],
347
+ "metadata": {
348
+ "kernelspec": {
349
+ "display_name": "Python 3",
350
+ "language": "python",
351
+ "name": "python3"
352
+ },
353
+ "language_info": {
354
+ "name": "python",
355
+ "version": "3.12.0"
356
+ }
357
+ },
358
+ "nbformat": 4,
359
+ "nbformat_minor": 4
360
+ }
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev-sdk"
7
- version = "0.1.0"
7
+ version = "0.6.5"
8
8
  description = "Python SDK for GPU development server reservations"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -57,4 +57,4 @@ __all__ = [
57
57
  "ExecResult",
58
58
  ]
59
59
 
60
- __version__ = "0.1.0"
60
+ __version__ = "0.6.5"
@@ -166,7 +166,7 @@ class AwsBackend:
166
166
  "no_persistent_disk": params.get("no_persistent_disk", False),
167
167
  "github_user": params.get("github_user", ""),
168
168
  "preserve_entrypoint": params.get("preserve_entrypoint", False),
169
- "version": "0.6.0",
169
+ "version": __import__("gpu_dev").__version__,
170
170
  }
171
171
  if params.get("disk_name"):
172
172
  message["disk_name"] = params["disk_name"]
@@ -282,6 +282,39 @@ class AwsBackend:
282
282
  for item in resp.get("Items", [])
283
283
  ]
284
284
 
285
+ def clone_disk(self, user_id: str, source_disk: str, target_disk: str) -> str:
286
+ import uuid
287
+ from datetime import datetime, timezone
288
+ operation_id = str(uuid.uuid4())
289
+ self._sqs.send_message(
290
+ QueueUrl=self._get_queue_url(),
291
+ MessageBody=json.dumps({
292
+ "action": "clone_disk",
293
+ "operation_id": operation_id,
294
+ "user_id": user_id,
295
+ "source_disk": source_disk,
296
+ "target_disk": target_disk,
297
+ "requested_at": datetime.now(timezone.utc).isoformat(),
298
+ }),
299
+ )
300
+ return operation_id
301
+
302
+ def delete_disk(self, user_id: str, disk_name: str) -> str:
303
+ import uuid
304
+ from datetime import datetime, timezone
305
+ operation_id = str(uuid.uuid4())
306
+ self._sqs.send_message(
307
+ QueueUrl=self._get_queue_url(),
308
+ MessageBody=json.dumps({
309
+ "action": "delete_disk",
310
+ "operation_id": operation_id,
311
+ "user_id": user_id,
312
+ "disk_name": disk_name,
313
+ "requested_at": datetime.now(timezone.utc).isoformat(),
314
+ }),
315
+ )
316
+ return operation_id
317
+
285
318
  def add_user(self, reservation_id: str, user_id: str, github_username: str) -> bool:
286
319
  message = {
287
320
  "type": "add_user",
@@ -48,6 +48,14 @@ class Backend(Protocol):
48
48
  """List persistent disks for a user."""
49
49
  ...
50
50
 
51
+ def clone_disk(self, user_id: str, source_disk: str, target_disk: str) -> str:
52
+ """Clone a disk. Returns operation_id."""
53
+ ...
54
+
55
+ def delete_disk(self, user_id: str, disk_name: str) -> str:
56
+ """Delete a disk. Returns operation_id."""
57
+ ...
58
+
51
59
  def add_user(self, reservation_id: str, user_id: str, github_username: str) -> bool:
52
60
  """Grant SSH access to another user."""
53
61
  ...
@@ -244,6 +244,42 @@ class GpuDev:
244
244
  user_info = self._auth()
245
245
  return self._backend.list_disks(user_info["user_id"])
246
246
 
247
+ def clone_disk(self, source: str, target: str, *, poll: bool = True, timeout: int = 120) -> str:
248
+ """Clone a persistent disk.
249
+
250
+ Args:
251
+ source: Name of the source disk.
252
+ target: Name for the new cloned disk.
253
+ poll: Wait for the clone to complete (default True).
254
+ timeout: Max seconds to wait when polling.
255
+
256
+ Returns:
257
+ Operation ID.
258
+ """
259
+ user_info = self._auth()
260
+ op_id = self._backend.clone_disk(user_info["user_id"], source, target)
261
+ if poll:
262
+ import time
263
+ deadline = time.time() + timeout
264
+ while time.time() < deadline:
265
+ disks = self._backend.list_disks(user_info["user_id"])
266
+ if any(d.name == target for d in disks):
267
+ return op_id
268
+ time.sleep(2)
269
+ return op_id
270
+
271
+ def delete_disk(self, name: str) -> str:
272
+ """Delete a persistent disk.
273
+
274
+ Args:
275
+ name: Disk name to delete.
276
+
277
+ Returns:
278
+ Operation ID.
279
+ """
280
+ user_info = self._auth()
281
+ return self._backend.delete_disk(user_info["user_id"], name)
282
+
247
283
  def search_logs(
248
284
  self,
249
285
  reservation_id: str,
@@ -25,7 +25,7 @@ resource "aws_lambda_function" "availability_updater" {
25
25
  function_name = "${var.prefix}-availability-updater"
26
26
  role = aws_iam_role.availability_updater_role.arn
27
27
  handler = "index.handler"
28
- runtime = "python3.11"
28
+ runtime = "python3.13"
29
29
  timeout = 300
30
30
  # 1769 MB is the sweet spot — Lambda allocates one full vCPU at this threshold.
31
31
  # Beyond 1769 MB you get fractional second vCPUs (less linear gain), and our work is single-threaded.
@@ -15,6 +15,7 @@ resource "aws_lambda_function" "reservation_expiry" {
15
15
  environment {
16
16
  variables = {
17
17
  RESERVATIONS_TABLE = aws_dynamodb_table.gpu_reservations.name
18
+ DISKS_TABLE_NAME = aws_dynamodb_table.disks.name
18
19
  EKS_CLUSTER_NAME = aws_eks_cluster.gpu_dev_cluster.name
19
20
  REGION = local.current_config.aws_region
20
21
  WARNING_MINUTES = "30" # Warn 30 minutes before expiry
@@ -330,6 +330,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None, active_reservations=
330
330
 
331
331
  single_node_max = 0 # Max available on any single node
332
332
  schedulable_total_gpus = 0 # Total GPUs on schedulable (non-cordoned) nodes
333
+ full_node_gpu_counts = [] # Track actual GPU count per full node (accounts for MIG)
333
334
  for node in nodes.items:
334
335
  if is_node_ready_and_schedulable(node):
335
336
  available_on_node = get_available_gpus_on_node(v1, node, gpu_type)
@@ -349,24 +350,24 @@ def update_gpu_availability(gpu_type: str, k8s_client=None, active_reservations=
349
350
  # Count as full node if all GPUs are available
350
351
  if total_on_node > 0 and available_on_node == total_on_node:
351
352
  full_nodes_available += 1
353
+ full_node_gpu_counts.append(total_on_node)
352
354
 
353
355
  total_gpus = schedulable_total_gpus
354
356
  # For MIG SKUs override running_instances to the number of MIG-partitioned nodes
355
357
  if is_mig_sku:
356
358
  running_instances = sum(1 for n in nodes.items if is_node_ready_and_schedulable(n) and int((n.status.allocatable or {}).get(resource_name, "0")) > 0)
357
359
 
358
- # Calculate max reservable considering multinode scenarios
359
- # Only high-end GPU types support multinode (up to 4 nodes = 32 GPUs)
360
+ # Calculate max reservable using actual per-node GPU counts (not ASG gpus_per_instance)
361
+ # This correctly accounts for MIG-configured nodes that have fewer full GPUs
360
362
  multinode_gpu_types = ['h100', 'h200', 'b200', 'a100']
361
- if gpu_type in multinode_gpu_types and gpus_per_instance == 8:
362
- max_nodes = min(4, full_nodes_available) # Up to 4 nodes
363
- max_reservable = max_nodes * gpus_per_instance # e.g., 4 * 8 = 32 GPUs
363
+ if gpu_type in multinode_gpu_types and full_node_gpu_counts:
364
+ # Sum the top N full nodes (up to 4 for multinode)
365
+ sorted_counts = sorted(full_node_gpu_counts, reverse=True)
366
+ max_reservable = sum(sorted_counts[:4])
364
367
 
365
- # If no full nodes available, fall back to single node max
366
368
  if max_reservable == 0:
367
369
  max_reservable = single_node_max
368
370
  else:
369
- # For all other GPU types (T4, L4, T4-small, etc.), only single node
370
371
  max_reservable = single_node_max
371
372
 
372
373
  logger.info(f"Found {full_nodes_available} full nodes available for {gpu_type}, max reservable: {max_reservable} (single node max: {single_node_max})")
@@ -892,6 +892,7 @@ def handler(event, context):
892
892
  logger.error(f"Error cleaning up soft-deleted snapshots: {e}")
893
893
  deleted_snapshot_count = 0
894
894
 
895
+
895
896
  return {
896
897
  "statusCode": 200,
897
898
  "body": json.dumps(