gpu-dev 0.5.9__tar.gz → 0.5.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/.github/workflows/no-gitlinks.yml +1 -1
  2. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/PKG-INFO +1 -1
  3. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  4. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
  5. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +17 -2
  6. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +45 -31
  7. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +3 -0
  8. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/pyproject.toml +1 -1
  9. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_processor/index.py +12 -4
  10. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda.tf +2 -2
  11. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/main.tf +40 -0
  12. gpu_dev-0.5.12/terraform-gpu-devservers/scripts/b200-mig-setup.sh +75 -0
  13. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/.github/workflows/publish.yml +0 -0
  14. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/.gitignore +0 -0
  15. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/CLAUDE.md +0 -0
  16. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/PROGRESS.md +0 -0
  17. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/PR_DESCRIPTION.md +0 -0
  18. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/TODO.md +0 -0
  19. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/admin/README.md +0 -0
  20. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/admin/generate_stats.py +0 -0
  21. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/admin/requirements.txt +0 -0
  22. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/README.md +0 -0
  23. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  24. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  25. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  26. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  27. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  28. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  29. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  30. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  31. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  32. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  33. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  34. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  35. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  36. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/docs/USER_GUIDE.md +0 -0
  37. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/docs/devgpu-features.html +0 -0
  38. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/docs/docker-mark-blue.svg +0 -0
  39. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/docs/icons8-cursor-ai.svg +0 -0
  40. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/post.md +0 -0
  41. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/setup.cfg +0 -0
  42. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  43. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  44. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/README.md +0 -0
  45. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/alb.tf +0 -0
  46. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/availability.tf +0 -0
  47. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/backend.tf +0 -0
  48. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  49. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  50. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  51. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/bash_profile +0 -0
  52. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/bashrc +0 -0
  53. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  54. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  55. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  56. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  57. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/motd_script +0 -0
  58. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  59. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/profile +0 -0
  60. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  61. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  62. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  63. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/shell_env +0 -0
  64. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/ssh_config +0 -0
  65. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/zprofile +0 -0
  66. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/zshrc +0 -0
  67. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  68. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker-build.tf +0 -0
  69. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  70. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  71. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ecr.tf +0 -0
  72. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/efs.tf +0 -0
  73. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/eks.tf +0 -0
  74. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/expiry.tf +0 -0
  75. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/git-cache.tf +0 -0
  76. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/kubernetes.tf +0 -0
  77. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  78. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  79. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  80. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  81. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  82. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  83. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  84. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  85. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  86. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  87. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  88. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  89. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  90. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  91. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  92. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  93. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  94. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  95. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  96. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/monitoring.tf +0 -0
  97. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/outputs.tf +0 -0
  98. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/pyproject.toml +0 -0
  99. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/queue.tf +0 -0
  100. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/route53.tf +0 -0
  101. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  102. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  103. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  104. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  105. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  106. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  107. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  108. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  109. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  110. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  111. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/switch-to.sh +0 -0
  112. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  113. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  114. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  115. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  116. {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/variables.tf +0 -0
@@ -14,7 +14,7 @@ jobs:
14
14
  uses: actions/checkout@v4
15
15
  - name: Ensure no gitlinks are tracked
16
16
  run: |
17
- gitlinks=$(git ls-files -s | awk "$1 == 160000 {print}")
17
+ gitlinks=$(git ls-files -s | awk '$1 == 160000 {print}')
18
18
  if [ -n "$gitlinks" ]; then
19
19
  echo "Unexpected gitlinks found:"
20
20
  echo "$gitlinks"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.9
3
+ Version: 0.5.12
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.9
3
+ Version: 0.5.12
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -101,6 +101,7 @@ terraform-gpu-devservers/migrations/check_snapshots.py
101
101
  terraform-gpu-devservers/migrations/migrate_disks_to_named.py
102
102
  terraform-gpu-devservers/migrations/run_backfill.sh
103
103
  terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md
104
+ terraform-gpu-devservers/scripts/b200-mig-setup.sh
104
105
  terraform-gpu-devservers/scripts/detect_empty_volumes.sh
105
106
  terraform-gpu-devservers/scripts/ec2_avail_probe.sh
106
107
  terraform-gpu-devservers/scripts/inspect_user_data.sh
@@ -495,9 +495,9 @@ def main(ctx: click.Context) -> None:
495
495
  "--gpu-type",
496
496
  "-t",
497
497
  type=click.Choice(
498
- ["b200", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
498
+ ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
499
499
  ),
500
- help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices (partial GPU on a single shared node): h100-mig-1g (10 GB / 1/7 H100 compute), h100-mig-2g (20 GB / 2/7 H100), h100-mig-3g (40 GB / 3/7 H100). CPU only: cpu-arm, cpu-x86.",
500
+ help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
501
501
  )
502
502
  @click.option(
503
503
  "--hours",
@@ -656,6 +656,9 @@ def reserve(
656
656
  "h100-mig-1g": {"max_gpus": 16, "instance_type": "p5.48xlarge"},
657
657
  "h100-mig-2g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
658
658
  "h100-mig-3g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
659
+ "b200-mig-1g": {"max_gpus": 4, "instance_type": "p6-b200.48xlarge"},
660
+ "b200-mig-2g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
661
+ "b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
659
662
  "h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
660
663
  "b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
661
664
  "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
@@ -2454,6 +2457,9 @@ def _show_availability() -> None:
2454
2457
  "h100-mig-1g": "Hopper (sm90, MIG 10GB)",
2455
2458
  "h100-mig-2g": "Hopper (sm90, MIG 20GB)",
2456
2459
  "h100-mig-3g": "Hopper (sm90, MIG 40GB)",
2460
+ "b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
2461
+ "b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
2462
+ "b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
2457
2463
  "t4": "Turing (sm75)",
2458
2464
  "cpu-x86": "CPU (x86_64)",
2459
2465
  "cpu-arm": "CPU (arm64)",
@@ -2462,6 +2468,9 @@ def _show_availability() -> None:
2462
2468
  # Sort order: newest GPU architectures first, then CPUs at the bottom
2463
2469
  arch_priority = {
2464
2470
  "Blackwell (sm100)": 0,
2471
+ "Blackwell (sm100, MIG 90GB)": 0,
2472
+ "Blackwell (sm100, MIG 45GB)": 0,
2473
+ "Blackwell (sm100, MIG 23GB)": 0,
2465
2474
  "Blackwell (sm120)": 0,
2466
2475
  "Hopper (sm90)": 1,
2467
2476
  "Hopper (sm90, MIG 40GB)": 1,
@@ -2609,6 +2618,9 @@ def _show_availability_watch(interval: int) -> None:
2609
2618
  "h100-mig-1g": "Hopper (sm90, MIG 10GB)",
2610
2619
  "h100-mig-2g": "Hopper (sm90, MIG 20GB)",
2611
2620
  "h100-mig-3g": "Hopper (sm90, MIG 40GB)",
2621
+ "b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
2622
+ "b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
2623
+ "b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
2612
2624
  "t4": "Turing (sm75)",
2613
2625
  "cpu-x86": "CPU (x86_64)",
2614
2626
  "cpu-arm": "CPU (arm64)",
@@ -2617,6 +2629,9 @@ def _show_availability_watch(interval: int) -> None:
2617
2629
  # Sort order: newest GPU architectures first, then CPUs at the bottom
2618
2630
  arch_priority = {
2619
2631
  "Blackwell (sm100)": 0,
2632
+ "Blackwell (sm100, MIG 90GB)": 0,
2633
+ "Blackwell (sm100, MIG 45GB)": 0,
2634
+ "Blackwell (sm100, MIG 23GB)": 0,
2620
2635
  "Blackwell (sm120)": 0,
2621
2636
  "Hopper (sm90)": 1,
2622
2637
  "Hopper (sm90, MIG 40GB)": 1,
@@ -64,17 +64,25 @@ def select_gpu_type_interactive(
64
64
  if "-mig-" not in gt
65
65
  }
66
66
 
67
- # Aggregate MIG slice availability so we can hint it on the h100 row of this picker.
68
- mig_total_available = sum(
69
- int(info.get("available", 0))
70
- for gt, info in (availability_info or {}).items()
71
- if gt.startswith("h100-mig-")
72
- )
73
- mig_total_capacity = sum(
74
- int(info.get("total", 0))
75
- for gt, info in (availability_info or {}).items()
76
- if gt.startswith("h100-mig-")
77
- )
67
+ # Aggregate MIG slice availability per parent type, hinted on the h100/b200 rows.
68
+ def _mig_aggregates(parent: str):
69
+ avail = sum(
70
+ int(info.get("available", 0))
71
+ for gt, info in (availability_info or {}).items()
72
+ if gt.startswith(f"{parent}-mig-")
73
+ )
74
+ cap = sum(
75
+ int(info.get("total", 0))
76
+ for gt, info in (availability_info or {}).items()
77
+ if gt.startswith(f"{parent}-mig-")
78
+ )
79
+ return avail, cap
80
+
81
+ h100_mig_avail, h100_mig_capacity = _mig_aggregates("h100")
82
+ b200_mig_avail, b200_mig_capacity = _mig_aggregates("b200")
83
+ # Backwards-compat aliases for the existing h100 row code below.
84
+ mig_total_available = h100_mig_avail
85
+ mig_total_capacity = h100_mig_capacity
78
86
 
79
87
  # Display availability table first
80
88
  console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
@@ -146,6 +154,8 @@ def select_gpu_type_interactive(
146
154
  choice_label += f" - {queue_length} in queue"
147
155
  if gpu_type == "h100" and mig_total_capacity > 0:
148
156
  choice_label += f" — also {mig_total_available}/{mig_total_capacity} MIG slices"
157
+ elif gpu_type == "b200" and b200_mig_capacity > 0:
158
+ choice_label += f" — also {b200_mig_avail}/{b200_mig_capacity} MIG slices"
149
159
 
150
160
  choices.append(questionary.Choice(title=choice_label, value=gpu_type))
151
161
 
@@ -223,27 +233,31 @@ def select_gpu_count_interactive(
223
233
  parent_size_etas = parent_info.get("size_etas", {}) or {}
224
234
  _now_ts = int(_time.time())
225
235
 
226
- # MIG slice submenu: only for h100. Each tuple is (target_gpu_type, gpu_count, gb_label).
236
+ # MIG slice submenu: h100 (16+8+8 slices/node) or b200 (4+2+2 slices/node).
227
237
  mig_options = []
228
- if gpu_type == "h100":
229
- # Map to internal SKUs; the count menu surfaces 1/2/4 of each slice size.
230
- mig_specs = [
231
- ("h100-mig-1g", "10GB"),
232
- ("h100-mig-2g", "20GB"),
233
- ("h100-mig-3g", "40GB"),
234
- ]
235
- for sku, gb in mig_specs:
236
- slice_max = {"h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8}[sku]
237
- free = None
238
- if availability_info and sku in availability_info:
239
- free = availability_info[sku].get("available", 0)
240
- for n in [1, 2, 4]:
241
- if n > slice_max:
242
- continue
243
- noun = "slice" if n == 1 else "slices"
244
- avail_suffix = f" [{free} free]" if free is not None else ""
245
- label = f"{n} × {gb} {noun}{avail_suffix}"
246
- mig_options.append((sku, n, label))
238
+ mig_spec_map = {
239
+ "h100": [
240
+ ("h100-mig-1g", "10GB", 16),
241
+ ("h100-mig-2g", "20GB", 8),
242
+ ("h100-mig-3g", "40GB", 8),
243
+ ],
244
+ "b200": [
245
+ ("b200-mig-1g", "23GB", 4),
246
+ ("b200-mig-2g", "45GB", 2),
247
+ ("b200-mig-3g", "90GB", 2),
248
+ ],
249
+ }
250
+ for sku, gb, slice_max in mig_spec_map.get(gpu_type, []):
251
+ free = None
252
+ if availability_info and sku in availability_info:
253
+ free = availability_info[sku].get("available", 0)
254
+ for n in [1, 2, 4]:
255
+ if n > slice_max:
256
+ continue
257
+ noun = "slice" if n == 1 else "slices"
258
+ avail_suffix = f" [{free} free]" if free is not None else ""
259
+ label = f"{n} × {gb} {noun}{avail_suffix}"
260
+ mig_options.append((sku, n, label))
247
261
 
248
262
  # Filter single-node by actual max for this GPU type
249
263
  valid_counts = [count for count in valid_counts if count <= max_gpus]
@@ -543,6 +543,9 @@ class ReservationManager:
543
543
  "h100-mig-1g": {"max_gpus": 16},
544
544
  "h100-mig-2g": {"max_gpus": 8},
545
545
  "h100-mig-3g": {"max_gpus": 8},
546
+ "b200-mig-1g": {"max_gpus": 4},
547
+ "b200-mig-2g": {"max_gpus": 2},
548
+ "b200-mig-3g": {"max_gpus": 2},
546
549
  "h200": {"max_gpus": 8},
547
550
  "b200": {"max_gpus": 8},
548
551
  }
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.9"
7
+ version = "0.5.12"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -71,6 +71,10 @@ GPU_CONFIG = {
71
71
  "h100-mig-1g": {"instance_type": "p5.48xlarge", "max_gpus": 16, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.10gb", "node_gpu_type": "h100"},
72
72
  "h100-mig-2g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.20gb", "node_gpu_type": "h100"},
73
73
  "h100-mig-3g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.40gb", "node_gpu_type": "h100"},
74
+ # B200 MIG slices on the b200-6full-2mig-balanced node (6 full GPUs + 2 partitioned per node).
75
+ "b200-mig-1g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 4, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.23gb", "node_gpu_type": "b200"},
76
+ "b200-mig-2g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.45gb", "node_gpu_type": "b200"},
77
+ "b200-mig-3g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.90gb", "node_gpu_type": "b200"},
74
78
  "t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
75
79
  "g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
76
80
  "a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
@@ -323,12 +327,12 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
323
327
  return target_az
324
328
 
325
329
  logger.warning(f"No ready/schedulable {gpu_type} nodes found in cluster")
326
- return None
330
+ return None, None
327
331
 
328
332
  except Exception as e:
329
333
  logger.error(f"Error determining target AZ for {gpu_type}: {str(e)}")
330
- # Fallback to primary AZ if detection fails
331
- return PRIMARY_AVAILABILITY_ZONE
334
+ # Fallback to primary AZ if detection fails (no node hint — let k8s pick).
335
+ return PRIMARY_AVAILABILITY_ZONE, None
332
336
 
333
337
 
334
338
  def check_for_multiple_volumes(user_id):
@@ -2167,7 +2171,8 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
2167
2171
  # Validate GPU type
2168
2172
  valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
2169
2173
  "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
2170
- "h200", "b200", "cpu-arm", "cpu-x86"]
2174
+ "h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
2175
+ "cpu-arm", "cpu-x86"]
2171
2176
  if gpu_type not in valid_gpu_types:
2172
2177
  error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
2173
2178
  logger.error(error_msg)
@@ -2408,6 +2413,9 @@ def update_gpu_availability_table(
2408
2413
  "h100-mig-1g": {"gpus_per_instance": 16},
2409
2414
  "h100-mig-2g": {"gpus_per_instance": 8},
2410
2415
  "h100-mig-3g": {"gpus_per_instance": 8},
2416
+ "b200-mig-1g": {"gpus_per_instance": 4},
2417
+ "b200-mig-2g": {"gpus_per_instance": 2},
2418
+ "b200-mig-3g": {"gpus_per_instance": 2},
2411
2419
  "h200": {"gpus_per_instance": 8},
2412
2420
  "b200": {"gpus_per_instance": 8},
2413
2421
  }
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.9"
184
- MIN_CLI_VERSION = "0.5.5"
183
+ LAMBDA_VERSION = "0.5.12"
184
+ MIN_CLI_VERSION = "0.5.9"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
187
187
  }, local.alb_env_vars)
@@ -255,6 +255,46 @@ locals {
255
255
  k8s_resource = "nvidia.com/mig-3g.40gb"
256
256
  node_gpu_type = "h100"
257
257
  }
258
+ # B200 MIG slices — virtual SKUs backed by ONE B200 node labelled with the custom
259
+ # mig_profile "b200-6full-2mig-balanced": GPUs 0-5 stay as full B200 (still reservable
260
+ # via --gpu-type b200), GPUs 6-7 get partitioned per-GPU into 2x1g.23gb + 1x2g.45gb +
261
+ # 1x3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large slices.
262
+ "b200-mig-1g" = {
263
+ instance_type = null
264
+ instance_types = null
265
+ instance_count = 0
266
+ gpus_per_instance = 4 # 2 partitioned GPUs * 2 slices each
267
+ use_placement_group = false
268
+ architecture = "x86_64"
269
+ efa_network_cards = 0
270
+ virtual = true
271
+ k8s_resource = "nvidia.com/mig-1g.23gb"
272
+ node_gpu_type = "b200"
273
+ }
274
+ "b200-mig-2g" = {
275
+ instance_type = null
276
+ instance_types = null
277
+ instance_count = 0
278
+ gpus_per_instance = 2 # 2 partitioned GPUs * 1 slice each
279
+ use_placement_group = false
280
+ architecture = "x86_64"
281
+ efa_network_cards = 0
282
+ virtual = true
283
+ k8s_resource = "nvidia.com/mig-2g.45gb"
284
+ node_gpu_type = "b200"
285
+ }
286
+ "b200-mig-3g" = {
287
+ instance_type = null
288
+ instance_types = null
289
+ instance_count = 0
290
+ gpus_per_instance = 2 # 2 partitioned GPUs * 1 slice each
291
+ use_placement_group = false
292
+ architecture = "x86_64"
293
+ efa_network_cards = 0
294
+ virtual = true
295
+ k8s_resource = "nvidia.com/mig-3g.90gb"
296
+ node_gpu_type = "b200"
297
+ }
258
298
  "cpu-arm" = {
259
299
  instance_type = "c7g.8xlarge"
260
300
  instance_types = null
@@ -0,0 +1,75 @@
1
+ #!/bin/bash
2
+ # Post-deploy setup for B200 MIG split (6 full + 2 partitioned per node).
3
+ # Run ONCE after PR #77 is merged + tf applied + the new docker/lambda is live.
4
+
5
+ set -e
6
+
7
+ NS=gpu-operator
8
+ CM=default-mig-parted-config
9
+ PROFILE_NAME=b200-6full-2mig-balanced
10
+
11
+ echo "=== Checking current MIG profile in ConfigMap ==="
12
+ if kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' | grep -q "$PROFILE_NAME:"; then
13
+ echo "Profile $PROFILE_NAME already present — skipping ConfigMap edit"
14
+ else
15
+ echo "Profile $PROFILE_NAME missing. Patching ConfigMap..."
16
+
17
+ # Save current ConfigMap content
18
+ kubectl -n "$NS" get configmap "$CM" -o yaml > /tmp/mig-config-backup.yaml
19
+ echo "Backup saved to /tmp/mig-config-backup.yaml"
20
+
21
+ # Append our profile under mig-configs:
22
+ # NOTE: this is a sed-driven append. ClusterPolicy's controller MAY revert this if it
23
+ # reconciles. If you see the profile disappear, re-run this script. If it keeps reverting,
24
+ # we'll need to fork the ConfigMap (next iteration).
25
+ kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' > /tmp/mig-config.yaml
26
+
27
+ cat >> /tmp/mig-config.yaml <<'EOF'
28
+
29
+ # Mixed B200 split: GPUs 0-5 stay full (reservable as --gpu-type b200), GPUs 6-7 partitioned.
30
+ # Per partitioned GPU: 2x 1g.23gb + 1x 2g.45gb + 1x 3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large.
31
+ b200-6full-2mig-balanced:
32
+ - device-filter: ["0x290110DE"]
33
+ devices: [0, 1, 2, 3, 4, 5]
34
+ mig-enabled: false
35
+ - device-filter: ["0x290110DE"]
36
+ devices: [6, 7]
37
+ mig-enabled: true
38
+ mig-devices:
39
+ "1g.23gb": 2
40
+ "2g.45gb": 1
41
+ "3g.90gb": 1
42
+ EOF
43
+
44
+ # Re-encode and patch
45
+ kubectl -n "$NS" create configmap "$CM" --from-file=config.yaml=/tmp/mig-config.yaml --dry-run=client -o yaml \
46
+ | kubectl -n "$NS" patch configmap "$CM" --patch-file=/dev/stdin
47
+ echo "ConfigMap patched."
48
+ fi
49
+
50
+ echo
51
+ echo "=== Picking a B200 node to label ==="
52
+ NODE=$(kubectl get nodes -l GpuType=b200 -o jsonpath='{.items[0].metadata.name}')
53
+ if [ -z "$NODE" ]; then
54
+ echo "No B200 nodes found. Exiting."
55
+ exit 1
56
+ fi
57
+ echo "Will label: $NODE"
58
+ read -p "Proceed? (y/N): " CONFIRM
59
+ if [ "$CONFIRM" != "y" ]; then
60
+ echo "Aborted."
61
+ exit 0
62
+ fi
63
+
64
+ kubectl label node "$NODE" "nvidia.com/mig.config=$PROFILE_NAME" --overwrite
65
+ echo "Node labelled. nvidia-mig-manager will partition GPUs 6-7 (drains existing pods if any)."
66
+ echo
67
+ echo "Watch progress with:"
68
+ echo " kubectl logs -n gpu-operator -l app=nvidia-mig-manager -f"
69
+ echo " kubectl get node $NODE -o jsonpath='{.status.allocatable}' | jq ."
70
+ echo
71
+ echo "After ~2-5 min, allocatable should show:"
72
+ echo " nvidia.com/gpu: 6"
73
+ echo " nvidia.com/mig-1g.23gb: 4"
74
+ echo " nvidia.com/mig-2g.45gb: 2"
75
+ echo " nvidia.com/mig-3g.90gb: 2"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes