gpu-dev 0.5.11__tar.gz → 0.5.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/.github/workflows/no-gitlinks.yml +1 -1
  2. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/PKG-INFO +1 -1
  3. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  4. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +2 -0
  5. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +17 -2
  6. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +45 -31
  7. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +3 -0
  8. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/pyproject.toml +1 -1
  9. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/kubernetes.tf +8 -0
  10. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/index.py +9 -1
  11. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda.tf +1 -1
  12. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/main.tf +40 -0
  13. gpu_dev-0.5.13/terraform-gpu-devservers/mig-config.tf +55 -0
  14. gpu_dev-0.5.13/terraform-gpu-devservers/mig-parted-config.yaml +528 -0
  15. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/.github/workflows/publish.yml +0 -0
  16. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/.gitignore +0 -0
  17. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/CLAUDE.md +0 -0
  18. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/PROGRESS.md +0 -0
  19. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/PR_DESCRIPTION.md +0 -0
  20. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/TODO.md +0 -0
  21. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/admin/README.md +0 -0
  22. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/admin/generate_stats.py +0 -0
  23. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/admin/requirements.txt +0 -0
  24. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/README.md +0 -0
  25. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  26. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  27. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  28. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  29. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  30. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  31. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  32. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  33. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  34. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  35. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  36. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  37. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  38. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/docs/USER_GUIDE.md +0 -0
  39. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/docs/devgpu-features.html +0 -0
  40. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/docs/docker-mark-blue.svg +0 -0
  41. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/docs/icons8-cursor-ai.svg +0 -0
  42. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/post.md +0 -0
  43. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/setup.cfg +0 -0
  44. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  45. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  46. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/README.md +0 -0
  47. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/alb.tf +0 -0
  48. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/availability.tf +0 -0
  49. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/backend.tf +0 -0
  50. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  51. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  52. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  53. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/bash_profile +0 -0
  54. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/bashrc +0 -0
  55. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  56. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  57. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  58. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  59. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/motd_script +0 -0
  60. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  61. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/profile +0 -0
  62. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  63. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  64. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  65. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/shell_env +0 -0
  66. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/ssh_config +0 -0
  67. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/zprofile +0 -0
  68. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/zshrc +0 -0
  69. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  70. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker-build.tf +0 -0
  71. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  72. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  73. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ecr.tf +0 -0
  74. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/efs.tf +0 -0
  75. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/eks.tf +0 -0
  76. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/expiry.tf +0 -0
  77. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/git-cache.tf +0 -0
  78. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  79. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  80. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  81. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  82. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  83. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  84. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  85. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  86. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  87. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  88. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  89. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  90. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  91. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  92. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  93. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  94. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  95. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  96. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  97. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/monitoring.tf +0 -0
  98. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/outputs.tf +0 -0
  99. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/pyproject.toml +0 -0
  100. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/queue.tf +0 -0
  101. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/route53.tf +0 -0
  102. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  103. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  104. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  105. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  106. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  107. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  108. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  109. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  110. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  111. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  112. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/switch-to.sh +0 -0
  113. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  114. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  115. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  116. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  117. {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/variables.tf +0 -0
@@ -14,7 +14,7 @@ jobs:
14
14
  uses: actions/checkout@v4
15
15
  - name: Ensure no gitlinks are tracked
16
16
  run: |
17
- gitlinks=$(git ls-files -s | awk "$1 == 160000 {print}")
17
+ gitlinks=$(git ls-files -s | awk '$1 == 160000 {print}')
18
18
  if [ -n "$gitlinks" ]; then
19
19
  echo "Unexpected gitlinks found:"
20
20
  echo "$gitlinks"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.11
3
+ Version: 0.5.13
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.11
3
+ Version: 0.5.13
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -47,6 +47,8 @@ terraform-gpu-devservers/git-cache.tf
47
47
  terraform-gpu-devservers/kubernetes.tf
48
48
  terraform-gpu-devservers/lambda.tf
49
49
  terraform-gpu-devservers/main.tf
50
+ terraform-gpu-devservers/mig-config.tf
51
+ terraform-gpu-devservers/mig-parted-config.yaml
50
52
  terraform-gpu-devservers/monitoring.tf
51
53
  terraform-gpu-devservers/outputs.tf
52
54
  terraform-gpu-devservers/pyproject.toml
@@ -495,9 +495,9 @@ def main(ctx: click.Context) -> None:
495
495
  "--gpu-type",
496
496
  "-t",
497
497
  type=click.Choice(
498
- ["b200", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
498
+ ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
499
499
  ),
500
- help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices (partial GPU on a single shared node): h100-mig-1g (10 GB / 1/7 H100 compute), h100-mig-2g (20 GB / 2/7 H100), h100-mig-3g (40 GB / 3/7 H100). CPU only: cpu-arm, cpu-x86.",
500
+ help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
501
501
  )
502
502
  @click.option(
503
503
  "--hours",
@@ -656,6 +656,9 @@ def reserve(
656
656
  "h100-mig-1g": {"max_gpus": 16, "instance_type": "p5.48xlarge"},
657
657
  "h100-mig-2g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
658
658
  "h100-mig-3g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
659
+ "b200-mig-1g": {"max_gpus": 4, "instance_type": "p6-b200.48xlarge"},
660
+ "b200-mig-2g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
661
+ "b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
659
662
  "h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
660
663
  "b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
661
664
  "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
@@ -2454,6 +2457,9 @@ def _show_availability() -> None:
2454
2457
  "h100-mig-1g": "Hopper (sm90, MIG 10GB)",
2455
2458
  "h100-mig-2g": "Hopper (sm90, MIG 20GB)",
2456
2459
  "h100-mig-3g": "Hopper (sm90, MIG 40GB)",
2460
+ "b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
2461
+ "b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
2462
+ "b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
2457
2463
  "t4": "Turing (sm75)",
2458
2464
  "cpu-x86": "CPU (x86_64)",
2459
2465
  "cpu-arm": "CPU (arm64)",
@@ -2462,6 +2468,9 @@ def _show_availability() -> None:
2462
2468
  # Sort order: newest GPU architectures first, then CPUs at the bottom
2463
2469
  arch_priority = {
2464
2470
  "Blackwell (sm100)": 0,
2471
+ "Blackwell (sm100, MIG 90GB)": 0,
2472
+ "Blackwell (sm100, MIG 45GB)": 0,
2473
+ "Blackwell (sm100, MIG 23GB)": 0,
2465
2474
  "Blackwell (sm120)": 0,
2466
2475
  "Hopper (sm90)": 1,
2467
2476
  "Hopper (sm90, MIG 40GB)": 1,
@@ -2609,6 +2618,9 @@ def _show_availability_watch(interval: int) -> None:
2609
2618
  "h100-mig-1g": "Hopper (sm90, MIG 10GB)",
2610
2619
  "h100-mig-2g": "Hopper (sm90, MIG 20GB)",
2611
2620
  "h100-mig-3g": "Hopper (sm90, MIG 40GB)",
2621
+ "b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
2622
+ "b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
2623
+ "b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
2612
2624
  "t4": "Turing (sm75)",
2613
2625
  "cpu-x86": "CPU (x86_64)",
2614
2626
  "cpu-arm": "CPU (arm64)",
@@ -2617,6 +2629,9 @@ def _show_availability_watch(interval: int) -> None:
2617
2629
  # Sort order: newest GPU architectures first, then CPUs at the bottom
2618
2630
  arch_priority = {
2619
2631
  "Blackwell (sm100)": 0,
2632
+ "Blackwell (sm100, MIG 90GB)": 0,
2633
+ "Blackwell (sm100, MIG 45GB)": 0,
2634
+ "Blackwell (sm100, MIG 23GB)": 0,
2620
2635
  "Blackwell (sm120)": 0,
2621
2636
  "Hopper (sm90)": 1,
2622
2637
  "Hopper (sm90, MIG 40GB)": 1,
@@ -64,17 +64,25 @@ def select_gpu_type_interactive(
64
64
  if "-mig-" not in gt
65
65
  }
66
66
 
67
- # Aggregate MIG slice availability so we can hint it on the h100 row of this picker.
68
- mig_total_available = sum(
69
- int(info.get("available", 0))
70
- for gt, info in (availability_info or {}).items()
71
- if gt.startswith("h100-mig-")
72
- )
73
- mig_total_capacity = sum(
74
- int(info.get("total", 0))
75
- for gt, info in (availability_info or {}).items()
76
- if gt.startswith("h100-mig-")
77
- )
67
+ # Aggregate MIG slice availability per parent type, hinted on the h100/b200 rows.
68
+ def _mig_aggregates(parent: str):
69
+ avail = sum(
70
+ int(info.get("available", 0))
71
+ for gt, info in (availability_info or {}).items()
72
+ if gt.startswith(f"{parent}-mig-")
73
+ )
74
+ cap = sum(
75
+ int(info.get("total", 0))
76
+ for gt, info in (availability_info or {}).items()
77
+ if gt.startswith(f"{parent}-mig-")
78
+ )
79
+ return avail, cap
80
+
81
+ h100_mig_avail, h100_mig_capacity = _mig_aggregates("h100")
82
+ b200_mig_avail, b200_mig_capacity = _mig_aggregates("b200")
83
+ # Backwards-compat aliases for the existing h100 row code below.
84
+ mig_total_available = h100_mig_avail
85
+ mig_total_capacity = h100_mig_capacity
78
86
 
79
87
  # Display availability table first
80
88
  console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
@@ -146,6 +154,8 @@ def select_gpu_type_interactive(
146
154
  choice_label += f" - {queue_length} in queue"
147
155
  if gpu_type == "h100" and mig_total_capacity > 0:
148
156
  choice_label += f" — also {mig_total_available}/{mig_total_capacity} MIG slices"
157
+ elif gpu_type == "b200" and b200_mig_capacity > 0:
158
+ choice_label += f" — also {b200_mig_avail}/{b200_mig_capacity} MIG slices"
149
159
 
150
160
  choices.append(questionary.Choice(title=choice_label, value=gpu_type))
151
161
 
@@ -223,27 +233,31 @@ def select_gpu_count_interactive(
223
233
  parent_size_etas = parent_info.get("size_etas", {}) or {}
224
234
  _now_ts = int(_time.time())
225
235
 
226
- # MIG slice submenu: only for h100. Each tuple is (target_gpu_type, gpu_count, gb_label).
236
+ # MIG slice submenu: h100 (16+8+8 slices/node) or b200 (4+2+2 slices/node).
227
237
  mig_options = []
228
- if gpu_type == "h100":
229
- # Map to internal SKUs; the count menu surfaces 1/2/4 of each slice size.
230
- mig_specs = [
231
- ("h100-mig-1g", "10GB"),
232
- ("h100-mig-2g", "20GB"),
233
- ("h100-mig-3g", "40GB"),
234
- ]
235
- for sku, gb in mig_specs:
236
- slice_max = {"h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8}[sku]
237
- free = None
238
- if availability_info and sku in availability_info:
239
- free = availability_info[sku].get("available", 0)
240
- for n in [1, 2, 4]:
241
- if n > slice_max:
242
- continue
243
- noun = "slice" if n == 1 else "slices"
244
- avail_suffix = f" [{free} free]" if free is not None else ""
245
- label = f"{n} × {gb} {noun}{avail_suffix}"
246
- mig_options.append((sku, n, label))
238
+ mig_spec_map = {
239
+ "h100": [
240
+ ("h100-mig-1g", "10GB", 16),
241
+ ("h100-mig-2g", "20GB", 8),
242
+ ("h100-mig-3g", "40GB", 8),
243
+ ],
244
+ "b200": [
245
+ ("b200-mig-1g", "23GB", 4),
246
+ ("b200-mig-2g", "45GB", 2),
247
+ ("b200-mig-3g", "90GB", 2),
248
+ ],
249
+ }
250
+ for sku, gb, slice_max in mig_spec_map.get(gpu_type, []):
251
+ free = None
252
+ if availability_info and sku in availability_info:
253
+ free = availability_info[sku].get("available", 0)
254
+ for n in [1, 2, 4]:
255
+ if n > slice_max:
256
+ continue
257
+ noun = "slice" if n == 1 else "slices"
258
+ avail_suffix = f" [{free} free]" if free is not None else ""
259
+ label = f"{n} × {gb} {noun}{avail_suffix}"
260
+ mig_options.append((sku, n, label))
247
261
 
248
262
  # Filter single-node by actual max for this GPU type
249
263
  valid_counts = [count for count in valid_counts if count <= max_gpus]
@@ -543,6 +543,9 @@ class ReservationManager:
543
543
  "h100-mig-1g": {"max_gpus": 16},
544
544
  "h100-mig-2g": {"max_gpus": 8},
545
545
  "h100-mig-3g": {"max_gpus": 8},
546
+ "b200-mig-1g": {"max_gpus": 4},
547
+ "b200-mig-2g": {"max_gpus": 2},
548
+ "b200-mig-3g": {"max_gpus": 2},
546
549
  "h200": {"max_gpus": 8},
547
550
  "b200": {"max_gpus": 8},
548
551
  }
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.11"
7
+ version = "0.5.13"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -305,6 +305,14 @@ resource "helm_release" "nvidia_gpu_operator" {
305
305
  value = "all-disabled"
306
306
  }
307
307
 
308
+ # Read profiles from our forked ConfigMap (managed in mig-config.tf) instead of the
309
+ # operator's auto-created default-mig-parted-config. Lets us add custom mixed profiles
310
+ # like b200-6full-2mig-balanced without ClusterPolicy reconciliation reverting our edits.
311
+ set {
312
+ name = "migManager.config.name"
313
+ value = "gpu-dev-mig-parted-config"
314
+ }
315
+
308
316
  set {
309
317
  name = "nodeStatusExporter.enabled"
310
318
  value = "true"
@@ -71,6 +71,10 @@ GPU_CONFIG = {
71
71
  "h100-mig-1g": {"instance_type": "p5.48xlarge", "max_gpus": 16, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.10gb", "node_gpu_type": "h100"},
72
72
  "h100-mig-2g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.20gb", "node_gpu_type": "h100"},
73
73
  "h100-mig-3g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.40gb", "node_gpu_type": "h100"},
74
+ # B200 MIG slices on the b200-6full-2mig-balanced node (6 full GPUs + 2 partitioned per node).
75
+ "b200-mig-1g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 4, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.23gb", "node_gpu_type": "b200"},
76
+ "b200-mig-2g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.45gb", "node_gpu_type": "b200"},
77
+ "b200-mig-3g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.90gb", "node_gpu_type": "b200"},
74
78
  "t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
75
79
  "g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
76
80
  "a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
@@ -2167,7 +2171,8 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
2167
2171
  # Validate GPU type
2168
2172
  valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
2169
2173
  "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
2170
- "h200", "b200", "cpu-arm", "cpu-x86"]
2174
+ "h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
2175
+ "cpu-arm", "cpu-x86"]
2171
2176
  if gpu_type not in valid_gpu_types:
2172
2177
  error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
2173
2178
  logger.error(error_msg)
@@ -2408,6 +2413,9 @@ def update_gpu_availability_table(
2408
2413
  "h100-mig-1g": {"gpus_per_instance": 16},
2409
2414
  "h100-mig-2g": {"gpus_per_instance": 8},
2410
2415
  "h100-mig-3g": {"gpus_per_instance": 8},
2416
+ "b200-mig-1g": {"gpus_per_instance": 4},
2417
+ "b200-mig-2g": {"gpus_per_instance": 2},
2418
+ "b200-mig-3g": {"gpus_per_instance": 2},
2411
2419
  "h200": {"gpus_per_instance": 8},
2412
2420
  "b200": {"gpus_per_instance": 8},
2413
2421
  }
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.11"
183
+ LAMBDA_VERSION = "0.5.13"
184
184
  MIN_CLI_VERSION = "0.5.9"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
@@ -255,6 +255,46 @@ locals {
255
255
  k8s_resource = "nvidia.com/mig-3g.40gb"
256
256
  node_gpu_type = "h100"
257
257
  }
258
+ # B200 MIG slices — virtual SKUs backed by ONE B200 node labelled with the custom
259
+ # mig_profile "b200-6full-2mig-balanced": GPUs 0-5 stay as full B200 (still reservable
260
+ # via --gpu-type b200), GPUs 6-7 get partitioned per-GPU into 2x1g.23gb + 1x2g.45gb +
261
+ # 1x3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large slices.
262
+ "b200-mig-1g" = {
263
+ instance_type = null
264
+ instance_types = null
265
+ instance_count = 0
266
+ gpus_per_instance = 4 # 2 partitioned GPUs * 2 slices each
267
+ use_placement_group = false
268
+ architecture = "x86_64"
269
+ efa_network_cards = 0
270
+ virtual = true
271
+ k8s_resource = "nvidia.com/mig-1g.23gb"
272
+ node_gpu_type = "b200"
273
+ }
274
+ "b200-mig-2g" = {
275
+ instance_type = null
276
+ instance_types = null
277
+ instance_count = 0
278
+ gpus_per_instance = 2 # 2 partitioned GPUs * 1 slice each
279
+ use_placement_group = false
280
+ architecture = "x86_64"
281
+ efa_network_cards = 0
282
+ virtual = true
283
+ k8s_resource = "nvidia.com/mig-2g.45gb"
284
+ node_gpu_type = "b200"
285
+ }
286
+ "b200-mig-3g" = {
287
+ instance_type = null
288
+ instance_types = null
289
+ instance_count = 0
290
+ gpus_per_instance = 2 # 2 partitioned GPUs * 1 slice each
291
+ use_placement_group = false
292
+ architecture = "x86_64"
293
+ efa_network_cards = 0
294
+ virtual = true
295
+ k8s_resource = "nvidia.com/mig-3g.90gb"
296
+ node_gpu_type = "b200"
297
+ }
258
298
  "cpu-arm" = {
259
299
  instance_type = "c7g.8xlarge"
260
300
  instance_types = null
@@ -0,0 +1,55 @@
1
+ # mig-config.tf — fork the NVIDIA mig-parted-config ConfigMap so we can add custom profiles
2
+ # without fighting NVIDIA ClusterPolicy's reconciliation of its default-mig-parted-config.
3
+ #
4
+ # The vendored mig-parted-config.yaml in this directory mirrors the upstream profiles plus our
5
+ # additions (e.g. b200-6full-2mig-balanced). Helm is told to use this ConfigMap by name via
6
+ # migManager.config.name in kubernetes.tf, so the GPU operator skips creating its default and
7
+ # reads ours instead.
8
+
9
+ resource "kubernetes_config_map" "gpu_dev_mig_parted_config" {
10
+ metadata {
11
+ name = "gpu-dev-mig-parted-config"
12
+ namespace = "gpu-operator"
13
+ labels = {
14
+ "app.kubernetes.io/managed-by" = "terraform"
15
+ "app.kubernetes.io/part-of" = "gpu-dev-servers"
16
+ }
17
+ }
18
+
19
+ data = {
20
+ "config.yaml" = file("${path.module}/mig-parted-config.yaml")
21
+ }
22
+
23
+ # The gpu-operator namespace is created by the helm release; depend on that so this ConfigMap
24
+ # lands AFTER the namespace exists.
25
+ depends_on = [helm_release.nvidia_gpu_operator]
26
+ }
27
+
28
+ # Optional declarative B200 MIG node label. Set b200_mig_node_name in tfvars (or override the
29
+ # variable's default below) to dedicate a specific B200 node to the mixed profile. Empty string
30
+ # means "no node currently labelled" — the existing all-disabled stays in effect.
31
+ variable "b200_mig_node_name" {
32
+ description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to skip."
33
+ type = string
34
+ default = ""
35
+ }
36
+
37
+ resource "kubernetes_labels" "b200_mig_node" {
38
+ count = var.b200_mig_node_name == "" ? 0 : 1
39
+
40
+ api_version = "v1"
41
+ kind = "Node"
42
+
43
+ metadata {
44
+ name = var.b200_mig_node_name
45
+ }
46
+
47
+ labels = {
48
+ "nvidia.com/mig.config" = "b200-6full-2mig-balanced"
49
+ }
50
+
51
+ # Take ownership of the label even if another tool (kubectl, gpu-operator) set it.
52
+ force = true
53
+
54
+ depends_on = [kubernetes_config_map.gpu_dev_mig_parted_config]
55
+ }