gpu-dev 0.5.1__tar.gz → 0.5.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +27 -5
  4. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +59 -5
  5. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +3 -0
  6. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/pyproject.toml +1 -1
  7. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/availability.tf +1 -1
  8. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/eks.tf +10 -4
  9. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/availability_updater/index.py +30 -12
  10. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_processor/index.py +47 -20
  11. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda.tf +2 -2
  12. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/main.tf +44 -1
  13. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/templates/al2023-user-data.sh +1 -1
  14. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/.github/workflows/no-gitlinks.yml +0 -0
  15. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/.github/workflows/publish.yml +0 -0
  16. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/.gitignore +0 -0
  17. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/CLAUDE.md +0 -0
  18. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/PROGRESS.md +0 -0
  19. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/PR_DESCRIPTION.md +0 -0
  20. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/TODO.md +0 -0
  21. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/admin/README.md +0 -0
  22. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/admin/generate_stats.py +0 -0
  23. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/admin/requirements.txt +0 -0
  24. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/README.md +0 -0
  25. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  26. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
  27. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  28. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  29. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  30. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  31. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  32. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  33. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  34. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  35. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  36. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  37. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  38. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  39. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/docs/USER_GUIDE.md +0 -0
  40. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/docs/devgpu-features.html +0 -0
  41. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/docs/docker-mark-blue.svg +0 -0
  42. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/docs/icons8-cursor-ai.svg +0 -0
  43. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/post.md +0 -0
  44. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/setup.cfg +0 -0
  45. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  46. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  47. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/README.md +0 -0
  48. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/alb.tf +0 -0
  49. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/backend.tf +0 -0
  50. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  51. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  52. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  53. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/bash_profile +0 -0
  54. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/bashrc +0 -0
  55. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  56. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  57. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  58. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  59. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/motd_script +0 -0
  60. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  61. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/profile +0 -0
  62. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  63. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  64. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  65. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/shell_env +0 -0
  66. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/ssh_config +0 -0
  67. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/zprofile +0 -0
  68. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/zshrc +0 -0
  69. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  70. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker-build.tf +0 -0
  71. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  72. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  73. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ecr.tf +0 -0
  74. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/efs.tf +0 -0
  75. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/expiry.tf +0 -0
  76. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/git-cache.tf +0 -0
  77. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/kubernetes.tf +0 -0
  78. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  79. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  80. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  81. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  82. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  83. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  84. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  85. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  86. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  87. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  88. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  89. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  90. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  91. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  92. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  93. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  94. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  95. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  96. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/monitoring.tf +0 -0
  97. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/outputs.tf +0 -0
  98. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/pyproject.toml +0 -0
  99. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/queue.tf +0 -0
  100. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/route53.tf +0 -0
  101. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  102. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  103. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  104. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  105. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  106. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  107. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  108. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  109. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  110. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  111. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/switch-to.sh +0 -0
  112. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  113. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  114. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  115. {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.1
3
+ Version: 0.5.3
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -498,9 +498,9 @@ def main(ctx: click.Context) -> None:
498
498
  "--gpu-type",
499
499
  "-t",
500
500
  type=click.Choice(
501
- ["b200", "h200", "h100", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
501
+ ["b200", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
502
502
  ),
503
- help="GPU type to reserve (b200/h200/h100/a100/rtxpro6000/a10g/t4/l4/t4-small/cpu-arm/cpu-x86)",
503
+ help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices (partial GPU on a single shared node): h100-mig-1g (10 GB / 1/7 H100 compute), h100-mig-2g (20 GB / 2/7 H100), h100-mig-3g (40 GB / 3/7 H100). CPU only: cpu-arm, cpu-x86.",
504
504
  )
505
505
  @click.option(
506
506
  "--hours",
@@ -656,6 +656,9 @@ def reserve(
656
656
  "t4-small": {"max_gpus": 1, "instance_type": "g4dn.xlarge"},
657
657
  "a100": {"max_gpus": 8, "instance_type": "p4d.24xlarge"},
658
658
  "h100": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
659
+ "h100-mig-1g": {"max_gpus": 16, "instance_type": "p5.48xlarge"},
660
+ "h100-mig-2g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
661
+ "h100-mig-3g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
659
662
  "h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
660
663
  "b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
661
664
  "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
@@ -724,11 +727,18 @@ def reserve(
724
727
  return
725
728
 
726
729
  max_gpus = gpu_configs[gpu_type_lower]["max_gpus"]
727
- gpu_count = select_gpu_count_interactive(
728
- gpu_type_lower, max_gpus)
729
- if gpu_count is None:
730
+ result = select_gpu_count_interactive(
731
+ gpu_type_lower, max_gpus, availability_info=availability_info)
732
+ if result is None:
730
733
  rprint("[yellow]Reservation cancelled.[/yellow]")
731
734
  return
735
+ # If user picked a MIG slice, the function returns (gpu_type, count).
736
+ if isinstance(result, tuple):
737
+ gpu_type, gpu_count = result
738
+ gpu_type_lower = gpu_type.lower()
739
+ max_gpus = gpu_configs[gpu_type_lower]["max_gpus"]
740
+ else:
741
+ gpu_count = result
732
742
 
733
743
  # Show distributed warning for interactive multinode selections (always show)
734
744
  if gpu_count > max_gpus:
@@ -2399,6 +2409,9 @@ def _show_availability() -> None:
2399
2409
  "a10g": "Ampere (sm80)",
2400
2410
  "l4": "Ada Lovelace (sm89)",
2401
2411
  "rtxpro6000": "Blackwell (sm120)",
2412
+ "h100-mig-1g": "Hopper (sm90, MIG 10GB)",
2413
+ "h100-mig-2g": "Hopper (sm90, MIG 20GB)",
2414
+ "h100-mig-3g": "Hopper (sm90, MIG 40GB)",
2402
2415
  "t4": "Turing (sm75)",
2403
2416
  "cpu-x86": "CPU (x86_64)",
2404
2417
  "cpu-arm": "CPU (arm64)",
@@ -2409,6 +2422,9 @@ def _show_availability() -> None:
2409
2422
  "Blackwell (sm100)": 0,
2410
2423
  "Blackwell (sm120)": 0,
2411
2424
  "Hopper (sm90)": 1,
2425
+ "Hopper (sm90, MIG 40GB)": 1,
2426
+ "Hopper (sm90, MIG 20GB)": 1,
2427
+ "Hopper (sm90, MIG 10GB)": 1,
2412
2428
  "Ada Lovelace (sm89)": 2,
2413
2429
  "Ampere (sm80)": 3,
2414
2430
  "Turing (sm75)": 4,
@@ -2548,6 +2564,9 @@ def _show_availability_watch(interval: int) -> None:
2548
2564
  "a10g": "Ampere (sm80)",
2549
2565
  "l4": "Ada Lovelace (sm89)",
2550
2566
  "rtxpro6000": "Blackwell (sm120)",
2567
+ "h100-mig-1g": "Hopper (sm90, MIG 10GB)",
2568
+ "h100-mig-2g": "Hopper (sm90, MIG 20GB)",
2569
+ "h100-mig-3g": "Hopper (sm90, MIG 40GB)",
2551
2570
  "t4": "Turing (sm75)",
2552
2571
  "cpu-x86": "CPU (x86_64)",
2553
2572
  "cpu-arm": "CPU (arm64)",
@@ -2558,6 +2577,9 @@ def _show_availability_watch(interval: int) -> None:
2558
2577
  "Blackwell (sm100)": 0,
2559
2578
  "Blackwell (sm120)": 0,
2560
2579
  "Hopper (sm90)": 1,
2580
+ "Hopper (sm90, MIG 40GB)": 1,
2581
+ "Hopper (sm90, MIG 20GB)": 1,
2582
+ "Hopper (sm90, MIG 10GB)": 1,
2561
2583
  "Ada Lovelace (sm89)": 2,
2562
2584
  "Ampere (sm80)": 3,
2563
2585
  "Turing (sm75)": 4,
@@ -57,6 +57,13 @@ def select_gpu_type_interactive(
57
57
  if not check_interactive_support():
58
58
  return None
59
59
 
60
+ # Hide MIG slice SKUs from the top-level selector — reached via the h100 submenu.
61
+ # Direct `--gpu-type h100-mig-1g` still works for non-interactive scripts.
62
+ visible_info = {
63
+ gt: info for gt, info in availability_info.items()
64
+ if "-mig-" not in gt
65
+ }
66
+
60
67
  # Display availability table first
61
68
  console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
62
69
  table = Table()
@@ -67,7 +74,7 @@ def select_gpu_type_interactive(
67
74
  table.add_column("Est. Wait Time", style="magenta")
68
75
 
69
76
  choices = []
70
- for gpu_type, info in availability_info.items():
77
+ for gpu_type, info in visible_info.items():
71
78
  available = info.get("available", 0)
72
79
  total = info.get("total", 0)
73
80
  queue_length = info.get("queue_length", 0)
@@ -143,8 +150,16 @@ def select_gpu_type_interactive(
143
150
  return None
144
151
 
145
152
 
146
- def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
147
- """Interactive GPU count selection"""
153
+ def select_gpu_count_interactive(
154
+ gpu_type: str,
155
+ max_gpus: int,
156
+ availability_info: Optional[Dict[str, Dict[str, Any]]] = None,
157
+ ):
158
+ """Interactive GPU count selection.
159
+
160
+ Returns int (gpu_count) for normal selections, or a (effective_gpu_type, gpu_count)
161
+ tuple when the user picks a MIG slice option from the h100 submenu.
162
+ """
148
163
  if not check_interactive_support():
149
164
  return None
150
165
 
@@ -157,6 +172,12 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
157
172
  valid_counts = [1, 2, 4]
158
173
  # Add multinode options
159
174
  multinode_counts = [8, 12, 16, 20, 24] # multiples of 4
175
+ elif gpu_type == "h100-mig-1g":
176
+ valid_counts = [1, 2, 4, 8]
177
+ multinode_counts = [] # MIG slices live on a single node — no multinode
178
+ elif gpu_type in ["h100-mig-2g", "h100-mig-3g"]:
179
+ valid_counts = [1, 2, 4]
180
+ multinode_counts = []
160
181
  elif gpu_type == "g5g":
161
182
  valid_counts = [1, 2]
162
183
  multinode_counts = [4, 8] # multiples of 4
@@ -168,6 +189,28 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
168
189
  # Add multinode options
169
190
  multinode_counts = [16, 24, 32, 40, 48] # multiples of 8
170
191
 
192
+ # MIG slice submenu: only for h100. Each tuple is (target_gpu_type, gpu_count, gb_label).
193
+ mig_options = []
194
+ if gpu_type == "h100":
195
+ # Map to internal SKUs; the count menu surfaces 1/2/4 of each slice size.
196
+ mig_specs = [
197
+ ("h100-mig-1g", "10GB"),
198
+ ("h100-mig-2g", "20GB"),
199
+ ("h100-mig-3g", "40GB"),
200
+ ]
201
+ for sku, gb in mig_specs:
202
+ slice_max = {"h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8}[sku]
203
+ free = None
204
+ if availability_info and sku in availability_info:
205
+ free = availability_info[sku].get("available", 0)
206
+ for n in [1, 2, 4]:
207
+ if n > slice_max:
208
+ continue
209
+ noun = "slice" if n == 1 else "slices"
210
+ avail_suffix = f" [{free} free]" if free is not None else ""
211
+ label = f"{n} × {gb} {noun}{avail_suffix}"
212
+ mig_options.append((sku, n, label))
213
+
171
214
  # Filter single-node by actual max for this GPU type
172
215
  valid_counts = [count for count in valid_counts if count <= max_gpus]
173
216
 
@@ -177,7 +220,18 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
177
220
 
178
221
  choices = []
179
222
 
180
- # Add single-node options
223
+ # MIG slice options come first (smallest unit), h100-only.
224
+ if mig_options:
225
+ choices.append(questionary.Separator(
226
+ "--- MIG slices (partial GPU, single node) ---"))
227
+ for sku, count, label in mig_options:
228
+ choices.append(questionary.Choice(title=label, value=(sku, count)))
229
+
230
+ # Full single-node options. Header only when slices were rendered above
231
+ # (otherwise the type already implies "Full GPUs").
232
+ if mig_options:
233
+ choices.append(questionary.Separator(
234
+ "--- Full GPUs (single node) ---"))
181
235
  for count in valid_counts:
182
236
  if count == 1:
183
237
  label = f"1 GPU (single node)"
@@ -185,7 +239,7 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
185
239
  label = f"{count} GPUs (single node)"
186
240
  choices.append(questionary.Choice(title=label, value=count))
187
241
 
188
- # Add separator and multinode options
242
+ # Multinode at the bottom.
189
243
  if multinode_counts:
190
244
  choices.append(questionary.Separator(
191
245
  "--- Multinode (Distributed) ---"))
@@ -540,6 +540,9 @@ class ReservationManager:
540
540
  "g5g": {"max_gpus": 2},
541
541
  "a100": {"max_gpus": 8},
542
542
  "h100": {"max_gpus": 8},
543
+ "h100-mig-1g": {"max_gpus": 16},
544
+ "h100-mig-2g": {"max_gpus": 8},
545
+ "h100-mig-3g": {"max_gpus": 8},
543
546
  "h200": {"max_gpus": 8},
544
547
  "b200": {"max_gpus": 8},
545
548
  }
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.1"
7
+ version = "0.5.3"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -142,7 +142,7 @@ resource "aws_cloudwatch_event_rule" "asg_capacity_change" {
142
142
  "EC2 Instance Terminate Successful"
143
143
  ]
144
144
  detail = {
145
- AutoScalingGroupName = [for gpu_type in keys(local.current_config.supported_gpu_types) : "${var.prefix}-gpu-nodes-${gpu_type}"]
145
+ AutoScalingGroupName = [for gpu_type, cfg in local.current_config.supported_gpu_types : "${var.prefix}-gpu-nodes-${gpu_type}" if !try(cfg.virtual, false)]
146
146
  }
147
147
  })
148
148
 
@@ -198,7 +198,7 @@ locals {
198
198
  # Flatten capacity reservations to create multiple ASGs when needed
199
199
  # Each CR entry must have a stable 'key' field so removing entries doesn't shift other ASG keys.
200
200
  gpu_capacity_reservations = flatten([
201
- for gpu_type, gpu_config in local.current_config.supported_gpu_types : [
201
+ for gpu_type, gpu_config in local.current_config.supported_gpu_types : try(gpu_config.virtual, false) ? [] : [
202
202
  for cr_index, cr_config in try(local.capacity_reservations[terraform.workspace][gpu_type], [null]) : {
203
203
  gpu_type = gpu_type
204
204
  gpu_config = gpu_config
@@ -212,8 +212,13 @@ locals {
212
212
  ? lookup(local.capacity_reservation_azs[terraform.workspace], cr_config.id, local.gpu_subnet_assignments[terraform.workspace][gpu_type])
213
213
  : local.gpu_subnet_assignments[terraform.workspace][gpu_type]
214
214
  )
215
+ # Per-CR override for efa_network_cards (e.g. p5en.48xlarge caps at 16 vs p5e at 32)
216
+ efa_network_cards = cr_config != null ? try(cr_config.efa_network_cards, gpu_config.efa_network_cards) : gpu_config.efa_network_cards
217
+ # Optional MIG profile (e.g. "all-balanced", "all-1g.10gb"). When set, user-data labels the node so nvidia-mig-manager partitions the GPUs.
218
+ # Default to "" (not null) — null breaks templatefile() string interpolation downstream.
219
+ mig_profile = cr_config != null ? try(cr_config.mig_profile, "") : ""
215
220
  # Multi-EFA instances (>1 network card) must use private subnets (no public IP in launch template)
216
- use_private_subnet = try(gpu_config.efa_network_cards, 0) > 1
221
+ use_private_subnet = (cr_config != null ? try(cr_config.efa_network_cards, try(gpu_config.efa_network_cards, 0)) : try(gpu_config.efa_network_cards, 0)) > 1
217
222
  }
218
223
  ]
219
224
  ])
@@ -363,7 +368,7 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
363
368
  associate_public_ip_address = true
364
369
  security_groups = [aws_security_group.gpu_dev_sg.id]
365
370
  subnet_id = each.value.gpu_config.use_placement_group ? null : local.public_subnet_map[each.value.subnet_az]
366
- interface_type = try(each.value.gpu_config.efa_network_cards, 0) > 0 ? "efa" : "interface"
371
+ interface_type = try(each.value.efa_network_cards, 0) > 0 ? "efa" : "interface"
367
372
  delete_on_termination = true
368
373
  }
369
374
  }
@@ -386,7 +391,7 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
386
391
  # Each network card supports 2 device indices (0 and 1); device_index must be 0
387
392
  # since this is the only interface on each card
388
393
  dynamic "network_interfaces" {
389
- for_each = each.value.use_private_subnet ? range(1, try(each.value.gpu_config.efa_network_cards, 1)) : []
394
+ for_each = each.value.use_private_subnet ? range(1, try(each.value.efa_network_cards, 1)) : []
390
395
  content {
391
396
  device_index = 0
392
397
  interface_type = "efa-only"
@@ -423,6 +428,7 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
423
428
  region = local.current_config.aws_region
424
429
  gpu_type = local.gpu_type_kubernetes_labels[each.value.gpu_type]
425
430
  profiling_dedicated = try(each.value.gpu_config.profiling_dedicated, false)
431
+ mig_profile = each.value.mig_profile != null ? each.value.mig_profile : ""
426
432
  container_image = local.latest_image_uri
427
433
  }))
428
434
 
@@ -23,6 +23,13 @@ AVAILABILITY_TABLE = os.environ["AVAILABILITY_TABLE"]
23
23
  SUPPORTED_GPU_TYPES = json.loads(os.environ["SUPPORTED_GPU_TYPES"])
24
24
 
25
25
 
26
+ def get_gpu_resource_name(gpu_type: str) -> str:
27
+ return SUPPORTED_GPU_TYPES.get(gpu_type, {}).get("k8s_resource", "nvidia.com/gpu")
28
+
29
+ def get_node_label_value(gpu_type: str) -> str:
30
+ return SUPPORTED_GPU_TYPES.get(gpu_type, {}).get("node_gpu_type", gpu_type)
31
+
32
+
26
33
  def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
27
34
  """Handle ASG capacity change events - update all GPU types"""
28
35
  try:
@@ -84,7 +91,9 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
84
91
  logger.info(f"Starting availability update for GPU type: {gpu_type}")
85
92
 
86
93
  # Get current ASG capacity - handle multiple ASGs per GPU type (e.g., capacity reservations)
87
- asg_name_prefix = f"pytorch-gpu-dev-gpu-nodes-{gpu_type}"
94
+ # MIG SKUs share the underlying h100 ASGs (cr-dedicated MIG node), so use the physical type for ASG matching
95
+ asg_lookup_type = get_node_label_value(gpu_type)
96
+ asg_name_prefix = f"pytorch-gpu-dev-gpu-nodes-{asg_lookup_type}"
88
97
  logger.info(f"Checking ASGs matching pattern: {asg_name_prefix}*")
89
98
 
90
99
  # Get all ASGs and filter by name pattern
@@ -102,6 +111,9 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
102
111
  logger.info(f"Found {len(matching_asgs)} ASGs: {asg_names}")
103
112
 
104
113
  # Calculate total availability metrics across all matching ASGs
114
+ # For MIG SKUs we cannot tell from ASG alone which instances are MIG-partitioned;
115
+ # we override running_instances later from k8s allocatable.
116
+ is_mig_sku = "k8s_resource" in SUPPORTED_GPU_TYPES.get(gpu_type, {})
105
117
  desired_capacity = sum(asg["DesiredCapacity"] for asg in matching_asgs)
106
118
  running_instances = sum(
107
119
  len([
@@ -130,7 +142,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
130
142
  logger.info(f"Checking CPU node availability for {gpu_type}")
131
143
  # Count available slots by checking pod count on each node
132
144
  v1 = client.CoreV1Api(k8s_client)
133
- nodes = v1.list_node(label_selector=f"GpuType={gpu_type}")
145
+ nodes = v1.list_node(label_selector=f"GpuType={get_node_label_value(gpu_type)}")
134
146
 
135
147
  total_available_slots = 0
136
148
  for node in nodes.items:
@@ -178,16 +190,18 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
178
190
  try:
179
191
  from kubernetes import client as k8s_client_lib
180
192
  v1 = k8s_client_lib.CoreV1Api(k8s_client)
181
- nodes = v1.list_node(label_selector=f"GpuType={gpu_type}")
193
+ node_label_value = get_node_label_value(gpu_type)
194
+ resource_name = get_gpu_resource_name(gpu_type)
195
+ nodes = v1.list_node(label_selector=f"GpuType={node_label_value}")
182
196
 
183
197
  single_node_max = 0 # Max available on any single node
184
198
  schedulable_total_gpus = 0 # Total GPUs on schedulable (non-cordoned) nodes
185
199
  for node in nodes.items:
186
200
  if is_node_ready_and_schedulable(node):
187
- available_on_node = get_available_gpus_on_node(v1, node)
201
+ available_on_node = get_available_gpus_on_node(v1, node, gpu_type)
188
202
  total_on_node = 0
189
203
  if node.status.allocatable:
190
- gpu_allocatable = node.status.allocatable.get("nvidia.com/gpu", "0")
204
+ gpu_allocatable = node.status.allocatable.get(resource_name, "0")
191
205
  try:
192
206
  total_on_node = int(gpu_allocatable)
193
207
  except (ValueError, TypeError):
@@ -203,6 +217,9 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
203
217
  full_nodes_available += 1
204
218
 
205
219
  total_gpus = schedulable_total_gpus
220
+ # For MIG SKUs override running_instances to the number of MIG-partitioned nodes
221
+ if is_mig_sku:
222
+ running_instances = sum(1 for n in nodes.items if is_node_ready_and_schedulable(n) and int((n.status.allocatable or {}).get(resource_name, "0")) > 0)
206
223
 
207
224
  # Calculate max reservable considering multinode scenarios
208
225
  # Only high-end GPU types support multinode (up to 4 nodes = 32 GPUs)
@@ -276,7 +293,7 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
276
293
  logger.info(f"Created CoreV1Api client for {gpu_type}")
277
294
 
278
295
  # Get all nodes with the specified GPU type
279
- gpu_type_selector = f"GpuType={gpu_type}"
296
+ gpu_type_selector = f"GpuType={get_node_label_value(gpu_type)}"
280
297
  logger.info(f"Querying nodes with label selector: {gpu_type_selector}")
281
298
 
282
299
  nodes = v1.list_node(label_selector=gpu_type_selector)
@@ -297,7 +314,7 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
297
314
 
298
315
  logger.info(f"Node {node.metadata.name} is ready, checking GPU availability")
299
316
  # Get available GPUs on this node
300
- available_on_node = get_available_gpus_on_node(v1, node)
317
+ available_on_node = get_available_gpus_on_node(v1, node, gpu_type)
301
318
  total_schedulable += available_on_node
302
319
  logger.info(f"Node {node.metadata.name}: {available_on_node} GPUs available")
303
320
 
@@ -332,11 +349,12 @@ def is_node_ready_and_schedulable(node) -> bool:
332
349
  return False
333
350
 
334
351
 
335
- def get_available_gpus_on_node(v1_api, node) -> int:
336
- """Get number of available GPUs on a specific node"""
352
+ def get_available_gpus_on_node(v1_api, node, gpu_type: str = None) -> int:
353
+ """Get number of available GPUs (or MIG slices) on a specific node for the given SKU."""
337
354
  try:
338
355
  node_name = node.metadata.name
339
- logger.info(f"Checking GPU availability on node: {node_name}")
356
+ resource_name = get_gpu_resource_name(gpu_type) if gpu_type else "nvidia.com/gpu"
357
+ logger.info(f"Checking GPU availability on node: {node_name} (resource={resource_name})")
340
358
 
341
359
  # Get all pods on this node
342
360
  logger.info(f"Querying pods on node {node_name}")
@@ -350,7 +368,7 @@ def get_available_gpus_on_node(v1_api, node) -> int:
350
368
  for container in pod.spec.containers:
351
369
  if container.resources and container.resources.requests:
352
370
  gpu_request = container.resources.requests.get(
353
- "nvidia.com/gpu", "0"
371
+ resource_name, "0"
354
372
  )
355
373
  try:
356
374
  used_gpus += int(gpu_request)
@@ -360,7 +378,7 @@ def get_available_gpus_on_node(v1_api, node) -> int:
360
378
  # Get total GPUs on this node
361
379
  total_gpus = 0
362
380
  if node.status.allocatable:
363
- gpu_allocatable = node.status.allocatable.get("nvidia.com/gpu", "0")
381
+ gpu_allocatable = node.status.allocatable.get(resource_name, "0")
364
382
  try:
365
383
  total_gpus = int(gpu_allocatable)
366
384
  except (ValueError, TypeError):
@@ -67,6 +67,10 @@ GPU_CONFIG = {
67
67
  "l4": {"instance_type": "g6.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
68
68
  "a10g": {"instance_type": "g5.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
69
69
  "rtxpro6000": {"instance_type": "g7e.24xlarge", "max_gpus": 4, "cpus": 96, "memory_gb": 1024, "efa_count": 2},
70
+ # MIG slices on a dedicated H100 node (all-balanced profile: per GPU = 2x1g.10gb + 1x2g.20gb + 1x3g.40gb)
71
+ "h100-mig-1g": {"instance_type": "p5.48xlarge", "max_gpus": 16, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.10gb", "node_gpu_type": "h100"},
72
+ "h100-mig-2g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.20gb", "node_gpu_type": "h100"},
73
+ "h100-mig-3g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.40gb", "node_gpu_type": "h100"},
70
74
  "t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
71
75
  "g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
72
76
  "a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
@@ -78,6 +82,15 @@ GPU_CONFIG = {
78
82
  }
79
83
  GPU_CONFIG_DEFAULT = {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0}
80
84
 
85
+ def get_gpu_resource_name(gpu_type: str) -> str:
86
+ """Kubernetes resource name for this SKU (nvidia.com/gpu or nvidia.com/mig-*)."""
87
+ return GPU_CONFIG.get(gpu_type, GPU_CONFIG_DEFAULT).get("k8s_resource", "nvidia.com/gpu")
88
+
89
+ def get_node_gpu_type(gpu_type: str) -> str:
90
+ """Value of the GpuType node label to select. MIG SKUs map to their underlying physical type."""
91
+ return GPU_CONFIG.get(gpu_type, {}).get("node_gpu_type", gpu_type)
92
+
93
+
81
94
  # GPU types under maintenance - only whitelisted users can reserve
82
95
  # Set to {} to disable maintenance mode for all types
83
96
  GPU_MAINTENANCE = {}
@@ -232,7 +245,8 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
232
245
  # Get all nodes with the requested GPU type
233
246
  logger.info(
234
247
  f"Querying nodes for GPU type {gpu_type} with {gpus_requested} GPUs needed")
235
- nodes = v1.list_node(label_selector=f"GpuType={gpu_type}")
248
+ node_label_value = get_node_gpu_type(gpu_type)
249
+ nodes = v1.list_node(label_selector=f"GpuType={node_label_value}")
236
250
 
237
251
  candidate_nodes = []
238
252
  all_ready_nodes = []
@@ -271,7 +285,7 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
271
285
  continue
272
286
 
273
287
  # Check available GPU capacity on this node
274
- available_gpus = get_available_gpus_on_node(v1, node)
288
+ available_gpus = get_available_gpus_on_node(v1, node, gpu_type)
275
289
 
276
290
  # Track all ready nodes (for fallback AZ when no single node has enough)
277
291
  all_ready_nodes.append({
@@ -2152,7 +2166,8 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
2152
2166
 
2153
2167
  # Validate GPU type
2154
2168
  valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
2155
- "h100", "h200", "b200", "cpu-arm", "cpu-x86"]
2169
+ "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
2170
+ "h200", "b200", "cpu-arm", "cpu-x86"]
2156
2171
  if gpu_type not in valid_gpu_types:
2157
2172
  error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
2158
2173
  logger.error(error_msg)
@@ -2238,10 +2253,11 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
2238
2253
  nodes = v1.list_node()
2239
2254
  schedulable_gpus = 0
2240
2255
 
2256
+ node_label_value = get_node_gpu_type(gpu_type)
2241
2257
  for node in nodes.items:
2242
2258
  # Check if node has the right GPU type label
2243
2259
  node_labels = node.metadata.labels or {}
2244
- if node_labels.get("GpuType") != gpu_type:
2260
+ if node_labels.get("GpuType") != node_label_value:
2245
2261
  continue
2246
2262
 
2247
2263
  # Check if node is ready and schedulable
@@ -2252,7 +2268,7 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
2252
2268
  continue
2253
2269
 
2254
2270
  # Get available GPUs on this node
2255
- node_gpus = get_available_gpus_on_node(v1, node)
2271
+ node_gpus = get_available_gpus_on_node(v1, node, gpu_type)
2256
2272
  schedulable_gpus += node_gpus
2257
2273
  logger.info(
2258
2274
  f"Node {node.metadata.name}: {node_gpus} available {gpu_type.upper()} GPUs"
@@ -2278,13 +2294,14 @@ def check_max_gpus_on_single_node(gpu_type: str) -> int:
2278
2294
  nodes = v1.list_node()
2279
2295
  max_gpus = 0
2280
2296
 
2297
+ node_label_value = get_node_gpu_type(gpu_type)
2281
2298
  for node in nodes.items:
2282
2299
  node_labels = node.metadata.labels or {}
2283
- if node_labels.get("GpuType") != gpu_type:
2300
+ if node_labels.get("GpuType") != node_label_value:
2284
2301
  continue
2285
2302
  if not is_node_ready_and_schedulable(node):
2286
2303
  continue
2287
- node_gpus = get_available_gpus_on_node(v1, node)
2304
+ node_gpus = get_available_gpus_on_node(v1, node, gpu_type)
2288
2305
  max_gpus = max(max_gpus, node_gpus)
2289
2306
 
2290
2307
  return max_gpus
@@ -2320,12 +2337,13 @@ def is_node_ready_and_schedulable(node) -> bool:
2320
2337
  return True
2321
2338
 
2322
2339
 
2323
- def get_available_gpus_on_node(v1_api, node) -> int:
2324
- """Get the number of available GPUs on a specific node"""
2340
+ def get_available_gpus_on_node(v1_api, node, gpu_type: str = None) -> int:
2341
+ """Get the number of available GPUs (or MIG slices) on a specific node for the given SKU."""
2325
2342
  try:
2343
+ resource_name = get_gpu_resource_name(gpu_type) if gpu_type else "nvidia.com/gpu"
2326
2344
  # Get allocatable GPUs from node status
2327
2345
  allocatable = node.status.allocatable or {}
2328
- total_gpus = int(allocatable.get("nvidia.com/gpu", "0"))
2346
+ total_gpus = int(allocatable.get(resource_name, "0"))
2329
2347
 
2330
2348
  if total_gpus == 0:
2331
2349
  return 0
@@ -2342,7 +2360,7 @@ def get_available_gpus_on_node(v1_api, node) -> int:
2342
2360
  for container in pod.spec.containers:
2343
2361
  if container.resources and container.resources.requests:
2344
2362
  gpu_request = container.resources.requests.get(
2345
- "nvidia.com/gpu", "0"
2363
+ resource_name, "0"
2346
2364
  )
2347
2365
  used_gpus += int(gpu_request)
2348
2366
 
@@ -2368,13 +2386,15 @@ def update_gpu_availability_table(
2368
2386
  total_gpus = 0
2369
2387
  running_instances = 0
2370
2388
 
2389
+ node_label_value = get_node_gpu_type(gpu_type)
2390
+ resource_name = get_gpu_resource_name(gpu_type)
2371
2391
  for node in nodes.items:
2372
2392
  node_labels = node.metadata.labels or {}
2373
- if node_labels.get("GpuType") == gpu_type:
2393
+ if node_labels.get("GpuType") == node_label_value:
2374
2394
  running_instances += 1
2375
2395
  # Get allocatable GPUs from node status
2376
2396
  allocatable = node.status.allocatable or {}
2377
- node_gpus = int(allocatable.get("nvidia.com/gpu", "0"))
2397
+ node_gpus = int(allocatable.get(resource_name, "0"))
2378
2398
  total_gpus += node_gpus
2379
2399
 
2380
2400
  # Get GPU configuration for this type (for gpus_per_instance)
@@ -2385,6 +2405,9 @@ def update_gpu_availability_table(
2385
2405
  "rtxpro6000": {"gpus_per_instance": 4},
2386
2406
  "a100": {"gpus_per_instance": 8},
2387
2407
  "h100": {"gpus_per_instance": 8},
2408
+ "h100-mig-1g": {"gpus_per_instance": 16},
2409
+ "h100-mig-2g": {"gpus_per_instance": 8},
2410
+ "h100-mig-3g": {"gpus_per_instance": 8},
2388
2411
  "h200": {"gpus_per_instance": 8},
2389
2412
  "b200": {"gpus_per_instance": 8},
2390
2413
  }
@@ -3697,7 +3720,8 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
3697
3720
  else:
3698
3721
  # GPU instances get proportional CPU/memory based on GPU allocation
3699
3722
  if gpu_count > 0:
3700
- limits["nvidia.com/gpu"] = str(gpu_count)
3723
+ resource_name = config.get("k8s_resource", "nvidia.com/gpu")
3724
+ limits[resource_name] = str(gpu_count)
3701
3725
 
3702
3726
  gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3703
3727
 
@@ -3712,10 +3736,11 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
3712
3736
  "memory": f"{proportional_memory_limit}Gi"
3713
3737
  })
3714
3738
 
3715
- # EFA optimization: Only use EFA for full-node multinode deployments
3739
+ # EFA optimization: Only use EFA for full-node multinode deployments (skip MIG slices)
3716
3740
  use_efa = (
3717
3741
  gpu_type != "t4-small" and
3718
3742
  not gpu_type.startswith("cpu-") and
3743
+ "mig" not in gpu_type and
3719
3744
  is_multinode and
3720
3745
  gpu_count == max_gpus
3721
3746
  )
@@ -3742,7 +3767,8 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
3742
3767
  requests.update({"cpu": "2", "memory": "4Gi"})
3743
3768
  else:
3744
3769
  if gpu_count > 0:
3745
- requests["nvidia.com/gpu"] = str(gpu_count)
3770
+ resource_name = config.get("k8s_resource", "nvidia.com/gpu")
3771
+ requests[resource_name] = str(gpu_count)
3746
3772
  gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
3747
3773
 
3748
3774
  # Calculate proportional requests (reserve 10% for system overhead)
@@ -3756,10 +3782,11 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
3756
3782
  "memory": f"{proportional_memory_request}Gi"
3757
3783
  })
3758
3784
 
3759
- # EFA: Only for full-node multinode deployments
3785
+ # EFA: Only for full-node multinode deployments (skip MIG slices)
3760
3786
  use_efa = (
3761
3787
  gpu_type != "t4-small" and
3762
3788
  not gpu_type.startswith("cpu-") and
3789
+ "mig" not in gpu_type and
3763
3790
  is_multinode and
3764
3791
  gpu_count == max_gpus
3765
3792
  )
@@ -5243,7 +5270,7 @@ EOF
5243
5270
  )
5244
5271
  ] if _pod_uses_efa(gpu_count, gpu_type, is_multinode) else []),
5245
5272
  node_selector={
5246
- "GpuType": gpu_type,
5273
+ "GpuType": get_node_gpu_type(gpu_type),
5247
5274
  **({} if target_az is None else {"topology.kubernetes.io/zone": target_az})
5248
5275
  },
5249
5276
  # Node affinity for profiling-dedicated preference
@@ -6846,7 +6873,7 @@ def update_pod_status_and_events(k8s_client, pod_name: str, reservation_id: str)
6846
6873
  f"Failed to convert to queued: {queue_err}")
6847
6874
 
6848
6875
  # Show user-friendly scheduling messages while waiting
6849
- if "Insufficient nvidia.com/gpu" in event.message:
6876
+ if "Insufficient nvidia.com/" in event.message and "gpu" in event.message.lower():
6850
6877
  # Check if it's a fragmentation issue (GPUs exist but not enough on single node)
6851
6878
  try:
6852
6879
  reservations_table = dynamodb.Table(
@@ -6882,7 +6909,7 @@ def update_pod_status_and_events(k8s_client, pod_name: str, reservation_id: str)
6882
6909
  k8s_client_temp = get_k8s_client()
6883
6910
  v1 = client.CoreV1Api(k8s_client_temp)
6884
6911
  nodes = v1.list_node(
6885
- label_selector=f"GpuType={gpu_type}")
6912
+ label_selector=f"GpuType={get_node_gpu_type(gpu_type)}")
6886
6913
 
6887
6914
  if len(nodes.items) == 0:
6888
6915
  # No nodes exist for this GPU type - fail immediately
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.1"
184
- MIN_CLI_VERSION = "0.5.1"
183
+ LAMBDA_VERSION = "0.5.3"
184
+ MIN_CLI_VERSION = "0.5.2"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
187
187
  }, local.alb_env_vars)
@@ -152,7 +152,7 @@ locals {
152
152
  efa_network_cards = 8 # p6-b200.48xlarge supports max 8 network cards
153
153
  }
154
154
  "h200" = {
155
- instance_type = "p5e.48xlarge" # Match capacity reservation type
155
+ instance_type = "p5en.48xlarge" # Match capacity reservation type
156
156
  instance_types = ["p5e.48xlarge", "p5en.48xlarge"]
157
157
  instance_count = 4 # Fallback default (not used when capacity_reservations defined)
158
158
  gpus_per_instance = 8
@@ -216,6 +216,45 @@ locals {
216
216
  architecture = "x86_64"
217
217
  efa_network_cards = 2
218
218
  }
219
+ # MIG slice SKUs — virtual: do NOT create an ASG. Surfaces the SKU to availability_updater
220
+ # + reservation_processor. Backed by the H100 CR labelled with mig_profile=all-balanced
221
+ # (per GPU = 2x1g.10gb + 1x2g.20gb + 1x3g.40gb).
222
+ "h100-mig-1g" = {
223
+ instance_type = null
224
+ instance_types = null
225
+ instance_count = 0
226
+ gpus_per_instance = 16 # 8 GPUs * 2 slices/GPU
227
+ use_placement_group = false
228
+ architecture = "x86_64"
229
+ efa_network_cards = 0
230
+ virtual = true
231
+ k8s_resource = "nvidia.com/mig-1g.10gb"
232
+ node_gpu_type = "h100"
233
+ }
234
+ "h100-mig-2g" = {
235
+ instance_type = null
236
+ instance_types = null
237
+ instance_count = 0
238
+ gpus_per_instance = 8 # 8 GPUs * 1 slice/GPU
239
+ use_placement_group = false
240
+ architecture = "x86_64"
241
+ efa_network_cards = 0
242
+ virtual = true
243
+ k8s_resource = "nvidia.com/mig-2g.20gb"
244
+ node_gpu_type = "h100"
245
+ }
246
+ "h100-mig-3g" = {
247
+ instance_type = null
248
+ instance_types = null
249
+ instance_count = 0
250
+ gpus_per_instance = 8 # 8 GPUs * 1 slice/GPU
251
+ use_placement_group = false
252
+ architecture = "x86_64"
253
+ efa_network_cards = 0
254
+ virtual = true
255
+ k8s_resource = "nvidia.com/mig-3g.40gb"
256
+ node_gpu_type = "h100"
257
+ }
219
258
  "cpu-arm" = {
220
259
  instance_type = "c7g.8xlarge"
221
260
  instance_types = null
@@ -267,11 +306,13 @@ locals {
267
306
  { key = "cr0", id = "cr-0a3f49b96fe03ca04", instance_count = 4 }, # H100 reservation us-east-2c (p5.48xlarge)
268
307
  { key = "cr1", id = null, instance_count = 2 }, # H100 on-demand (2 instances)
269
308
  { key = "cr2", id = "cr-044bc72b0a6b56062", instance_count = 4 }, # H100 reservation us-east-2a (4 instances)
309
+ { key = "cr3", id = "cr-0211ea1e8d3a3c79e", instance_count = 1, mig_profile = "all-balanced" }, # H100 reservation us-east-2c (1 instance, MIG-dedicated, all-balanced: 2x1g.10gb + 1x2g.20gb + 1x3g.40gb per GPU)
270
310
  ]
271
311
  h200 = [
272
312
  { key = "cr0", id = "cr-0f6d0766f5d3339e6", instance_count = 2 }, # H200 capacity block (may be expired - keep to prevent ASG destroy)
273
313
  { key = "cr1", id = "cr-06c9c978dea756a26", instance_count = 3 }, # H200 reservation (3 instances)
274
314
  { key = "cr2", id = null, instance_count = 2 }, # H200 on-demand (2 instances)
315
+ { key = "cr3", id = "cr-02949f61f1a761b54", instance_count = 1, efa_network_cards = 16 }, # H200 reservation us-east-2a (1 instance, 8 GPUs, p5en.48xlarge max 16 EFA)
275
316
  ]
276
317
  b200 = [
277
318
  { key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 0 }, # B200 reservation us-east-2a (disabled - CR freed)
@@ -323,9 +364,11 @@ locals {
323
364
  # H200 capacity reservations
324
365
  "cr-0f6d0766f5d3339e6" = "tertiary" # us-east-2c (may be expired - kept to prevent ASG destroy)
325
366
  "cr-06c9c978dea756a26" = "tertiary" # us-east-2c
367
+ "cr-02949f61f1a761b54" = "primary" # us-east-2a
326
368
  # H100 capacity reservations
327
369
  "cr-0a3f49b96fe03ca04" = "tertiary" # us-east-2c (p5.48xlarge)
328
370
  "cr-044bc72b0a6b56062" = "primary" # us-east-2a (p5.48xlarge)
371
+ "cr-0211ea1e8d3a3c79e" = "tertiary" # us-east-2c (p5.48xlarge, MIG-dedicated)
329
372
  # A100 capacity reservation
330
373
  "cr-01cc0f00f28b095af" = "primary" # us-east-2a
331
374
  }
@@ -136,7 +136,7 @@ spec:
136
136
  cpu: "2"
137
137
  memory: "4Gi"
138
138
  flags:
139
- - --node-labels=NodeType=gpu,GpuType=${gpu_type},nvidia.com/gpu.deploy.driver=false${profiling_dedicated ? ",gpu.monitoring/profiling-dedicated=true,nvidia.com/gpu.deploy.dcgm-exporter=false" : ""}
139
+ - --node-labels=NodeType=gpu,GpuType=${gpu_type},nvidia.com/gpu.deploy.driver=false${profiling_dedicated ? ",gpu.monitoring/profiling-dedicated=true,nvidia.com/gpu.deploy.dcgm-exporter=false" : ""}${mig_profile != "" ? ",nvidia.com/mig.config=${mig_profile}" : ""}
140
140
  EOF
141
141
 
142
142
  # Configure EFA if hardware present (BEFORE nodeadm so kubelet sees hugepages)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes