gpu-dev 0.5.1__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/PKG-INFO +1 -1
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +27 -5
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +59 -5
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +3 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/pyproject.toml +1 -1
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/availability.tf +1 -1
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/eks.tf +10 -4
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/availability_updater/index.py +30 -12
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_processor/index.py +47 -20
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda.tf +2 -2
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/main.tf +44 -1
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/templates/al2023-user-data.sh +1 -1
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/.gitignore +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/CLAUDE.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/PROGRESS.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/TODO.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/admin/README.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/post.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/setup.cfg +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -498,9 +498,9 @@ def main(ctx: click.Context) -> None:
|
|
|
498
498
|
"--gpu-type",
|
|
499
499
|
"-t",
|
|
500
500
|
type=click.Choice(
|
|
501
|
-
["b200", "h200", "h100", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
501
|
+
["b200", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
502
502
|
),
|
|
503
|
-
help="GPU type to reserve
|
|
503
|
+
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices (partial GPU on a single shared node): h100-mig-1g (10 GB / 1/7 H100 compute), h100-mig-2g (20 GB / 2/7 H100), h100-mig-3g (40 GB / 3/7 H100). CPU only: cpu-arm, cpu-x86.",
|
|
504
504
|
)
|
|
505
505
|
@click.option(
|
|
506
506
|
"--hours",
|
|
@@ -656,6 +656,9 @@ def reserve(
|
|
|
656
656
|
"t4-small": {"max_gpus": 1, "instance_type": "g4dn.xlarge"},
|
|
657
657
|
"a100": {"max_gpus": 8, "instance_type": "p4d.24xlarge"},
|
|
658
658
|
"h100": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
|
|
659
|
+
"h100-mig-1g": {"max_gpus": 16, "instance_type": "p5.48xlarge"},
|
|
660
|
+
"h100-mig-2g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
|
|
661
|
+
"h100-mig-3g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
|
|
659
662
|
"h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
|
|
660
663
|
"b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
|
|
661
664
|
"cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
|
|
@@ -724,11 +727,18 @@ def reserve(
|
|
|
724
727
|
return
|
|
725
728
|
|
|
726
729
|
max_gpus = gpu_configs[gpu_type_lower]["max_gpus"]
|
|
727
|
-
|
|
728
|
-
gpu_type_lower, max_gpus)
|
|
729
|
-
if
|
|
730
|
+
result = select_gpu_count_interactive(
|
|
731
|
+
gpu_type_lower, max_gpus, availability_info=availability_info)
|
|
732
|
+
if result is None:
|
|
730
733
|
rprint("[yellow]Reservation cancelled.[/yellow]")
|
|
731
734
|
return
|
|
735
|
+
# If user picked a MIG slice, the function returns (gpu_type, count).
|
|
736
|
+
if isinstance(result, tuple):
|
|
737
|
+
gpu_type, gpu_count = result
|
|
738
|
+
gpu_type_lower = gpu_type.lower()
|
|
739
|
+
max_gpus = gpu_configs[gpu_type_lower]["max_gpus"]
|
|
740
|
+
else:
|
|
741
|
+
gpu_count = result
|
|
732
742
|
|
|
733
743
|
# Show distributed warning for interactive multinode selections (always show)
|
|
734
744
|
if gpu_count > max_gpus:
|
|
@@ -2399,6 +2409,9 @@ def _show_availability() -> None:
|
|
|
2399
2409
|
"a10g": "Ampere (sm80)",
|
|
2400
2410
|
"l4": "Ada Lovelace (sm89)",
|
|
2401
2411
|
"rtxpro6000": "Blackwell (sm120)",
|
|
2412
|
+
"h100-mig-1g": "Hopper (sm90, MIG 10GB)",
|
|
2413
|
+
"h100-mig-2g": "Hopper (sm90, MIG 20GB)",
|
|
2414
|
+
"h100-mig-3g": "Hopper (sm90, MIG 40GB)",
|
|
2402
2415
|
"t4": "Turing (sm75)",
|
|
2403
2416
|
"cpu-x86": "CPU (x86_64)",
|
|
2404
2417
|
"cpu-arm": "CPU (arm64)",
|
|
@@ -2409,6 +2422,9 @@ def _show_availability() -> None:
|
|
|
2409
2422
|
"Blackwell (sm100)": 0,
|
|
2410
2423
|
"Blackwell (sm120)": 0,
|
|
2411
2424
|
"Hopper (sm90)": 1,
|
|
2425
|
+
"Hopper (sm90, MIG 40GB)": 1,
|
|
2426
|
+
"Hopper (sm90, MIG 20GB)": 1,
|
|
2427
|
+
"Hopper (sm90, MIG 10GB)": 1,
|
|
2412
2428
|
"Ada Lovelace (sm89)": 2,
|
|
2413
2429
|
"Ampere (sm80)": 3,
|
|
2414
2430
|
"Turing (sm75)": 4,
|
|
@@ -2548,6 +2564,9 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2548
2564
|
"a10g": "Ampere (sm80)",
|
|
2549
2565
|
"l4": "Ada Lovelace (sm89)",
|
|
2550
2566
|
"rtxpro6000": "Blackwell (sm120)",
|
|
2567
|
+
"h100-mig-1g": "Hopper (sm90, MIG 10GB)",
|
|
2568
|
+
"h100-mig-2g": "Hopper (sm90, MIG 20GB)",
|
|
2569
|
+
"h100-mig-3g": "Hopper (sm90, MIG 40GB)",
|
|
2551
2570
|
"t4": "Turing (sm75)",
|
|
2552
2571
|
"cpu-x86": "CPU (x86_64)",
|
|
2553
2572
|
"cpu-arm": "CPU (arm64)",
|
|
@@ -2558,6 +2577,9 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2558
2577
|
"Blackwell (sm100)": 0,
|
|
2559
2578
|
"Blackwell (sm120)": 0,
|
|
2560
2579
|
"Hopper (sm90)": 1,
|
|
2580
|
+
"Hopper (sm90, MIG 40GB)": 1,
|
|
2581
|
+
"Hopper (sm90, MIG 20GB)": 1,
|
|
2582
|
+
"Hopper (sm90, MIG 10GB)": 1,
|
|
2561
2583
|
"Ada Lovelace (sm89)": 2,
|
|
2562
2584
|
"Ampere (sm80)": 3,
|
|
2563
2585
|
"Turing (sm75)": 4,
|
|
@@ -57,6 +57,13 @@ def select_gpu_type_interactive(
|
|
|
57
57
|
if not check_interactive_support():
|
|
58
58
|
return None
|
|
59
59
|
|
|
60
|
+
# Hide MIG slice SKUs from the top-level selector — reached via the h100 submenu.
|
|
61
|
+
# Direct `--gpu-type h100-mig-1g` still works for non-interactive scripts.
|
|
62
|
+
visible_info = {
|
|
63
|
+
gt: info for gt, info in availability_info.items()
|
|
64
|
+
if "-mig-" not in gt
|
|
65
|
+
}
|
|
66
|
+
|
|
60
67
|
# Display availability table first
|
|
61
68
|
console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
|
|
62
69
|
table = Table()
|
|
@@ -67,7 +74,7 @@ def select_gpu_type_interactive(
|
|
|
67
74
|
table.add_column("Est. Wait Time", style="magenta")
|
|
68
75
|
|
|
69
76
|
choices = []
|
|
70
|
-
for gpu_type, info in
|
|
77
|
+
for gpu_type, info in visible_info.items():
|
|
71
78
|
available = info.get("available", 0)
|
|
72
79
|
total = info.get("total", 0)
|
|
73
80
|
queue_length = info.get("queue_length", 0)
|
|
@@ -143,8 +150,16 @@ def select_gpu_type_interactive(
|
|
|
143
150
|
return None
|
|
144
151
|
|
|
145
152
|
|
|
146
|
-
def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
|
|
147
|
-
|
|
153
|
+
def select_gpu_count_interactive(
|
|
154
|
+
gpu_type: str,
|
|
155
|
+
max_gpus: int,
|
|
156
|
+
availability_info: Optional[Dict[str, Dict[str, Any]]] = None,
|
|
157
|
+
):
|
|
158
|
+
"""Interactive GPU count selection.
|
|
159
|
+
|
|
160
|
+
Returns int (gpu_count) for normal selections, or a (effective_gpu_type, gpu_count)
|
|
161
|
+
tuple when the user picks a MIG slice option from the h100 submenu.
|
|
162
|
+
"""
|
|
148
163
|
if not check_interactive_support():
|
|
149
164
|
return None
|
|
150
165
|
|
|
@@ -157,6 +172,12 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
|
|
|
157
172
|
valid_counts = [1, 2, 4]
|
|
158
173
|
# Add multinode options
|
|
159
174
|
multinode_counts = [8, 12, 16, 20, 24] # multiples of 4
|
|
175
|
+
elif gpu_type == "h100-mig-1g":
|
|
176
|
+
valid_counts = [1, 2, 4, 8]
|
|
177
|
+
multinode_counts = [] # MIG slices live on a single node — no multinode
|
|
178
|
+
elif gpu_type in ["h100-mig-2g", "h100-mig-3g"]:
|
|
179
|
+
valid_counts = [1, 2, 4]
|
|
180
|
+
multinode_counts = []
|
|
160
181
|
elif gpu_type == "g5g":
|
|
161
182
|
valid_counts = [1, 2]
|
|
162
183
|
multinode_counts = [4, 8] # multiples of 4
|
|
@@ -168,6 +189,28 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
|
|
|
168
189
|
# Add multinode options
|
|
169
190
|
multinode_counts = [16, 24, 32, 40, 48] # multiples of 8
|
|
170
191
|
|
|
192
|
+
# MIG slice submenu: only for h100. Each tuple is (target_gpu_type, gpu_count, gb_label).
|
|
193
|
+
mig_options = []
|
|
194
|
+
if gpu_type == "h100":
|
|
195
|
+
# Map to internal SKUs; the count menu surfaces 1/2/4 of each slice size.
|
|
196
|
+
mig_specs = [
|
|
197
|
+
("h100-mig-1g", "10GB"),
|
|
198
|
+
("h100-mig-2g", "20GB"),
|
|
199
|
+
("h100-mig-3g", "40GB"),
|
|
200
|
+
]
|
|
201
|
+
for sku, gb in mig_specs:
|
|
202
|
+
slice_max = {"h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8}[sku]
|
|
203
|
+
free = None
|
|
204
|
+
if availability_info and sku in availability_info:
|
|
205
|
+
free = availability_info[sku].get("available", 0)
|
|
206
|
+
for n in [1, 2, 4]:
|
|
207
|
+
if n > slice_max:
|
|
208
|
+
continue
|
|
209
|
+
noun = "slice" if n == 1 else "slices"
|
|
210
|
+
avail_suffix = f" [{free} free]" if free is not None else ""
|
|
211
|
+
label = f"{n} × {gb} {noun}{avail_suffix}"
|
|
212
|
+
mig_options.append((sku, n, label))
|
|
213
|
+
|
|
171
214
|
# Filter single-node by actual max for this GPU type
|
|
172
215
|
valid_counts = [count for count in valid_counts if count <= max_gpus]
|
|
173
216
|
|
|
@@ -177,7 +220,18 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
|
|
|
177
220
|
|
|
178
221
|
choices = []
|
|
179
222
|
|
|
180
|
-
#
|
|
223
|
+
# MIG slice options come first (smallest unit), h100-only.
|
|
224
|
+
if mig_options:
|
|
225
|
+
choices.append(questionary.Separator(
|
|
226
|
+
"--- MIG slices (partial GPU, single node) ---"))
|
|
227
|
+
for sku, count, label in mig_options:
|
|
228
|
+
choices.append(questionary.Choice(title=label, value=(sku, count)))
|
|
229
|
+
|
|
230
|
+
# Full single-node options. Header only when slices were rendered above
|
|
231
|
+
# (otherwise the type already implies "Full GPUs").
|
|
232
|
+
if mig_options:
|
|
233
|
+
choices.append(questionary.Separator(
|
|
234
|
+
"--- Full GPUs (single node) ---"))
|
|
181
235
|
for count in valid_counts:
|
|
182
236
|
if count == 1:
|
|
183
237
|
label = f"1 GPU (single node)"
|
|
@@ -185,7 +239,7 @@ def select_gpu_count_interactive(gpu_type: str, max_gpus: int) -> Optional[int]:
|
|
|
185
239
|
label = f"{count} GPUs (single node)"
|
|
186
240
|
choices.append(questionary.Choice(title=label, value=count))
|
|
187
241
|
|
|
188
|
-
#
|
|
242
|
+
# Multinode at the bottom.
|
|
189
243
|
if multinode_counts:
|
|
190
244
|
choices.append(questionary.Separator(
|
|
191
245
|
"--- Multinode (Distributed) ---"))
|
|
@@ -540,6 +540,9 @@ class ReservationManager:
|
|
|
540
540
|
"g5g": {"max_gpus": 2},
|
|
541
541
|
"a100": {"max_gpus": 8},
|
|
542
542
|
"h100": {"max_gpus": 8},
|
|
543
|
+
"h100-mig-1g": {"max_gpus": 16},
|
|
544
|
+
"h100-mig-2g": {"max_gpus": 8},
|
|
545
|
+
"h100-mig-3g": {"max_gpus": 8},
|
|
543
546
|
"h200": {"max_gpus": 8},
|
|
544
547
|
"b200": {"max_gpus": 8},
|
|
545
548
|
}
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.1"
|
|
7
|
+
version = "0.5.3"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -142,7 +142,7 @@ resource "aws_cloudwatch_event_rule" "asg_capacity_change" {
|
|
|
142
142
|
"EC2 Instance Terminate Successful"
|
|
143
143
|
]
|
|
144
144
|
detail = {
|
|
145
|
-
AutoScalingGroupName = [for gpu_type in
|
|
145
|
+
AutoScalingGroupName = [for gpu_type, cfg in local.current_config.supported_gpu_types : "${var.prefix}-gpu-nodes-${gpu_type}" if !try(cfg.virtual, false)]
|
|
146
146
|
}
|
|
147
147
|
})
|
|
148
148
|
|
|
@@ -198,7 +198,7 @@ locals {
|
|
|
198
198
|
# Flatten capacity reservations to create multiple ASGs when needed
|
|
199
199
|
# Each CR entry must have a stable 'key' field so removing entries doesn't shift other ASG keys.
|
|
200
200
|
gpu_capacity_reservations = flatten([
|
|
201
|
-
for gpu_type, gpu_config in local.current_config.supported_gpu_types : [
|
|
201
|
+
for gpu_type, gpu_config in local.current_config.supported_gpu_types : try(gpu_config.virtual, false) ? [] : [
|
|
202
202
|
for cr_index, cr_config in try(local.capacity_reservations[terraform.workspace][gpu_type], [null]) : {
|
|
203
203
|
gpu_type = gpu_type
|
|
204
204
|
gpu_config = gpu_config
|
|
@@ -212,8 +212,13 @@ locals {
|
|
|
212
212
|
? lookup(local.capacity_reservation_azs[terraform.workspace], cr_config.id, local.gpu_subnet_assignments[terraform.workspace][gpu_type])
|
|
213
213
|
: local.gpu_subnet_assignments[terraform.workspace][gpu_type]
|
|
214
214
|
)
|
|
215
|
+
# Per-CR override for efa_network_cards (e.g. p5en.48xlarge caps at 16 vs p5e at 32)
|
|
216
|
+
efa_network_cards = cr_config != null ? try(cr_config.efa_network_cards, gpu_config.efa_network_cards) : gpu_config.efa_network_cards
|
|
217
|
+
# Optional MIG profile (e.g. "all-balanced", "all-1g.10gb"). When set, user-data labels the node so nvidia-mig-manager partitions the GPUs.
|
|
218
|
+
# Default to "" (not null) — null breaks templatefile() string interpolation downstream.
|
|
219
|
+
mig_profile = cr_config != null ? try(cr_config.mig_profile, "") : ""
|
|
215
220
|
# Multi-EFA instances (>1 network card) must use private subnets (no public IP in launch template)
|
|
216
|
-
use_private_subnet = try(gpu_config.efa_network_cards, 0) > 1
|
|
221
|
+
use_private_subnet = (cr_config != null ? try(cr_config.efa_network_cards, try(gpu_config.efa_network_cards, 0)) : try(gpu_config.efa_network_cards, 0)) > 1
|
|
217
222
|
}
|
|
218
223
|
]
|
|
219
224
|
])
|
|
@@ -363,7 +368,7 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
|
|
|
363
368
|
associate_public_ip_address = true
|
|
364
369
|
security_groups = [aws_security_group.gpu_dev_sg.id]
|
|
365
370
|
subnet_id = each.value.gpu_config.use_placement_group ? null : local.public_subnet_map[each.value.subnet_az]
|
|
366
|
-
interface_type = try(each.value.
|
|
371
|
+
interface_type = try(each.value.efa_network_cards, 0) > 0 ? "efa" : "interface"
|
|
367
372
|
delete_on_termination = true
|
|
368
373
|
}
|
|
369
374
|
}
|
|
@@ -386,7 +391,7 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
|
|
|
386
391
|
# Each network card supports 2 device indices (0 and 1); device_index must be 0
|
|
387
392
|
# since this is the only interface on each card
|
|
388
393
|
dynamic "network_interfaces" {
|
|
389
|
-
for_each = each.value.use_private_subnet ? range(1, try(each.value.
|
|
394
|
+
for_each = each.value.use_private_subnet ? range(1, try(each.value.efa_network_cards, 1)) : []
|
|
390
395
|
content {
|
|
391
396
|
device_index = 0
|
|
392
397
|
interface_type = "efa-only"
|
|
@@ -423,6 +428,7 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
|
|
|
423
428
|
region = local.current_config.aws_region
|
|
424
429
|
gpu_type = local.gpu_type_kubernetes_labels[each.value.gpu_type]
|
|
425
430
|
profiling_dedicated = try(each.value.gpu_config.profiling_dedicated, false)
|
|
431
|
+
mig_profile = each.value.mig_profile != null ? each.value.mig_profile : ""
|
|
426
432
|
container_image = local.latest_image_uri
|
|
427
433
|
}))
|
|
428
434
|
|
{gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
@@ -23,6 +23,13 @@ AVAILABILITY_TABLE = os.environ["AVAILABILITY_TABLE"]
|
|
|
23
23
|
SUPPORTED_GPU_TYPES = json.loads(os.environ["SUPPORTED_GPU_TYPES"])
|
|
24
24
|
|
|
25
25
|
|
|
26
|
+
def get_gpu_resource_name(gpu_type: str) -> str:
|
|
27
|
+
return SUPPORTED_GPU_TYPES.get(gpu_type, {}).get("k8s_resource", "nvidia.com/gpu")
|
|
28
|
+
|
|
29
|
+
def get_node_label_value(gpu_type: str) -> str:
|
|
30
|
+
return SUPPORTED_GPU_TYPES.get(gpu_type, {}).get("node_gpu_type", gpu_type)
|
|
31
|
+
|
|
32
|
+
|
|
26
33
|
def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
|
|
27
34
|
"""Handle ASG capacity change events - update all GPU types"""
|
|
28
35
|
try:
|
|
@@ -84,7 +91,9 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
84
91
|
logger.info(f"Starting availability update for GPU type: {gpu_type}")
|
|
85
92
|
|
|
86
93
|
# Get current ASG capacity - handle multiple ASGs per GPU type (e.g., capacity reservations)
|
|
87
|
-
|
|
94
|
+
# MIG SKUs share the underlying h100 ASGs (cr-dedicated MIG node), so use the physical type for ASG matching
|
|
95
|
+
asg_lookup_type = get_node_label_value(gpu_type)
|
|
96
|
+
asg_name_prefix = f"pytorch-gpu-dev-gpu-nodes-{asg_lookup_type}"
|
|
88
97
|
logger.info(f"Checking ASGs matching pattern: {asg_name_prefix}*")
|
|
89
98
|
|
|
90
99
|
# Get all ASGs and filter by name pattern
|
|
@@ -102,6 +111,9 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
102
111
|
logger.info(f"Found {len(matching_asgs)} ASGs: {asg_names}")
|
|
103
112
|
|
|
104
113
|
# Calculate total availability metrics across all matching ASGs
|
|
114
|
+
# For MIG SKUs we cannot tell from ASG alone which instances are MIG-partitioned;
|
|
115
|
+
# we override running_instances later from k8s allocatable.
|
|
116
|
+
is_mig_sku = "k8s_resource" in SUPPORTED_GPU_TYPES.get(gpu_type, {})
|
|
105
117
|
desired_capacity = sum(asg["DesiredCapacity"] for asg in matching_asgs)
|
|
106
118
|
running_instances = sum(
|
|
107
119
|
len([
|
|
@@ -130,7 +142,7 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
130
142
|
logger.info(f"Checking CPU node availability for {gpu_type}")
|
|
131
143
|
# Count available slots by checking pod count on each node
|
|
132
144
|
v1 = client.CoreV1Api(k8s_client)
|
|
133
|
-
nodes = v1.list_node(label_selector=f"GpuType={gpu_type}")
|
|
145
|
+
nodes = v1.list_node(label_selector=f"GpuType={get_node_label_value(gpu_type)}")
|
|
134
146
|
|
|
135
147
|
total_available_slots = 0
|
|
136
148
|
for node in nodes.items:
|
|
@@ -178,16 +190,18 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
178
190
|
try:
|
|
179
191
|
from kubernetes import client as k8s_client_lib
|
|
180
192
|
v1 = k8s_client_lib.CoreV1Api(k8s_client)
|
|
181
|
-
|
|
193
|
+
node_label_value = get_node_label_value(gpu_type)
|
|
194
|
+
resource_name = get_gpu_resource_name(gpu_type)
|
|
195
|
+
nodes = v1.list_node(label_selector=f"GpuType={node_label_value}")
|
|
182
196
|
|
|
183
197
|
single_node_max = 0 # Max available on any single node
|
|
184
198
|
schedulable_total_gpus = 0 # Total GPUs on schedulable (non-cordoned) nodes
|
|
185
199
|
for node in nodes.items:
|
|
186
200
|
if is_node_ready_and_schedulable(node):
|
|
187
|
-
available_on_node = get_available_gpus_on_node(v1, node)
|
|
201
|
+
available_on_node = get_available_gpus_on_node(v1, node, gpu_type)
|
|
188
202
|
total_on_node = 0
|
|
189
203
|
if node.status.allocatable:
|
|
190
|
-
gpu_allocatable = node.status.allocatable.get(
|
|
204
|
+
gpu_allocatable = node.status.allocatable.get(resource_name, "0")
|
|
191
205
|
try:
|
|
192
206
|
total_on_node = int(gpu_allocatable)
|
|
193
207
|
except (ValueError, TypeError):
|
|
@@ -203,6 +217,9 @@ def update_gpu_availability(gpu_type: str, k8s_client=None) -> None:
|
|
|
203
217
|
full_nodes_available += 1
|
|
204
218
|
|
|
205
219
|
total_gpus = schedulable_total_gpus
|
|
220
|
+
# For MIG SKUs override running_instances to the number of MIG-partitioned nodes
|
|
221
|
+
if is_mig_sku:
|
|
222
|
+
running_instances = sum(1 for n in nodes.items if is_node_ready_and_schedulable(n) and int((n.status.allocatable or {}).get(resource_name, "0")) > 0)
|
|
206
223
|
|
|
207
224
|
# Calculate max reservable considering multinode scenarios
|
|
208
225
|
# Only high-end GPU types support multinode (up to 4 nodes = 32 GPUs)
|
|
@@ -276,7 +293,7 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
|
|
|
276
293
|
logger.info(f"Created CoreV1Api client for {gpu_type}")
|
|
277
294
|
|
|
278
295
|
# Get all nodes with the specified GPU type
|
|
279
|
-
gpu_type_selector = f"GpuType={gpu_type}"
|
|
296
|
+
gpu_type_selector = f"GpuType={get_node_label_value(gpu_type)}"
|
|
280
297
|
logger.info(f"Querying nodes with label selector: {gpu_type_selector}")
|
|
281
298
|
|
|
282
299
|
nodes = v1.list_node(label_selector=gpu_type_selector)
|
|
@@ -297,7 +314,7 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
|
|
|
297
314
|
|
|
298
315
|
logger.info(f"Node {node.metadata.name} is ready, checking GPU availability")
|
|
299
316
|
# Get available GPUs on this node
|
|
300
|
-
available_on_node = get_available_gpus_on_node(v1, node)
|
|
317
|
+
available_on_node = get_available_gpus_on_node(v1, node, gpu_type)
|
|
301
318
|
total_schedulable += available_on_node
|
|
302
319
|
logger.info(f"Node {node.metadata.name}: {available_on_node} GPUs available")
|
|
303
320
|
|
|
@@ -332,11 +349,12 @@ def is_node_ready_and_schedulable(node) -> bool:
|
|
|
332
349
|
return False
|
|
333
350
|
|
|
334
351
|
|
|
335
|
-
def get_available_gpus_on_node(v1_api, node) -> int:
|
|
336
|
-
"""Get number of available GPUs on a specific node"""
|
|
352
|
+
def get_available_gpus_on_node(v1_api, node, gpu_type: str = None) -> int:
|
|
353
|
+
"""Get number of available GPUs (or MIG slices) on a specific node for the given SKU."""
|
|
337
354
|
try:
|
|
338
355
|
node_name = node.metadata.name
|
|
339
|
-
|
|
356
|
+
resource_name = get_gpu_resource_name(gpu_type) if gpu_type else "nvidia.com/gpu"
|
|
357
|
+
logger.info(f"Checking GPU availability on node: {node_name} (resource={resource_name})")
|
|
340
358
|
|
|
341
359
|
# Get all pods on this node
|
|
342
360
|
logger.info(f"Querying pods on node {node_name}")
|
|
@@ -350,7 +368,7 @@ def get_available_gpus_on_node(v1_api, node) -> int:
|
|
|
350
368
|
for container in pod.spec.containers:
|
|
351
369
|
if container.resources and container.resources.requests:
|
|
352
370
|
gpu_request = container.resources.requests.get(
|
|
353
|
-
|
|
371
|
+
resource_name, "0"
|
|
354
372
|
)
|
|
355
373
|
try:
|
|
356
374
|
used_gpus += int(gpu_request)
|
|
@@ -360,7 +378,7 @@ def get_available_gpus_on_node(v1_api, node) -> int:
|
|
|
360
378
|
# Get total GPUs on this node
|
|
361
379
|
total_gpus = 0
|
|
362
380
|
if node.status.allocatable:
|
|
363
|
-
gpu_allocatable = node.status.allocatable.get(
|
|
381
|
+
gpu_allocatable = node.status.allocatable.get(resource_name, "0")
|
|
364
382
|
try:
|
|
365
383
|
total_gpus = int(gpu_allocatable)
|
|
366
384
|
except (ValueError, TypeError):
|
{gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -67,6 +67,10 @@ GPU_CONFIG = {
|
|
|
67
67
|
"l4": {"instance_type": "g6.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
|
|
68
68
|
"a10g": {"instance_type": "g5.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 1},
|
|
69
69
|
"rtxpro6000": {"instance_type": "g7e.24xlarge", "max_gpus": 4, "cpus": 96, "memory_gb": 1024, "efa_count": 2},
|
|
70
|
+
# MIG slices on a dedicated H100 node (all-balanced profile: per GPU = 2x1g.10gb + 1x2g.20gb + 1x3g.40gb)
|
|
71
|
+
"h100-mig-1g": {"instance_type": "p5.48xlarge", "max_gpus": 16, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.10gb", "node_gpu_type": "h100"},
|
|
72
|
+
"h100-mig-2g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.20gb", "node_gpu_type": "h100"},
|
|
73
|
+
"h100-mig-3g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.40gb", "node_gpu_type": "h100"},
|
|
70
74
|
"t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
71
75
|
"g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
72
76
|
"a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
|
|
@@ -78,6 +82,15 @@ GPU_CONFIG = {
|
|
|
78
82
|
}
|
|
79
83
|
GPU_CONFIG_DEFAULT = {"instance_type": "g4dn.12xlarge", "max_gpus": 4, "cpus": 48, "memory_gb": 192, "efa_count": 0}
|
|
80
84
|
|
|
85
|
+
def get_gpu_resource_name(gpu_type: str) -> str:
|
|
86
|
+
"""Kubernetes resource name for this SKU (nvidia.com/gpu or nvidia.com/mig-*)."""
|
|
87
|
+
return GPU_CONFIG.get(gpu_type, GPU_CONFIG_DEFAULT).get("k8s_resource", "nvidia.com/gpu")
|
|
88
|
+
|
|
89
|
+
def get_node_gpu_type(gpu_type: str) -> str:
|
|
90
|
+
"""Value of the GpuType node label to select. MIG SKUs map to their underlying physical type."""
|
|
91
|
+
return GPU_CONFIG.get(gpu_type, {}).get("node_gpu_type", gpu_type)
|
|
92
|
+
|
|
93
|
+
|
|
81
94
|
# GPU types under maintenance - only whitelisted users can reserve
|
|
82
95
|
# Set to {} to disable maintenance mode for all types
|
|
83
96
|
GPU_MAINTENANCE = {}
|
|
@@ -232,7 +245,8 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
|
|
|
232
245
|
# Get all nodes with the requested GPU type
|
|
233
246
|
logger.info(
|
|
234
247
|
f"Querying nodes for GPU type {gpu_type} with {gpus_requested} GPUs needed")
|
|
235
|
-
|
|
248
|
+
node_label_value = get_node_gpu_type(gpu_type)
|
|
249
|
+
nodes = v1.list_node(label_selector=f"GpuType={node_label_value}")
|
|
236
250
|
|
|
237
251
|
candidate_nodes = []
|
|
238
252
|
all_ready_nodes = []
|
|
@@ -271,7 +285,7 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
|
|
|
271
285
|
continue
|
|
272
286
|
|
|
273
287
|
# Check available GPU capacity on this node
|
|
274
|
-
available_gpus = get_available_gpus_on_node(v1, node)
|
|
288
|
+
available_gpus = get_available_gpus_on_node(v1, node, gpu_type)
|
|
275
289
|
|
|
276
290
|
# Track all ready nodes (for fallback AZ when no single node has enough)
|
|
277
291
|
all_ready_nodes.append({
|
|
@@ -2152,7 +2166,8 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
|
|
|
2152
2166
|
|
|
2153
2167
|
# Validate GPU type
|
|
2154
2168
|
valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
|
|
2155
|
-
"h100", "
|
|
2169
|
+
"h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
|
|
2170
|
+
"h200", "b200", "cpu-arm", "cpu-x86"]
|
|
2156
2171
|
if gpu_type not in valid_gpu_types:
|
|
2157
2172
|
error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
|
|
2158
2173
|
logger.error(error_msg)
|
|
@@ -2238,10 +2253,11 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
|
|
|
2238
2253
|
nodes = v1.list_node()
|
|
2239
2254
|
schedulable_gpus = 0
|
|
2240
2255
|
|
|
2256
|
+
node_label_value = get_node_gpu_type(gpu_type)
|
|
2241
2257
|
for node in nodes.items:
|
|
2242
2258
|
# Check if node has the right GPU type label
|
|
2243
2259
|
node_labels = node.metadata.labels or {}
|
|
2244
|
-
if node_labels.get("GpuType") !=
|
|
2260
|
+
if node_labels.get("GpuType") != node_label_value:
|
|
2245
2261
|
continue
|
|
2246
2262
|
|
|
2247
2263
|
# Check if node is ready and schedulable
|
|
@@ -2252,7 +2268,7 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
|
|
|
2252
2268
|
continue
|
|
2253
2269
|
|
|
2254
2270
|
# Get available GPUs on this node
|
|
2255
|
-
node_gpus = get_available_gpus_on_node(v1, node)
|
|
2271
|
+
node_gpus = get_available_gpus_on_node(v1, node, gpu_type)
|
|
2256
2272
|
schedulable_gpus += node_gpus
|
|
2257
2273
|
logger.info(
|
|
2258
2274
|
f"Node {node.metadata.name}: {node_gpus} available {gpu_type.upper()} GPUs"
|
|
@@ -2278,13 +2294,14 @@ def check_max_gpus_on_single_node(gpu_type: str) -> int:
|
|
|
2278
2294
|
nodes = v1.list_node()
|
|
2279
2295
|
max_gpus = 0
|
|
2280
2296
|
|
|
2297
|
+
node_label_value = get_node_gpu_type(gpu_type)
|
|
2281
2298
|
for node in nodes.items:
|
|
2282
2299
|
node_labels = node.metadata.labels or {}
|
|
2283
|
-
if node_labels.get("GpuType") !=
|
|
2300
|
+
if node_labels.get("GpuType") != node_label_value:
|
|
2284
2301
|
continue
|
|
2285
2302
|
if not is_node_ready_and_schedulable(node):
|
|
2286
2303
|
continue
|
|
2287
|
-
node_gpus = get_available_gpus_on_node(v1, node)
|
|
2304
|
+
node_gpus = get_available_gpus_on_node(v1, node, gpu_type)
|
|
2288
2305
|
max_gpus = max(max_gpus, node_gpus)
|
|
2289
2306
|
|
|
2290
2307
|
return max_gpus
|
|
@@ -2320,12 +2337,13 @@ def is_node_ready_and_schedulable(node) -> bool:
|
|
|
2320
2337
|
return True
|
|
2321
2338
|
|
|
2322
2339
|
|
|
2323
|
-
def get_available_gpus_on_node(v1_api, node) -> int:
|
|
2324
|
-
"""Get the number of available GPUs on a specific node"""
|
|
2340
|
+
def get_available_gpus_on_node(v1_api, node, gpu_type: str = None) -> int:
|
|
2341
|
+
"""Get the number of available GPUs (or MIG slices) on a specific node for the given SKU."""
|
|
2325
2342
|
try:
|
|
2343
|
+
resource_name = get_gpu_resource_name(gpu_type) if gpu_type else "nvidia.com/gpu"
|
|
2326
2344
|
# Get allocatable GPUs from node status
|
|
2327
2345
|
allocatable = node.status.allocatable or {}
|
|
2328
|
-
total_gpus = int(allocatable.get(
|
|
2346
|
+
total_gpus = int(allocatable.get(resource_name, "0"))
|
|
2329
2347
|
|
|
2330
2348
|
if total_gpus == 0:
|
|
2331
2349
|
return 0
|
|
@@ -2342,7 +2360,7 @@ def get_available_gpus_on_node(v1_api, node) -> int:
|
|
|
2342
2360
|
for container in pod.spec.containers:
|
|
2343
2361
|
if container.resources and container.resources.requests:
|
|
2344
2362
|
gpu_request = container.resources.requests.get(
|
|
2345
|
-
|
|
2363
|
+
resource_name, "0"
|
|
2346
2364
|
)
|
|
2347
2365
|
used_gpus += int(gpu_request)
|
|
2348
2366
|
|
|
@@ -2368,13 +2386,15 @@ def update_gpu_availability_table(
|
|
|
2368
2386
|
total_gpus = 0
|
|
2369
2387
|
running_instances = 0
|
|
2370
2388
|
|
|
2389
|
+
node_label_value = get_node_gpu_type(gpu_type)
|
|
2390
|
+
resource_name = get_gpu_resource_name(gpu_type)
|
|
2371
2391
|
for node in nodes.items:
|
|
2372
2392
|
node_labels = node.metadata.labels or {}
|
|
2373
|
-
if node_labels.get("GpuType") ==
|
|
2393
|
+
if node_labels.get("GpuType") == node_label_value:
|
|
2374
2394
|
running_instances += 1
|
|
2375
2395
|
# Get allocatable GPUs from node status
|
|
2376
2396
|
allocatable = node.status.allocatable or {}
|
|
2377
|
-
node_gpus = int(allocatable.get(
|
|
2397
|
+
node_gpus = int(allocatable.get(resource_name, "0"))
|
|
2378
2398
|
total_gpus += node_gpus
|
|
2379
2399
|
|
|
2380
2400
|
# Get GPU configuration for this type (for gpus_per_instance)
|
|
@@ -2385,6 +2405,9 @@ def update_gpu_availability_table(
|
|
|
2385
2405
|
"rtxpro6000": {"gpus_per_instance": 4},
|
|
2386
2406
|
"a100": {"gpus_per_instance": 8},
|
|
2387
2407
|
"h100": {"gpus_per_instance": 8},
|
|
2408
|
+
"h100-mig-1g": {"gpus_per_instance": 16},
|
|
2409
|
+
"h100-mig-2g": {"gpus_per_instance": 8},
|
|
2410
|
+
"h100-mig-3g": {"gpus_per_instance": 8},
|
|
2388
2411
|
"h200": {"gpus_per_instance": 8},
|
|
2389
2412
|
"b200": {"gpus_per_instance": 8},
|
|
2390
2413
|
}
|
|
@@ -3697,7 +3720,8 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
|
|
|
3697
3720
|
else:
|
|
3698
3721
|
# GPU instances get proportional CPU/memory based on GPU allocation
|
|
3699
3722
|
if gpu_count > 0:
|
|
3700
|
-
|
|
3723
|
+
resource_name = config.get("k8s_resource", "nvidia.com/gpu")
|
|
3724
|
+
limits[resource_name] = str(gpu_count)
|
|
3701
3725
|
|
|
3702
3726
|
gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
|
|
3703
3727
|
|
|
@@ -3712,10 +3736,11 @@ def get_pod_resource_limits(gpu_count: int, gpu_type: str, is_multinode: bool =
|
|
|
3712
3736
|
"memory": f"{proportional_memory_limit}Gi"
|
|
3713
3737
|
})
|
|
3714
3738
|
|
|
3715
|
-
# EFA optimization: Only use EFA for full-node multinode deployments
|
|
3739
|
+
# EFA optimization: Only use EFA for full-node multinode deployments (skip MIG slices)
|
|
3716
3740
|
use_efa = (
|
|
3717
3741
|
gpu_type != "t4-small" and
|
|
3718
3742
|
not gpu_type.startswith("cpu-") and
|
|
3743
|
+
"mig" not in gpu_type and
|
|
3719
3744
|
is_multinode and
|
|
3720
3745
|
gpu_count == max_gpus
|
|
3721
3746
|
)
|
|
@@ -3742,7 +3767,8 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
|
|
|
3742
3767
|
requests.update({"cpu": "2", "memory": "4Gi"})
|
|
3743
3768
|
else:
|
|
3744
3769
|
if gpu_count > 0:
|
|
3745
|
-
|
|
3770
|
+
resource_name = config.get("k8s_resource", "nvidia.com/gpu")
|
|
3771
|
+
requests[resource_name] = str(gpu_count)
|
|
3746
3772
|
gpu_ratio = gpu_count / max_gpus if max_gpus > 0 else 1.0
|
|
3747
3773
|
|
|
3748
3774
|
# Calculate proportional requests (reserve 10% for system overhead)
|
|
@@ -3756,10 +3782,11 @@ def get_pod_resource_requests(gpu_count: int, gpu_type: str, is_multinode: bool
|
|
|
3756
3782
|
"memory": f"{proportional_memory_request}Gi"
|
|
3757
3783
|
})
|
|
3758
3784
|
|
|
3759
|
-
# EFA: Only for full-node multinode deployments
|
|
3785
|
+
# EFA: Only for full-node multinode deployments (skip MIG slices)
|
|
3760
3786
|
use_efa = (
|
|
3761
3787
|
gpu_type != "t4-small" and
|
|
3762
3788
|
not gpu_type.startswith("cpu-") and
|
|
3789
|
+
"mig" not in gpu_type and
|
|
3763
3790
|
is_multinode and
|
|
3764
3791
|
gpu_count == max_gpus
|
|
3765
3792
|
)
|
|
@@ -5243,7 +5270,7 @@ EOF
|
|
|
5243
5270
|
)
|
|
5244
5271
|
] if _pod_uses_efa(gpu_count, gpu_type, is_multinode) else []),
|
|
5245
5272
|
node_selector={
|
|
5246
|
-
"GpuType": gpu_type,
|
|
5273
|
+
"GpuType": get_node_gpu_type(gpu_type),
|
|
5247
5274
|
**({} if target_az is None else {"topology.kubernetes.io/zone": target_az})
|
|
5248
5275
|
},
|
|
5249
5276
|
# Node affinity for profiling-dedicated preference
|
|
@@ -6846,7 +6873,7 @@ def update_pod_status_and_events(k8s_client, pod_name: str, reservation_id: str)
|
|
|
6846
6873
|
f"Failed to convert to queued: {queue_err}")
|
|
6847
6874
|
|
|
6848
6875
|
# Show user-friendly scheduling messages while waiting
|
|
6849
|
-
if "Insufficient nvidia.com/gpu" in event.message:
|
|
6876
|
+
if "Insufficient nvidia.com/" in event.message and "gpu" in event.message.lower():
|
|
6850
6877
|
# Check if it's a fragmentation issue (GPUs exist but not enough on single node)
|
|
6851
6878
|
try:
|
|
6852
6879
|
reservations_table = dynamodb.Table(
|
|
@@ -6882,7 +6909,7 @@ def update_pod_status_and_events(k8s_client, pod_name: str, reservation_id: str)
|
|
|
6882
6909
|
k8s_client_temp = get_k8s_client()
|
|
6883
6910
|
v1 = client.CoreV1Api(k8s_client_temp)
|
|
6884
6911
|
nodes = v1.list_node(
|
|
6885
|
-
label_selector=f"GpuType={gpu_type}")
|
|
6912
|
+
label_selector=f"GpuType={get_node_gpu_type(gpu_type)}")
|
|
6886
6913
|
|
|
6887
6914
|
if len(nodes.items) == 0:
|
|
6888
6915
|
# No nodes exist for this GPU type - fail immediately
|
|
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
184
|
-
MIN_CLI_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.3"
|
|
184
|
+
MIN_CLI_VERSION = "0.5.2"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
187
|
}, local.alb_env_vars)
|
|
@@ -152,7 +152,7 @@ locals {
|
|
|
152
152
|
efa_network_cards = 8 # p6-b200.48xlarge supports max 8 network cards
|
|
153
153
|
}
|
|
154
154
|
"h200" = {
|
|
155
|
-
instance_type = "
|
|
155
|
+
instance_type = "p5en.48xlarge" # Match capacity reservation type
|
|
156
156
|
instance_types = ["p5e.48xlarge", "p5en.48xlarge"]
|
|
157
157
|
instance_count = 4 # Fallback default (not used when capacity_reservations defined)
|
|
158
158
|
gpus_per_instance = 8
|
|
@@ -216,6 +216,45 @@ locals {
|
|
|
216
216
|
architecture = "x86_64"
|
|
217
217
|
efa_network_cards = 2
|
|
218
218
|
}
|
|
219
|
+
# MIG slice SKUs — virtual: do NOT create an ASG. Surfaces the SKU to availability_updater
|
|
220
|
+
# + reservation_processor. Backed by the H100 CR labelled with mig_profile=all-balanced
|
|
221
|
+
# (per GPU = 2x1g.10gb + 1x2g.20gb + 1x3g.40gb).
|
|
222
|
+
"h100-mig-1g" = {
|
|
223
|
+
instance_type = null
|
|
224
|
+
instance_types = null
|
|
225
|
+
instance_count = 0
|
|
226
|
+
gpus_per_instance = 16 # 8 GPUs * 2 slices/GPU
|
|
227
|
+
use_placement_group = false
|
|
228
|
+
architecture = "x86_64"
|
|
229
|
+
efa_network_cards = 0
|
|
230
|
+
virtual = true
|
|
231
|
+
k8s_resource = "nvidia.com/mig-1g.10gb"
|
|
232
|
+
node_gpu_type = "h100"
|
|
233
|
+
}
|
|
234
|
+
"h100-mig-2g" = {
|
|
235
|
+
instance_type = null
|
|
236
|
+
instance_types = null
|
|
237
|
+
instance_count = 0
|
|
238
|
+
gpus_per_instance = 8 # 8 GPUs * 1 slice/GPU
|
|
239
|
+
use_placement_group = false
|
|
240
|
+
architecture = "x86_64"
|
|
241
|
+
efa_network_cards = 0
|
|
242
|
+
virtual = true
|
|
243
|
+
k8s_resource = "nvidia.com/mig-2g.20gb"
|
|
244
|
+
node_gpu_type = "h100"
|
|
245
|
+
}
|
|
246
|
+
"h100-mig-3g" = {
|
|
247
|
+
instance_type = null
|
|
248
|
+
instance_types = null
|
|
249
|
+
instance_count = 0
|
|
250
|
+
gpus_per_instance = 8 # 8 GPUs * 1 slice/GPU
|
|
251
|
+
use_placement_group = false
|
|
252
|
+
architecture = "x86_64"
|
|
253
|
+
efa_network_cards = 0
|
|
254
|
+
virtual = true
|
|
255
|
+
k8s_resource = "nvidia.com/mig-3g.40gb"
|
|
256
|
+
node_gpu_type = "h100"
|
|
257
|
+
}
|
|
219
258
|
"cpu-arm" = {
|
|
220
259
|
instance_type = "c7g.8xlarge"
|
|
221
260
|
instance_types = null
|
|
@@ -267,11 +306,13 @@ locals {
|
|
|
267
306
|
{ key = "cr0", id = "cr-0a3f49b96fe03ca04", instance_count = 4 }, # H100 reservation us-east-2c (p5.48xlarge)
|
|
268
307
|
{ key = "cr1", id = null, instance_count = 2 }, # H100 on-demand (2 instances)
|
|
269
308
|
{ key = "cr2", id = "cr-044bc72b0a6b56062", instance_count = 4 }, # H100 reservation us-east-2a (4 instances)
|
|
309
|
+
{ key = "cr3", id = "cr-0211ea1e8d3a3c79e", instance_count = 1, mig_profile = "all-balanced" }, # H100 reservation us-east-2c (1 instance, MIG-dedicated, all-balanced: 2x1g.10gb + 1x2g.20gb + 1x3g.40gb per GPU)
|
|
270
310
|
]
|
|
271
311
|
h200 = [
|
|
272
312
|
{ key = "cr0", id = "cr-0f6d0766f5d3339e6", instance_count = 2 }, # H200 capacity block (may be expired - keep to prevent ASG destroy)
|
|
273
313
|
{ key = "cr1", id = "cr-06c9c978dea756a26", instance_count = 3 }, # H200 reservation (3 instances)
|
|
274
314
|
{ key = "cr2", id = null, instance_count = 2 }, # H200 on-demand (2 instances)
|
|
315
|
+
{ key = "cr3", id = "cr-02949f61f1a761b54", instance_count = 1, efa_network_cards = 16 }, # H200 reservation us-east-2a (1 instance, 8 GPUs, p5en.48xlarge max 16 EFA)
|
|
275
316
|
]
|
|
276
317
|
b200 = [
|
|
277
318
|
{ key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 0 }, # B200 reservation us-east-2a (disabled - CR freed)
|
|
@@ -323,9 +364,11 @@ locals {
|
|
|
323
364
|
# H200 capacity reservations
|
|
324
365
|
"cr-0f6d0766f5d3339e6" = "tertiary" # us-east-2c (may be expired - kept to prevent ASG destroy)
|
|
325
366
|
"cr-06c9c978dea756a26" = "tertiary" # us-east-2c
|
|
367
|
+
"cr-02949f61f1a761b54" = "primary" # us-east-2a
|
|
326
368
|
# H100 capacity reservations
|
|
327
369
|
"cr-0a3f49b96fe03ca04" = "tertiary" # us-east-2c (p5.48xlarge)
|
|
328
370
|
"cr-044bc72b0a6b56062" = "primary" # us-east-2a (p5.48xlarge)
|
|
371
|
+
"cr-0211ea1e8d3a3c79e" = "tertiary" # us-east-2c (p5.48xlarge, MIG-dedicated)
|
|
329
372
|
# A100 capacity reservation
|
|
330
373
|
"cr-01cc0f00f28b095af" = "primary" # us-east-2a
|
|
331
374
|
}
|
|
@@ -136,7 +136,7 @@ spec:
|
|
|
136
136
|
cpu: "2"
|
|
137
137
|
memory: "4Gi"
|
|
138
138
|
flags:
|
|
139
|
-
- --node-labels=NodeType=gpu,GpuType=${gpu_type},nvidia.com/gpu.deploy.driver=false${profiling_dedicated ? ",gpu.monitoring/profiling-dedicated=true,nvidia.com/gpu.deploy.dcgm-exporter=false" : ""}
|
|
139
|
+
- --node-labels=NodeType=gpu,GpuType=${gpu_type},nvidia.com/gpu.deploy.driver=false${profiling_dedicated ? ",gpu.monitoring/profiling-dedicated=true,nvidia.com/gpu.deploy.dcgm-exporter=false" : ""}${mig_profile != "" ? ",nvidia.com/mig.config=${mig_profile}" : ""}
|
|
140
140
|
EOF
|
|
141
141
|
|
|
142
142
|
# Configure EFA if hardware present (BEFORE nodeadm so kubelet sees hugepages)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.1 → gpu_dev-0.5.3}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|