gpu-dev 0.5.11__tar.gz → 0.5.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/.github/workflows/no-gitlinks.yml +1 -1
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/PKG-INFO +1 -1
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +2 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +17 -2
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +45 -31
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +3 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/pyproject.toml +1 -1
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/kubernetes.tf +8 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/index.py +9 -1
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda.tf +1 -1
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/main.tf +40 -0
- gpu_dev-0.5.13/terraform-gpu-devservers/mig-config.tf +55 -0
- gpu_dev-0.5.13/terraform-gpu-devservers/mig-parted-config.yaml +528 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/.gitignore +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/CLAUDE.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/PROGRESS.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/TODO.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/admin/README.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/post.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/setup.cfg +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -14,7 +14,7 @@ jobs:
|
|
|
14
14
|
uses: actions/checkout@v4
|
|
15
15
|
- name: Ensure no gitlinks are tracked
|
|
16
16
|
run: |
|
|
17
|
-
gitlinks=$(git ls-files -s | awk
|
|
17
|
+
gitlinks=$(git ls-files -s | awk '$1 == 160000 {print}')
|
|
18
18
|
if [ -n "$gitlinks" ]; then
|
|
19
19
|
echo "Unexpected gitlinks found:"
|
|
20
20
|
echo "$gitlinks"
|
|
@@ -47,6 +47,8 @@ terraform-gpu-devservers/git-cache.tf
|
|
|
47
47
|
terraform-gpu-devservers/kubernetes.tf
|
|
48
48
|
terraform-gpu-devservers/lambda.tf
|
|
49
49
|
terraform-gpu-devservers/main.tf
|
|
50
|
+
terraform-gpu-devservers/mig-config.tf
|
|
51
|
+
terraform-gpu-devservers/mig-parted-config.yaml
|
|
50
52
|
terraform-gpu-devservers/monitoring.tf
|
|
51
53
|
terraform-gpu-devservers/outputs.tf
|
|
52
54
|
terraform-gpu-devservers/pyproject.toml
|
|
@@ -495,9 +495,9 @@ def main(ctx: click.Context) -> None:
|
|
|
495
495
|
"--gpu-type",
|
|
496
496
|
"-t",
|
|
497
497
|
type=click.Choice(
|
|
498
|
-
["b200", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
498
|
+
["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
499
499
|
),
|
|
500
|
-
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices (
|
|
500
|
+
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
|
|
501
501
|
)
|
|
502
502
|
@click.option(
|
|
503
503
|
"--hours",
|
|
@@ -656,6 +656,9 @@ def reserve(
|
|
|
656
656
|
"h100-mig-1g": {"max_gpus": 16, "instance_type": "p5.48xlarge"},
|
|
657
657
|
"h100-mig-2g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
|
|
658
658
|
"h100-mig-3g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
|
|
659
|
+
"b200-mig-1g": {"max_gpus": 4, "instance_type": "p6-b200.48xlarge"},
|
|
660
|
+
"b200-mig-2g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
|
|
661
|
+
"b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
|
|
659
662
|
"h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
|
|
660
663
|
"b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
|
|
661
664
|
"cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
|
|
@@ -2454,6 +2457,9 @@ def _show_availability() -> None:
|
|
|
2454
2457
|
"h100-mig-1g": "Hopper (sm90, MIG 10GB)",
|
|
2455
2458
|
"h100-mig-2g": "Hopper (sm90, MIG 20GB)",
|
|
2456
2459
|
"h100-mig-3g": "Hopper (sm90, MIG 40GB)",
|
|
2460
|
+
"b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
|
|
2461
|
+
"b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
|
|
2462
|
+
"b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
|
|
2457
2463
|
"t4": "Turing (sm75)",
|
|
2458
2464
|
"cpu-x86": "CPU (x86_64)",
|
|
2459
2465
|
"cpu-arm": "CPU (arm64)",
|
|
@@ -2462,6 +2468,9 @@ def _show_availability() -> None:
|
|
|
2462
2468
|
# Sort order: newest GPU architectures first, then CPUs at the bottom
|
|
2463
2469
|
arch_priority = {
|
|
2464
2470
|
"Blackwell (sm100)": 0,
|
|
2471
|
+
"Blackwell (sm100, MIG 90GB)": 0,
|
|
2472
|
+
"Blackwell (sm100, MIG 45GB)": 0,
|
|
2473
|
+
"Blackwell (sm100, MIG 23GB)": 0,
|
|
2465
2474
|
"Blackwell (sm120)": 0,
|
|
2466
2475
|
"Hopper (sm90)": 1,
|
|
2467
2476
|
"Hopper (sm90, MIG 40GB)": 1,
|
|
@@ -2609,6 +2618,9 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2609
2618
|
"h100-mig-1g": "Hopper (sm90, MIG 10GB)",
|
|
2610
2619
|
"h100-mig-2g": "Hopper (sm90, MIG 20GB)",
|
|
2611
2620
|
"h100-mig-3g": "Hopper (sm90, MIG 40GB)",
|
|
2621
|
+
"b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
|
|
2622
|
+
"b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
|
|
2623
|
+
"b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
|
|
2612
2624
|
"t4": "Turing (sm75)",
|
|
2613
2625
|
"cpu-x86": "CPU (x86_64)",
|
|
2614
2626
|
"cpu-arm": "CPU (arm64)",
|
|
@@ -2617,6 +2629,9 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2617
2629
|
# Sort order: newest GPU architectures first, then CPUs at the bottom
|
|
2618
2630
|
arch_priority = {
|
|
2619
2631
|
"Blackwell (sm100)": 0,
|
|
2632
|
+
"Blackwell (sm100, MIG 90GB)": 0,
|
|
2633
|
+
"Blackwell (sm100, MIG 45GB)": 0,
|
|
2634
|
+
"Blackwell (sm100, MIG 23GB)": 0,
|
|
2620
2635
|
"Blackwell (sm120)": 0,
|
|
2621
2636
|
"Hopper (sm90)": 1,
|
|
2622
2637
|
"Hopper (sm90, MIG 40GB)": 1,
|
|
@@ -64,17 +64,25 @@ def select_gpu_type_interactive(
|
|
|
64
64
|
if "-mig-" not in gt
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
-
# Aggregate MIG slice availability
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
67
|
+
# Aggregate MIG slice availability per parent type, hinted on the h100/b200 rows.
|
|
68
|
+
def _mig_aggregates(parent: str):
|
|
69
|
+
avail = sum(
|
|
70
|
+
int(info.get("available", 0))
|
|
71
|
+
for gt, info in (availability_info or {}).items()
|
|
72
|
+
if gt.startswith(f"{parent}-mig-")
|
|
73
|
+
)
|
|
74
|
+
cap = sum(
|
|
75
|
+
int(info.get("total", 0))
|
|
76
|
+
for gt, info in (availability_info or {}).items()
|
|
77
|
+
if gt.startswith(f"{parent}-mig-")
|
|
78
|
+
)
|
|
79
|
+
return avail, cap
|
|
80
|
+
|
|
81
|
+
h100_mig_avail, h100_mig_capacity = _mig_aggregates("h100")
|
|
82
|
+
b200_mig_avail, b200_mig_capacity = _mig_aggregates("b200")
|
|
83
|
+
# Backwards-compat aliases for the existing h100 row code below.
|
|
84
|
+
mig_total_available = h100_mig_avail
|
|
85
|
+
mig_total_capacity = h100_mig_capacity
|
|
78
86
|
|
|
79
87
|
# Display availability table first
|
|
80
88
|
console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
|
|
@@ -146,6 +154,8 @@ def select_gpu_type_interactive(
|
|
|
146
154
|
choice_label += f" - {queue_length} in queue"
|
|
147
155
|
if gpu_type == "h100" and mig_total_capacity > 0:
|
|
148
156
|
choice_label += f" — also {mig_total_available}/{mig_total_capacity} MIG slices"
|
|
157
|
+
elif gpu_type == "b200" and b200_mig_capacity > 0:
|
|
158
|
+
choice_label += f" — also {b200_mig_avail}/{b200_mig_capacity} MIG slices"
|
|
149
159
|
|
|
150
160
|
choices.append(questionary.Choice(title=choice_label, value=gpu_type))
|
|
151
161
|
|
|
@@ -223,27 +233,31 @@ def select_gpu_count_interactive(
|
|
|
223
233
|
parent_size_etas = parent_info.get("size_etas", {}) or {}
|
|
224
234
|
_now_ts = int(_time.time())
|
|
225
235
|
|
|
226
|
-
# MIG slice submenu:
|
|
236
|
+
# MIG slice submenu: h100 (16+8+8 slices/node) or b200 (4+2+2 slices/node).
|
|
227
237
|
mig_options = []
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
("h100-mig-
|
|
232
|
-
("h100-mig-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
238
|
+
mig_spec_map = {
|
|
239
|
+
"h100": [
|
|
240
|
+
("h100-mig-1g", "10GB", 16),
|
|
241
|
+
("h100-mig-2g", "20GB", 8),
|
|
242
|
+
("h100-mig-3g", "40GB", 8),
|
|
243
|
+
],
|
|
244
|
+
"b200": [
|
|
245
|
+
("b200-mig-1g", "23GB", 4),
|
|
246
|
+
("b200-mig-2g", "45GB", 2),
|
|
247
|
+
("b200-mig-3g", "90GB", 2),
|
|
248
|
+
],
|
|
249
|
+
}
|
|
250
|
+
for sku, gb, slice_max in mig_spec_map.get(gpu_type, []):
|
|
251
|
+
free = None
|
|
252
|
+
if availability_info and sku in availability_info:
|
|
253
|
+
free = availability_info[sku].get("available", 0)
|
|
254
|
+
for n in [1, 2, 4]:
|
|
255
|
+
if n > slice_max:
|
|
256
|
+
continue
|
|
257
|
+
noun = "slice" if n == 1 else "slices"
|
|
258
|
+
avail_suffix = f" [{free} free]" if free is not None else ""
|
|
259
|
+
label = f"{n} × {gb} {noun}{avail_suffix}"
|
|
260
|
+
mig_options.append((sku, n, label))
|
|
247
261
|
|
|
248
262
|
# Filter single-node by actual max for this GPU type
|
|
249
263
|
valid_counts = [count for count in valid_counts if count <= max_gpus]
|
|
@@ -543,6 +543,9 @@ class ReservationManager:
|
|
|
543
543
|
"h100-mig-1g": {"max_gpus": 16},
|
|
544
544
|
"h100-mig-2g": {"max_gpus": 8},
|
|
545
545
|
"h100-mig-3g": {"max_gpus": 8},
|
|
546
|
+
"b200-mig-1g": {"max_gpus": 4},
|
|
547
|
+
"b200-mig-2g": {"max_gpus": 2},
|
|
548
|
+
"b200-mig-3g": {"max_gpus": 2},
|
|
546
549
|
"h200": {"max_gpus": 8},
|
|
547
550
|
"b200": {"max_gpus": 8},
|
|
548
551
|
}
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.11"
|
|
7
|
+
version = "0.5.13"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -305,6 +305,14 @@ resource "helm_release" "nvidia_gpu_operator" {
|
|
|
305
305
|
value = "all-disabled"
|
|
306
306
|
}
|
|
307
307
|
|
|
308
|
+
# Read profiles from our forked ConfigMap (managed in mig-config.tf) instead of the
|
|
309
|
+
# operator's auto-created default-mig-parted-config. Lets us add custom mixed profiles
|
|
310
|
+
# like b200-6full-2mig-balanced without ClusterPolicy reconciliation reverting our edits.
|
|
311
|
+
set {
|
|
312
|
+
name = "migManager.config.name"
|
|
313
|
+
value = "gpu-dev-mig-parted-config"
|
|
314
|
+
}
|
|
315
|
+
|
|
308
316
|
set {
|
|
309
317
|
name = "nodeStatusExporter.enabled"
|
|
310
318
|
value = "true"
|
{gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -71,6 +71,10 @@ GPU_CONFIG = {
|
|
|
71
71
|
"h100-mig-1g": {"instance_type": "p5.48xlarge", "max_gpus": 16, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.10gb", "node_gpu_type": "h100"},
|
|
72
72
|
"h100-mig-2g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.20gb", "node_gpu_type": "h100"},
|
|
73
73
|
"h100-mig-3g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.40gb", "node_gpu_type": "h100"},
|
|
74
|
+
# B200 MIG slices on the b200-6full-2mig-balanced node (6 full GPUs + 2 partitioned per node).
|
|
75
|
+
"b200-mig-1g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 4, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.23gb", "node_gpu_type": "b200"},
|
|
76
|
+
"b200-mig-2g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.45gb", "node_gpu_type": "b200"},
|
|
77
|
+
"b200-mig-3g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.90gb", "node_gpu_type": "b200"},
|
|
74
78
|
"t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
75
79
|
"g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
76
80
|
"a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
|
|
@@ -2167,7 +2171,8 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
|
|
|
2167
2171
|
# Validate GPU type
|
|
2168
2172
|
valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
|
|
2169
2173
|
"h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
|
|
2170
|
-
"h200", "b200", "cpu-arm", "cpu-x86"]
|
|
2174
|
+
"h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
|
|
2175
|
+
"cpu-arm", "cpu-x86"]
|
|
2171
2176
|
if gpu_type not in valid_gpu_types:
|
|
2172
2177
|
error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
|
|
2173
2178
|
logger.error(error_msg)
|
|
@@ -2408,6 +2413,9 @@ def update_gpu_availability_table(
|
|
|
2408
2413
|
"h100-mig-1g": {"gpus_per_instance": 16},
|
|
2409
2414
|
"h100-mig-2g": {"gpus_per_instance": 8},
|
|
2410
2415
|
"h100-mig-3g": {"gpus_per_instance": 8},
|
|
2416
|
+
"b200-mig-1g": {"gpus_per_instance": 4},
|
|
2417
|
+
"b200-mig-2g": {"gpus_per_instance": 2},
|
|
2418
|
+
"b200-mig-3g": {"gpus_per_instance": 2},
|
|
2411
2419
|
"h200": {"gpus_per_instance": 8},
|
|
2412
2420
|
"b200": {"gpus_per_instance": 8},
|
|
2413
2421
|
}
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.13"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.9"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
@@ -255,6 +255,46 @@ locals {
|
|
|
255
255
|
k8s_resource = "nvidia.com/mig-3g.40gb"
|
|
256
256
|
node_gpu_type = "h100"
|
|
257
257
|
}
|
|
258
|
+
# B200 MIG slices — virtual SKUs backed by ONE B200 node labelled with the custom
|
|
259
|
+
# mig_profile "b200-6full-2mig-balanced": GPUs 0-5 stay as full B200 (still reservable
|
|
260
|
+
# via --gpu-type b200), GPUs 6-7 get partitioned per-GPU into 2x1g.23gb + 1x2g.45gb +
|
|
261
|
+
# 1x3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large slices.
|
|
262
|
+
"b200-mig-1g" = {
|
|
263
|
+
instance_type = null
|
|
264
|
+
instance_types = null
|
|
265
|
+
instance_count = 0
|
|
266
|
+
gpus_per_instance = 4 # 2 partitioned GPUs * 2 slices each
|
|
267
|
+
use_placement_group = false
|
|
268
|
+
architecture = "x86_64"
|
|
269
|
+
efa_network_cards = 0
|
|
270
|
+
virtual = true
|
|
271
|
+
k8s_resource = "nvidia.com/mig-1g.23gb"
|
|
272
|
+
node_gpu_type = "b200"
|
|
273
|
+
}
|
|
274
|
+
"b200-mig-2g" = {
|
|
275
|
+
instance_type = null
|
|
276
|
+
instance_types = null
|
|
277
|
+
instance_count = 0
|
|
278
|
+
gpus_per_instance = 2 # 2 partitioned GPUs * 1 slice each
|
|
279
|
+
use_placement_group = false
|
|
280
|
+
architecture = "x86_64"
|
|
281
|
+
efa_network_cards = 0
|
|
282
|
+
virtual = true
|
|
283
|
+
k8s_resource = "nvidia.com/mig-2g.45gb"
|
|
284
|
+
node_gpu_type = "b200"
|
|
285
|
+
}
|
|
286
|
+
"b200-mig-3g" = {
|
|
287
|
+
instance_type = null
|
|
288
|
+
instance_types = null
|
|
289
|
+
instance_count = 0
|
|
290
|
+
gpus_per_instance = 2 # 2 partitioned GPUs * 1 slice each
|
|
291
|
+
use_placement_group = false
|
|
292
|
+
architecture = "x86_64"
|
|
293
|
+
efa_network_cards = 0
|
|
294
|
+
virtual = true
|
|
295
|
+
k8s_resource = "nvidia.com/mig-3g.90gb"
|
|
296
|
+
node_gpu_type = "b200"
|
|
297
|
+
}
|
|
258
298
|
"cpu-arm" = {
|
|
259
299
|
instance_type = "c7g.8xlarge"
|
|
260
300
|
instance_types = null
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# mig-config.tf — fork the NVIDIA mig-parted-config ConfigMap so we can add custom profiles
|
|
2
|
+
# without fighting NVIDIA ClusterPolicy's reconciliation of its default-mig-parted-config.
|
|
3
|
+
#
|
|
4
|
+
# The vendored mig-parted-config.yaml in this directory mirrors the upstream profiles plus our
|
|
5
|
+
# additions (e.g. b200-6full-2mig-balanced). Helm is told to use this ConfigMap by name via
|
|
6
|
+
# migManager.config.name in kubernetes.tf, so the GPU operator skips creating its default and
|
|
7
|
+
# reads ours instead.
|
|
8
|
+
|
|
9
|
+
resource "kubernetes_config_map" "gpu_dev_mig_parted_config" {
|
|
10
|
+
metadata {
|
|
11
|
+
name = "gpu-dev-mig-parted-config"
|
|
12
|
+
namespace = "gpu-operator"
|
|
13
|
+
labels = {
|
|
14
|
+
"app.kubernetes.io/managed-by" = "terraform"
|
|
15
|
+
"app.kubernetes.io/part-of" = "gpu-dev-servers"
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
data = {
|
|
20
|
+
"config.yaml" = file("${path.module}/mig-parted-config.yaml")
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
# The gpu-operator namespace is created by the helm release; depend on that so this ConfigMap
|
|
24
|
+
# lands AFTER the namespace exists.
|
|
25
|
+
depends_on = [helm_release.nvidia_gpu_operator]
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
# Optional declarative B200 MIG node label. Set b200_mig_node_name in tfvars (or override the
|
|
29
|
+
# variable's default below) to dedicate a specific B200 node to the mixed profile. Empty string
|
|
30
|
+
# means "no node currently labelled" — the existing all-disabled stays in effect.
|
|
31
|
+
variable "b200_mig_node_name" {
|
|
32
|
+
description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to skip."
|
|
33
|
+
type = string
|
|
34
|
+
default = ""
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
resource "kubernetes_labels" "b200_mig_node" {
|
|
38
|
+
count = var.b200_mig_node_name == "" ? 0 : 1
|
|
39
|
+
|
|
40
|
+
api_version = "v1"
|
|
41
|
+
kind = "Node"
|
|
42
|
+
|
|
43
|
+
metadata {
|
|
44
|
+
name = var.b200_mig_node_name
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
labels = {
|
|
48
|
+
"nvidia.com/mig.config" = "b200-6full-2mig-balanced"
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
# Take ownership of the label even if another tool (kubectl, gpu-operator) set it.
|
|
52
|
+
force = true
|
|
53
|
+
|
|
54
|
+
depends_on = [kubernetes_config_map.gpu_dev_mig_parted_config]
|
|
55
|
+
}
|