gpu-dev 0.5.9__tar.gz → 0.5.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/.github/workflows/no-gitlinks.yml +1 -1
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/PKG-INFO +1 -1
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +1 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +17 -2
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +45 -31
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +3 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/pyproject.toml +1 -1
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_processor/index.py +12 -4
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda.tf +2 -2
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/main.tf +40 -0
- gpu_dev-0.5.12/terraform-gpu-devservers/scripts/b200-mig-setup.sh +75 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/.gitignore +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/CLAUDE.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/PROGRESS.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/TODO.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/admin/README.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/post.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/setup.cfg +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/kubernetes.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -14,7 +14,7 @@ jobs:
|
|
|
14
14
|
uses: actions/checkout@v4
|
|
15
15
|
- name: Ensure no gitlinks are tracked
|
|
16
16
|
run: |
|
|
17
|
-
gitlinks=$(git ls-files -s | awk
|
|
17
|
+
gitlinks=$(git ls-files -s | awk '$1 == 160000 {print}')
|
|
18
18
|
if [ -n "$gitlinks" ]; then
|
|
19
19
|
echo "Unexpected gitlinks found:"
|
|
20
20
|
echo "$gitlinks"
|
|
@@ -101,6 +101,7 @@ terraform-gpu-devservers/migrations/check_snapshots.py
|
|
|
101
101
|
terraform-gpu-devservers/migrations/migrate_disks_to_named.py
|
|
102
102
|
terraform-gpu-devservers/migrations/run_backfill.sh
|
|
103
103
|
terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md
|
|
104
|
+
terraform-gpu-devservers/scripts/b200-mig-setup.sh
|
|
104
105
|
terraform-gpu-devservers/scripts/detect_empty_volumes.sh
|
|
105
106
|
terraform-gpu-devservers/scripts/ec2_avail_probe.sh
|
|
106
107
|
terraform-gpu-devservers/scripts/inspect_user_data.sh
|
|
@@ -495,9 +495,9 @@ def main(ctx: click.Context) -> None:
|
|
|
495
495
|
"--gpu-type",
|
|
496
496
|
"-t",
|
|
497
497
|
type=click.Choice(
|
|
498
|
-
["b200", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
498
|
+
["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
|
|
499
499
|
),
|
|
500
|
-
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices (
|
|
500
|
+
help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
|
|
501
501
|
)
|
|
502
502
|
@click.option(
|
|
503
503
|
"--hours",
|
|
@@ -656,6 +656,9 @@ def reserve(
|
|
|
656
656
|
"h100-mig-1g": {"max_gpus": 16, "instance_type": "p5.48xlarge"},
|
|
657
657
|
"h100-mig-2g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
|
|
658
658
|
"h100-mig-3g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
|
|
659
|
+
"b200-mig-1g": {"max_gpus": 4, "instance_type": "p6-b200.48xlarge"},
|
|
660
|
+
"b200-mig-2g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
|
|
661
|
+
"b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
|
|
659
662
|
"h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
|
|
660
663
|
"b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
|
|
661
664
|
"cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
|
|
@@ -2454,6 +2457,9 @@ def _show_availability() -> None:
|
|
|
2454
2457
|
"h100-mig-1g": "Hopper (sm90, MIG 10GB)",
|
|
2455
2458
|
"h100-mig-2g": "Hopper (sm90, MIG 20GB)",
|
|
2456
2459
|
"h100-mig-3g": "Hopper (sm90, MIG 40GB)",
|
|
2460
|
+
"b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
|
|
2461
|
+
"b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
|
|
2462
|
+
"b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
|
|
2457
2463
|
"t4": "Turing (sm75)",
|
|
2458
2464
|
"cpu-x86": "CPU (x86_64)",
|
|
2459
2465
|
"cpu-arm": "CPU (arm64)",
|
|
@@ -2462,6 +2468,9 @@ def _show_availability() -> None:
|
|
|
2462
2468
|
# Sort order: newest GPU architectures first, then CPUs at the bottom
|
|
2463
2469
|
arch_priority = {
|
|
2464
2470
|
"Blackwell (sm100)": 0,
|
|
2471
|
+
"Blackwell (sm100, MIG 90GB)": 0,
|
|
2472
|
+
"Blackwell (sm100, MIG 45GB)": 0,
|
|
2473
|
+
"Blackwell (sm100, MIG 23GB)": 0,
|
|
2465
2474
|
"Blackwell (sm120)": 0,
|
|
2466
2475
|
"Hopper (sm90)": 1,
|
|
2467
2476
|
"Hopper (sm90, MIG 40GB)": 1,
|
|
@@ -2609,6 +2618,9 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2609
2618
|
"h100-mig-1g": "Hopper (sm90, MIG 10GB)",
|
|
2610
2619
|
"h100-mig-2g": "Hopper (sm90, MIG 20GB)",
|
|
2611
2620
|
"h100-mig-3g": "Hopper (sm90, MIG 40GB)",
|
|
2621
|
+
"b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
|
|
2622
|
+
"b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
|
|
2623
|
+
"b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
|
|
2612
2624
|
"t4": "Turing (sm75)",
|
|
2613
2625
|
"cpu-x86": "CPU (x86_64)",
|
|
2614
2626
|
"cpu-arm": "CPU (arm64)",
|
|
@@ -2617,6 +2629,9 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2617
2629
|
# Sort order: newest GPU architectures first, then CPUs at the bottom
|
|
2618
2630
|
arch_priority = {
|
|
2619
2631
|
"Blackwell (sm100)": 0,
|
|
2632
|
+
"Blackwell (sm100, MIG 90GB)": 0,
|
|
2633
|
+
"Blackwell (sm100, MIG 45GB)": 0,
|
|
2634
|
+
"Blackwell (sm100, MIG 23GB)": 0,
|
|
2620
2635
|
"Blackwell (sm120)": 0,
|
|
2621
2636
|
"Hopper (sm90)": 1,
|
|
2622
2637
|
"Hopper (sm90, MIG 40GB)": 1,
|
|
@@ -64,17 +64,25 @@ def select_gpu_type_interactive(
|
|
|
64
64
|
if "-mig-" not in gt
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
-
# Aggregate MIG slice availability
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
67
|
+
# Aggregate MIG slice availability per parent type, hinted on the h100/b200 rows.
|
|
68
|
+
def _mig_aggregates(parent: str):
|
|
69
|
+
avail = sum(
|
|
70
|
+
int(info.get("available", 0))
|
|
71
|
+
for gt, info in (availability_info or {}).items()
|
|
72
|
+
if gt.startswith(f"{parent}-mig-")
|
|
73
|
+
)
|
|
74
|
+
cap = sum(
|
|
75
|
+
int(info.get("total", 0))
|
|
76
|
+
for gt, info in (availability_info or {}).items()
|
|
77
|
+
if gt.startswith(f"{parent}-mig-")
|
|
78
|
+
)
|
|
79
|
+
return avail, cap
|
|
80
|
+
|
|
81
|
+
h100_mig_avail, h100_mig_capacity = _mig_aggregates("h100")
|
|
82
|
+
b200_mig_avail, b200_mig_capacity = _mig_aggregates("b200")
|
|
83
|
+
# Backwards-compat aliases for the existing h100 row code below.
|
|
84
|
+
mig_total_available = h100_mig_avail
|
|
85
|
+
mig_total_capacity = h100_mig_capacity
|
|
78
86
|
|
|
79
87
|
# Display availability table first
|
|
80
88
|
console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
|
|
@@ -146,6 +154,8 @@ def select_gpu_type_interactive(
|
|
|
146
154
|
choice_label += f" - {queue_length} in queue"
|
|
147
155
|
if gpu_type == "h100" and mig_total_capacity > 0:
|
|
148
156
|
choice_label += f" — also {mig_total_available}/{mig_total_capacity} MIG slices"
|
|
157
|
+
elif gpu_type == "b200" and b200_mig_capacity > 0:
|
|
158
|
+
choice_label += f" — also {b200_mig_avail}/{b200_mig_capacity} MIG slices"
|
|
149
159
|
|
|
150
160
|
choices.append(questionary.Choice(title=choice_label, value=gpu_type))
|
|
151
161
|
|
|
@@ -223,27 +233,31 @@ def select_gpu_count_interactive(
|
|
|
223
233
|
parent_size_etas = parent_info.get("size_etas", {}) or {}
|
|
224
234
|
_now_ts = int(_time.time())
|
|
225
235
|
|
|
226
|
-
# MIG slice submenu:
|
|
236
|
+
# MIG slice submenu: h100 (16+8+8 slices/node) or b200 (4+2+2 slices/node).
|
|
227
237
|
mig_options = []
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
("h100-mig-
|
|
232
|
-
("h100-mig-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
238
|
+
mig_spec_map = {
|
|
239
|
+
"h100": [
|
|
240
|
+
("h100-mig-1g", "10GB", 16),
|
|
241
|
+
("h100-mig-2g", "20GB", 8),
|
|
242
|
+
("h100-mig-3g", "40GB", 8),
|
|
243
|
+
],
|
|
244
|
+
"b200": [
|
|
245
|
+
("b200-mig-1g", "23GB", 4),
|
|
246
|
+
("b200-mig-2g", "45GB", 2),
|
|
247
|
+
("b200-mig-3g", "90GB", 2),
|
|
248
|
+
],
|
|
249
|
+
}
|
|
250
|
+
for sku, gb, slice_max in mig_spec_map.get(gpu_type, []):
|
|
251
|
+
free = None
|
|
252
|
+
if availability_info and sku in availability_info:
|
|
253
|
+
free = availability_info[sku].get("available", 0)
|
|
254
|
+
for n in [1, 2, 4]:
|
|
255
|
+
if n > slice_max:
|
|
256
|
+
continue
|
|
257
|
+
noun = "slice" if n == 1 else "slices"
|
|
258
|
+
avail_suffix = f" [{free} free]" if free is not None else ""
|
|
259
|
+
label = f"{n} × {gb} {noun}{avail_suffix}"
|
|
260
|
+
mig_options.append((sku, n, label))
|
|
247
261
|
|
|
248
262
|
# Filter single-node by actual max for this GPU type
|
|
249
263
|
valid_counts = [count for count in valid_counts if count <= max_gpus]
|
|
@@ -543,6 +543,9 @@ class ReservationManager:
|
|
|
543
543
|
"h100-mig-1g": {"max_gpus": 16},
|
|
544
544
|
"h100-mig-2g": {"max_gpus": 8},
|
|
545
545
|
"h100-mig-3g": {"max_gpus": 8},
|
|
546
|
+
"b200-mig-1g": {"max_gpus": 4},
|
|
547
|
+
"b200-mig-2g": {"max_gpus": 2},
|
|
548
|
+
"b200-mig-3g": {"max_gpus": 2},
|
|
546
549
|
"h200": {"max_gpus": 8},
|
|
547
550
|
"b200": {"max_gpus": 8},
|
|
548
551
|
}
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.9"
|
|
7
|
+
version = "0.5.12"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
@@ -71,6 +71,10 @@ GPU_CONFIG = {
|
|
|
71
71
|
"h100-mig-1g": {"instance_type": "p5.48xlarge", "max_gpus": 16, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.10gb", "node_gpu_type": "h100"},
|
|
72
72
|
"h100-mig-2g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.20gb", "node_gpu_type": "h100"},
|
|
73
73
|
"h100-mig-3g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.40gb", "node_gpu_type": "h100"},
|
|
74
|
+
# B200 MIG slices on the b200-6full-2mig-balanced node (6 full GPUs + 2 partitioned per node).
|
|
75
|
+
"b200-mig-1g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 4, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.23gb", "node_gpu_type": "b200"},
|
|
76
|
+
"b200-mig-2g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.45gb", "node_gpu_type": "b200"},
|
|
77
|
+
"b200-mig-3g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.90gb", "node_gpu_type": "b200"},
|
|
74
78
|
"t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
75
79
|
"g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
|
|
76
80
|
"a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
|
|
@@ -323,12 +327,12 @@ def get_target_az_for_reservation(gpu_type, gpus_requested):
|
|
|
323
327
|
return target_az
|
|
324
328
|
|
|
325
329
|
logger.warning(f"No ready/schedulable {gpu_type} nodes found in cluster")
|
|
326
|
-
return None
|
|
330
|
+
return None, None
|
|
327
331
|
|
|
328
332
|
except Exception as e:
|
|
329
333
|
logger.error(f"Error determining target AZ for {gpu_type}: {str(e)}")
|
|
330
|
-
# Fallback to primary AZ if detection fails
|
|
331
|
-
return PRIMARY_AVAILABILITY_ZONE
|
|
334
|
+
# Fallback to primary AZ if detection fails (no node hint — let k8s pick).
|
|
335
|
+
return PRIMARY_AVAILABILITY_ZONE, None
|
|
332
336
|
|
|
333
337
|
|
|
334
338
|
def check_for_multiple_volumes(user_id):
|
|
@@ -2167,7 +2171,8 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
|
|
|
2167
2171
|
# Validate GPU type
|
|
2168
2172
|
valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
|
|
2169
2173
|
"h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
|
|
2170
|
-
"h200", "b200", "cpu-arm", "cpu-x86"]
|
|
2174
|
+
"h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
|
|
2175
|
+
"cpu-arm", "cpu-x86"]
|
|
2171
2176
|
if gpu_type not in valid_gpu_types:
|
|
2172
2177
|
error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
|
|
2173
2178
|
logger.error(error_msg)
|
|
@@ -2408,6 +2413,9 @@ def update_gpu_availability_table(
|
|
|
2408
2413
|
"h100-mig-1g": {"gpus_per_instance": 16},
|
|
2409
2414
|
"h100-mig-2g": {"gpus_per_instance": 8},
|
|
2410
2415
|
"h100-mig-3g": {"gpus_per_instance": 8},
|
|
2416
|
+
"b200-mig-1g": {"gpus_per_instance": 4},
|
|
2417
|
+
"b200-mig-2g": {"gpus_per_instance": 2},
|
|
2418
|
+
"b200-mig-3g": {"gpus_per_instance": 2},
|
|
2411
2419
|
"h200": {"gpus_per_instance": 8},
|
|
2412
2420
|
"b200": {"gpus_per_instance": 8},
|
|
2413
2421
|
}
|
|
@@ -180,8 +180,8 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
184
|
-
MIN_CLI_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.12"
|
|
184
|
+
MIN_CLI_VERSION = "0.5.9"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
187
187
|
}, local.alb_env_vars)
|
|
@@ -255,6 +255,46 @@ locals {
|
|
|
255
255
|
k8s_resource = "nvidia.com/mig-3g.40gb"
|
|
256
256
|
node_gpu_type = "h100"
|
|
257
257
|
}
|
|
258
|
+
# B200 MIG slices — virtual SKUs backed by ONE B200 node labelled with the custom
|
|
259
|
+
# mig_profile "b200-6full-2mig-balanced": GPUs 0-5 stay as full B200 (still reservable
|
|
260
|
+
# via --gpu-type b200), GPUs 6-7 get partitioned per-GPU into 2x1g.23gb + 1x2g.45gb +
|
|
261
|
+
# 1x3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large slices.
|
|
262
|
+
"b200-mig-1g" = {
|
|
263
|
+
instance_type = null
|
|
264
|
+
instance_types = null
|
|
265
|
+
instance_count = 0
|
|
266
|
+
gpus_per_instance = 4 # 2 partitioned GPUs * 2 slices each
|
|
267
|
+
use_placement_group = false
|
|
268
|
+
architecture = "x86_64"
|
|
269
|
+
efa_network_cards = 0
|
|
270
|
+
virtual = true
|
|
271
|
+
k8s_resource = "nvidia.com/mig-1g.23gb"
|
|
272
|
+
node_gpu_type = "b200"
|
|
273
|
+
}
|
|
274
|
+
"b200-mig-2g" = {
|
|
275
|
+
instance_type = null
|
|
276
|
+
instance_types = null
|
|
277
|
+
instance_count = 0
|
|
278
|
+
gpus_per_instance = 2 # 2 partitioned GPUs * 1 slice each
|
|
279
|
+
use_placement_group = false
|
|
280
|
+
architecture = "x86_64"
|
|
281
|
+
efa_network_cards = 0
|
|
282
|
+
virtual = true
|
|
283
|
+
k8s_resource = "nvidia.com/mig-2g.45gb"
|
|
284
|
+
node_gpu_type = "b200"
|
|
285
|
+
}
|
|
286
|
+
"b200-mig-3g" = {
|
|
287
|
+
instance_type = null
|
|
288
|
+
instance_types = null
|
|
289
|
+
instance_count = 0
|
|
290
|
+
gpus_per_instance = 2 # 2 partitioned GPUs * 1 slice each
|
|
291
|
+
use_placement_group = false
|
|
292
|
+
architecture = "x86_64"
|
|
293
|
+
efa_network_cards = 0
|
|
294
|
+
virtual = true
|
|
295
|
+
k8s_resource = "nvidia.com/mig-3g.90gb"
|
|
296
|
+
node_gpu_type = "b200"
|
|
297
|
+
}
|
|
258
298
|
"cpu-arm" = {
|
|
259
299
|
instance_type = "c7g.8xlarge"
|
|
260
300
|
instance_types = null
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Post-deploy setup for B200 MIG split (6 full + 2 partitioned per node).
|
|
3
|
+
# Run ONCE after PR #77 is merged + tf applied + the new docker/lambda is live.
|
|
4
|
+
|
|
5
|
+
set -e
|
|
6
|
+
|
|
7
|
+
NS=gpu-operator
|
|
8
|
+
CM=default-mig-parted-config
|
|
9
|
+
PROFILE_NAME=b200-6full-2mig-balanced
|
|
10
|
+
|
|
11
|
+
echo "=== Checking current MIG profile in ConfigMap ==="
|
|
12
|
+
if kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' | grep -q "$PROFILE_NAME:"; then
|
|
13
|
+
echo "Profile $PROFILE_NAME already present — skipping ConfigMap edit"
|
|
14
|
+
else
|
|
15
|
+
echo "Profile $PROFILE_NAME missing. Patching ConfigMap..."
|
|
16
|
+
|
|
17
|
+
# Save current ConfigMap content
|
|
18
|
+
kubectl -n "$NS" get configmap "$CM" -o yaml > /tmp/mig-config-backup.yaml
|
|
19
|
+
echo "Backup saved to /tmp/mig-config-backup.yaml"
|
|
20
|
+
|
|
21
|
+
# Append our profile under mig-configs:
|
|
22
|
+
# NOTE: this is a sed-driven append. ClusterPolicy's controller MAY revert this if it
|
|
23
|
+
# reconciles. If you see the profile disappear, re-run this script. If it keeps reverting,
|
|
24
|
+
# we'll need to fork the ConfigMap (next iteration).
|
|
25
|
+
kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' > /tmp/mig-config.yaml
|
|
26
|
+
|
|
27
|
+
cat >> /tmp/mig-config.yaml <<'EOF'
|
|
28
|
+
|
|
29
|
+
# Mixed B200 split: GPUs 0-5 stay full (reservable as --gpu-type b200), GPUs 6-7 partitioned.
|
|
30
|
+
# Per partitioned GPU: 2x 1g.23gb + 1x 2g.45gb + 1x 3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large.
|
|
31
|
+
b200-6full-2mig-balanced:
|
|
32
|
+
- device-filter: ["0x290110DE"]
|
|
33
|
+
devices: [0, 1, 2, 3, 4, 5]
|
|
34
|
+
mig-enabled: false
|
|
35
|
+
- device-filter: ["0x290110DE"]
|
|
36
|
+
devices: [6, 7]
|
|
37
|
+
mig-enabled: true
|
|
38
|
+
mig-devices:
|
|
39
|
+
"1g.23gb": 2
|
|
40
|
+
"2g.45gb": 1
|
|
41
|
+
"3g.90gb": 1
|
|
42
|
+
EOF
|
|
43
|
+
|
|
44
|
+
# Re-encode and patch
|
|
45
|
+
kubectl -n "$NS" create configmap "$CM" --from-file=config.yaml=/tmp/mig-config.yaml --dry-run=client -o yaml \
|
|
46
|
+
| kubectl -n "$NS" patch configmap "$CM" --patch-file=/dev/stdin
|
|
47
|
+
echo "ConfigMap patched."
|
|
48
|
+
fi
|
|
49
|
+
|
|
50
|
+
echo
|
|
51
|
+
echo "=== Picking a B200 node to label ==="
|
|
52
|
+
NODE=$(kubectl get nodes -l GpuType=b200 -o jsonpath='{.items[0].metadata.name}')
|
|
53
|
+
if [ -z "$NODE" ]; then
|
|
54
|
+
echo "No B200 nodes found. Exiting."
|
|
55
|
+
exit 1
|
|
56
|
+
fi
|
|
57
|
+
echo "Will label: $NODE"
|
|
58
|
+
read -p "Proceed? (y/N): " CONFIRM
|
|
59
|
+
if [ "$CONFIRM" != "y" ]; then
|
|
60
|
+
echo "Aborted."
|
|
61
|
+
exit 0
|
|
62
|
+
fi
|
|
63
|
+
|
|
64
|
+
kubectl label node "$NODE" "nvidia.com/mig.config=$PROFILE_NAME" --overwrite
|
|
65
|
+
echo "Node labelled. nvidia-mig-manager will partition GPUs 6-7 (drains existing pods if any)."
|
|
66
|
+
echo
|
|
67
|
+
echo "Watch progress with:"
|
|
68
|
+
echo " kubectl logs -n gpu-operator -l app=nvidia-mig-manager -f"
|
|
69
|
+
echo " kubectl get node $NODE -o jsonpath='{.status.allocatable}' | jq ."
|
|
70
|
+
echo
|
|
71
|
+
echo "After ~2-5 min, allocatable should show:"
|
|
72
|
+
echo " nvidia.com/gpu: 6"
|
|
73
|
+
echo " nvidia.com/mig-1g.23gb: 4"
|
|
74
|
+
echo " nvidia.com/mig-2g.45gb: 2"
|
|
75
|
+
echo " nvidia.com/mig-3g.90gb: 2"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.9 → gpu_dev-0.5.12}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|