gpu-dev 0.5.12__tar.gz → 0.5.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/PKG-INFO +1 -1
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +2 -1
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +8 -9
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +2 -2
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/pyproject.toml +1 -1
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/kubernetes.tf +8 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda.tf +1 -1
- gpu_dev-0.5.14/terraform-gpu-devservers/mig-config.tf +72 -0
- gpu_dev-0.5.14/terraform-gpu-devservers/mig-parted-config.yaml +528 -0
- gpu_dev-0.5.12/terraform-gpu-devservers/scripts/b200-mig-setup.sh +0 -75
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/.gitignore +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/CLAUDE.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/PROGRESS.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/TODO.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/admin/README.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/post.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/setup.cfg +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -47,6 +47,8 @@ terraform-gpu-devservers/git-cache.tf
|
|
|
47
47
|
terraform-gpu-devservers/kubernetes.tf
|
|
48
48
|
terraform-gpu-devservers/lambda.tf
|
|
49
49
|
terraform-gpu-devservers/main.tf
|
|
50
|
+
terraform-gpu-devservers/mig-config.tf
|
|
51
|
+
terraform-gpu-devservers/mig-parted-config.yaml
|
|
50
52
|
terraform-gpu-devservers/monitoring.tf
|
|
51
53
|
terraform-gpu-devservers/outputs.tf
|
|
52
54
|
terraform-gpu-devservers/pyproject.toml
|
|
@@ -101,7 +103,6 @@ terraform-gpu-devservers/migrations/check_snapshots.py
|
|
|
101
103
|
terraform-gpu-devservers/migrations/migrate_disks_to_named.py
|
|
102
104
|
terraform-gpu-devservers/migrations/run_backfill.sh
|
|
103
105
|
terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md
|
|
104
|
-
terraform-gpu-devservers/scripts/b200-mig-setup.sh
|
|
105
106
|
terraform-gpu-devservers/scripts/detect_empty_volumes.sh
|
|
106
107
|
terraform-gpu-devservers/scripts/ec2_avail_probe.sh
|
|
107
108
|
terraform-gpu-devservers/scripts/inspect_user_data.sh
|
|
@@ -688,6 +688,7 @@ def reserve(
|
|
|
688
688
|
# and total wall-clock time drops from sum to max(each).
|
|
689
689
|
from concurrent.futures import ThreadPoolExecutor
|
|
690
690
|
config = load_config()
|
|
691
|
+
reservation_mgr = ReservationManager(config)
|
|
691
692
|
|
|
692
693
|
with Live(
|
|
693
694
|
Spinner("dots", text="🚀 Loading…"), console=console
|
|
@@ -704,9 +705,7 @@ def reserve(
|
|
|
704
705
|
else:
|
|
705
706
|
f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
|
|
706
707
|
ssh_result = None
|
|
707
|
-
f_avail = ex.submit(
|
|
708
|
-
lambda: ReservationManager(config).get_gpu_availability_by_type()
|
|
709
|
-
)
|
|
708
|
+
f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
|
|
710
709
|
|
|
711
710
|
# Surface auth failure first (most actionable).
|
|
712
711
|
try:
|
|
@@ -2496,10 +2495,10 @@ def _show_availability() -> None:
|
|
|
2496
2495
|
table = Table(
|
|
2497
2496
|
title="GPU Availability by Type (numbers are GPUs, not nodes)")
|
|
2498
2497
|
table.add_column("GPU Type", style="cyan")
|
|
2499
|
-
table.add_column("
|
|
2500
|
-
table.add_column("Max
|
|
2498
|
+
table.add_column("Avail", style="green")
|
|
2499
|
+
table.add_column("Max\nReservable", style="bright_green")
|
|
2501
2500
|
table.add_column("Total", style="blue")
|
|
2502
|
-
table.add_column("Queue
|
|
2501
|
+
table.add_column("Queue\nLength", style="yellow")
|
|
2503
2502
|
table.add_column("Architecture", style="dim")
|
|
2504
2503
|
table.add_column("Est. Wait Time", style="magenta")
|
|
2505
2504
|
|
|
@@ -2657,10 +2656,10 @@ def _show_availability_watch(interval: int) -> None:
|
|
|
2657
2656
|
table = Table(
|
|
2658
2657
|
title="GPU Availability by Type (numbers are GPUs, not nodes)")
|
|
2659
2658
|
table.add_column("GPU Type", style="cyan")
|
|
2660
|
-
table.add_column("
|
|
2661
|
-
table.add_column("Max
|
|
2659
|
+
table.add_column("Avail", style="green")
|
|
2660
|
+
table.add_column("Max\nReservable", style="blue")
|
|
2662
2661
|
table.add_column("Total", style="blue")
|
|
2663
|
-
table.add_column("Queue
|
|
2662
|
+
table.add_column("Queue\nLength", style="yellow")
|
|
2664
2663
|
table.add_column("Architecture", style="dim")
|
|
2665
2664
|
table.add_column("Est. Wait Time", style="magenta")
|
|
2666
2665
|
|
|
@@ -88,9 +88,9 @@ def select_gpu_type_interactive(
|
|
|
88
88
|
console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
|
|
89
89
|
table = Table()
|
|
90
90
|
table.add_column("GPU Type", style="cyan")
|
|
91
|
-
table.add_column("
|
|
91
|
+
table.add_column("Avail", style="green")
|
|
92
92
|
table.add_column("Total", style="blue")
|
|
93
|
-
table.add_column("Queue
|
|
93
|
+
table.add_column("Queue\nLength", style="yellow")
|
|
94
94
|
table.add_column("Est. Wait Time", style="magenta")
|
|
95
95
|
|
|
96
96
|
choices = []
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.12"
|
|
7
|
+
version = "0.5.14"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -305,6 +305,14 @@ resource "helm_release" "nvidia_gpu_operator" {
|
|
|
305
305
|
value = "all-disabled"
|
|
306
306
|
}
|
|
307
307
|
|
|
308
|
+
# Read profiles from our forked ConfigMap (managed in mig-config.tf) instead of the
|
|
309
|
+
# operator's auto-created default-mig-parted-config. Lets us add custom mixed profiles
|
|
310
|
+
# like b200-6full-2mig-balanced without ClusterPolicy reconciliation reverting our edits.
|
|
311
|
+
set {
|
|
312
|
+
name = "migManager.config.name"
|
|
313
|
+
value = "gpu-dev-mig-parted-config"
|
|
314
|
+
}
|
|
315
|
+
|
|
308
316
|
set {
|
|
309
317
|
name = "nodeStatusExporter.enabled"
|
|
310
318
|
value = "true"
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.
|
|
183
|
+
LAMBDA_VERSION = "0.5.13"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.9"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# mig-config.tf — fork the NVIDIA mig-parted-config ConfigMap so we can add custom profiles
|
|
2
|
+
# without fighting NVIDIA ClusterPolicy's reconciliation of its default-mig-parted-config.
|
|
3
|
+
#
|
|
4
|
+
# The vendored mig-parted-config.yaml in this directory mirrors the upstream profiles plus our
|
|
5
|
+
# additions (e.g. b200-6full-2mig-balanced). Helm is told to use this ConfigMap by name via
|
|
6
|
+
# migManager.config.name in kubernetes.tf, so the GPU operator skips creating its default and
|
|
7
|
+
# reads ours instead.
|
|
8
|
+
|
|
9
|
+
resource "kubernetes_config_map" "gpu_dev_mig_parted_config" {
|
|
10
|
+
metadata {
|
|
11
|
+
name = "gpu-dev-mig-parted-config"
|
|
12
|
+
namespace = "gpu-operator"
|
|
13
|
+
labels = {
|
|
14
|
+
"app.kubernetes.io/managed-by" = "terraform"
|
|
15
|
+
"app.kubernetes.io/part-of" = "gpu-dev-servers"
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
data = {
|
|
20
|
+
"config.yaml" = file("${path.module}/mig-parted-config.yaml")
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
# The gpu-operator namespace is created by the helm release; depend on that so this ConfigMap
|
|
24
|
+
# lands AFTER the namespace exists.
|
|
25
|
+
depends_on = [helm_release.nvidia_gpu_operator]
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
# Declarative B200 MIG node label. Set b200_mig_node_name (per workspace via the locals lookup
|
|
29
|
+
# below, or override via tfvars / -var) to dedicate a specific B200 node to the mixed profile.
|
|
30
|
+
# Empty string means "no node labelled" — every B200 stays full.
|
|
31
|
+
#
|
|
32
|
+
# Future cleanup: when we split a B200 CR into two ASGs (one with mig_profile, one without),
|
|
33
|
+
# the user_data path will set this label at boot for any instance in the MIG-dedicated ASG —
|
|
34
|
+
# matching the H100 cr3 pattern. Until then, this declarative label pins the role to a hostname.
|
|
35
|
+
locals {
|
|
36
|
+
# Workspace-scoped defaults so the resource is a no-op in non-prod and no apply ever tries to
|
|
37
|
+
# label a node that doesn't exist.
|
|
38
|
+
default_b200_mig_node_by_workspace = {
|
|
39
|
+
prod = "ip-10-0-67-125.us-east-2.compute.internal"
|
|
40
|
+
}
|
|
41
|
+
b200_mig_node_effective = (
|
|
42
|
+
var.b200_mig_node_name != ""
|
|
43
|
+
? var.b200_mig_node_name
|
|
44
|
+
: lookup(local.default_b200_mig_node_by_workspace, terraform.workspace, "")
|
|
45
|
+
)
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
variable "b200_mig_node_name" {
|
|
49
|
+
description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to use the per-workspace default in mig-config.tf."
|
|
50
|
+
type = string
|
|
51
|
+
default = ""
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
resource "kubernetes_labels" "b200_mig_node" {
|
|
55
|
+
count = local.b200_mig_node_effective == "" ? 0 : 1
|
|
56
|
+
|
|
57
|
+
api_version = "v1"
|
|
58
|
+
kind = "Node"
|
|
59
|
+
|
|
60
|
+
metadata {
|
|
61
|
+
name = local.b200_mig_node_effective
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
labels = {
|
|
65
|
+
"nvidia.com/mig.config" = "b200-6full-2mig-balanced"
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
# Take ownership of the label even if another tool (kubectl, gpu-operator) set it.
|
|
69
|
+
force = true
|
|
70
|
+
|
|
71
|
+
depends_on = [kubernetes_config_map.gpu_dev_mig_parted_config]
|
|
72
|
+
}
|
|
@@ -0,0 +1,528 @@
|
|
|
1
|
+
version: v1
|
|
2
|
+
mig-configs:
|
|
3
|
+
all-disabled:
|
|
4
|
+
- devices: all
|
|
5
|
+
mig-enabled: false
|
|
6
|
+
|
|
7
|
+
all-enabled:
|
|
8
|
+
- devices: all
|
|
9
|
+
mig-enabled: true
|
|
10
|
+
mig-devices: {}
|
|
11
|
+
|
|
12
|
+
# A100-40GB, A800-40GB
|
|
13
|
+
all-1g.5gb:
|
|
14
|
+
- devices: all
|
|
15
|
+
mig-enabled: true
|
|
16
|
+
mig-devices:
|
|
17
|
+
"1g.5gb": 7
|
|
18
|
+
|
|
19
|
+
all-1g.5gb.me:
|
|
20
|
+
- devices: all
|
|
21
|
+
mig-enabled: true
|
|
22
|
+
mig-devices:
|
|
23
|
+
"1g.5gb+me": 1
|
|
24
|
+
|
|
25
|
+
all-2g.10gb:
|
|
26
|
+
- devices: all
|
|
27
|
+
mig-enabled: true
|
|
28
|
+
mig-devices:
|
|
29
|
+
"2g.10gb": 3
|
|
30
|
+
|
|
31
|
+
all-3g.20gb:
|
|
32
|
+
- devices: all
|
|
33
|
+
mig-enabled: true
|
|
34
|
+
mig-devices:
|
|
35
|
+
"3g.20gb": 2
|
|
36
|
+
|
|
37
|
+
all-4g.20gb:
|
|
38
|
+
- devices: all
|
|
39
|
+
mig-enabled: true
|
|
40
|
+
mig-devices:
|
|
41
|
+
"4g.20gb": 1
|
|
42
|
+
|
|
43
|
+
all-7g.40gb:
|
|
44
|
+
- devices: all
|
|
45
|
+
mig-enabled: true
|
|
46
|
+
mig-devices:
|
|
47
|
+
"7g.40gb": 1
|
|
48
|
+
|
|
49
|
+
# RTX-PRO-6000-96GB
|
|
50
|
+
all-1g.24gb.gfx:
|
|
51
|
+
- devices: all
|
|
52
|
+
mig-enabled: true
|
|
53
|
+
mig-devices:
|
|
54
|
+
"1g.24gb+gfx": 4
|
|
55
|
+
|
|
56
|
+
all-1g.24gb.me.all:
|
|
57
|
+
- devices: all
|
|
58
|
+
mig-enabled: true
|
|
59
|
+
mig-devices:
|
|
60
|
+
"1g.24gb+me.all": 1
|
|
61
|
+
|
|
62
|
+
all-1g.24gb-me:
|
|
63
|
+
- devices: all
|
|
64
|
+
mig-enabled: true
|
|
65
|
+
mig-devices:
|
|
66
|
+
"1g.24gb-me": 4
|
|
67
|
+
|
|
68
|
+
all-2g.48gb:
|
|
69
|
+
- devices: all
|
|
70
|
+
mig-enabled: true
|
|
71
|
+
mig-devices:
|
|
72
|
+
"2g.48gb": 2
|
|
73
|
+
|
|
74
|
+
all-2g.48gb.gfx:
|
|
75
|
+
- devices: all
|
|
76
|
+
mig-enabled: true
|
|
77
|
+
mig-devices:
|
|
78
|
+
"2g.48gb+gfx": 2
|
|
79
|
+
|
|
80
|
+
all-2g.48gb.me.all:
|
|
81
|
+
- devices: all
|
|
82
|
+
mig-enabled: true
|
|
83
|
+
mig-devices:
|
|
84
|
+
"2g.48gb+me.all": 1
|
|
85
|
+
|
|
86
|
+
all-2g.48gb-me:
|
|
87
|
+
- devices: all
|
|
88
|
+
mig-enabled: true
|
|
89
|
+
mig-devices:
|
|
90
|
+
"2g.48gb-me": 2
|
|
91
|
+
|
|
92
|
+
all-4g.96gb:
|
|
93
|
+
- devices: all
|
|
94
|
+
mig-enabled: true
|
|
95
|
+
mig-devices:
|
|
96
|
+
"4g.96gb": 1
|
|
97
|
+
|
|
98
|
+
all-4g.96gb.gfx:
|
|
99
|
+
- devices: all
|
|
100
|
+
mig-enabled: true
|
|
101
|
+
mig-devices:
|
|
102
|
+
"4g.96gb+gfx": 1
|
|
103
|
+
|
|
104
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB, A100-40GB, A800-40GB
|
|
105
|
+
all-1g.10gb:
|
|
106
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB
|
|
107
|
+
- device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"]
|
|
108
|
+
devices: all
|
|
109
|
+
mig-enabled: true
|
|
110
|
+
mig-devices:
|
|
111
|
+
"1g.10gb": 7
|
|
112
|
+
|
|
113
|
+
# A100-40GB, A800-40GB
|
|
114
|
+
- device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"]
|
|
115
|
+
devices: all
|
|
116
|
+
mig-enabled: true
|
|
117
|
+
mig-devices:
|
|
118
|
+
"1g.10gb": 4
|
|
119
|
+
|
|
120
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB
|
|
121
|
+
all-1g.10gb.me:
|
|
122
|
+
- devices: all
|
|
123
|
+
mig-enabled: true
|
|
124
|
+
mig-devices:
|
|
125
|
+
"1g.10gb+me": 1
|
|
126
|
+
|
|
127
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB
|
|
128
|
+
all-1g.20gb:
|
|
129
|
+
- devices: all
|
|
130
|
+
mig-enabled: true
|
|
131
|
+
mig-devices:
|
|
132
|
+
"1g.20gb": 4
|
|
133
|
+
|
|
134
|
+
# GB200, B200
|
|
135
|
+
all-1g.23gb:
|
|
136
|
+
- devices: all
|
|
137
|
+
mig-enabled: true
|
|
138
|
+
mig-devices:
|
|
139
|
+
"1g.23gb": 7
|
|
140
|
+
|
|
141
|
+
# GB200, B200
|
|
142
|
+
all-1g.23gb.me:
|
|
143
|
+
- devices: all
|
|
144
|
+
mig-enabled: true
|
|
145
|
+
mig-devices:
|
|
146
|
+
"1g.23gb+me": 1
|
|
147
|
+
|
|
148
|
+
all-1g.24gb.me:
|
|
149
|
+
- devices: all
|
|
150
|
+
mig-enabled: true
|
|
151
|
+
mig-devices:
|
|
152
|
+
"1g.24gb+me": 1
|
|
153
|
+
|
|
154
|
+
all-2g.20gb:
|
|
155
|
+
- devices: all
|
|
156
|
+
mig-enabled: true
|
|
157
|
+
mig-devices:
|
|
158
|
+
"2g.20gb": 3
|
|
159
|
+
|
|
160
|
+
all-3g.40gb:
|
|
161
|
+
- devices: all
|
|
162
|
+
mig-enabled: true
|
|
163
|
+
mig-devices:
|
|
164
|
+
"3g.40gb": 2
|
|
165
|
+
|
|
166
|
+
all-4g.40gb:
|
|
167
|
+
- devices: all
|
|
168
|
+
mig-enabled: true
|
|
169
|
+
mig-devices:
|
|
170
|
+
"4g.40gb": 1
|
|
171
|
+
|
|
172
|
+
all-7g.80gb:
|
|
173
|
+
- devices: all
|
|
174
|
+
mig-enabled: true
|
|
175
|
+
mig-devices:
|
|
176
|
+
"7g.80gb": 1
|
|
177
|
+
|
|
178
|
+
# A30-24GB
|
|
179
|
+
all-1g.6gb:
|
|
180
|
+
- devices: all
|
|
181
|
+
mig-enabled: true
|
|
182
|
+
mig-devices:
|
|
183
|
+
"1g.6gb": 4
|
|
184
|
+
|
|
185
|
+
all-1g.6gb.me:
|
|
186
|
+
- devices: all
|
|
187
|
+
mig-enabled: true
|
|
188
|
+
mig-devices:
|
|
189
|
+
"1g.6gb+me": 1
|
|
190
|
+
|
|
191
|
+
all-2g.12gb:
|
|
192
|
+
- devices: all
|
|
193
|
+
mig-enabled: true
|
|
194
|
+
mig-devices:
|
|
195
|
+
"2g.12gb": 2
|
|
196
|
+
|
|
197
|
+
all-2g.12gb.me:
|
|
198
|
+
- devices: all
|
|
199
|
+
mig-enabled: true
|
|
200
|
+
mig-devices:
|
|
201
|
+
"2g.12gb+me": 1
|
|
202
|
+
|
|
203
|
+
all-4g.24gb:
|
|
204
|
+
- devices: all
|
|
205
|
+
mig-enabled: true
|
|
206
|
+
mig-devices:
|
|
207
|
+
"4g.24gb": 1
|
|
208
|
+
|
|
209
|
+
# H100 NVL, H800 NVL, GH200
|
|
210
|
+
all-1g.12gb:
|
|
211
|
+
- devices: all
|
|
212
|
+
mig-enabled: true
|
|
213
|
+
mig-devices:
|
|
214
|
+
"1g.12gb": 7
|
|
215
|
+
|
|
216
|
+
all-1g.12gb.me:
|
|
217
|
+
- devices: all
|
|
218
|
+
mig-enabled: true
|
|
219
|
+
mig-devices:
|
|
220
|
+
"1g.12gb+me": 1
|
|
221
|
+
|
|
222
|
+
all-1g.24gb:
|
|
223
|
+
- devices: all
|
|
224
|
+
mig-enabled: true
|
|
225
|
+
mig-devices:
|
|
226
|
+
"1g.24gb": 4
|
|
227
|
+
|
|
228
|
+
all-1g.45gb:
|
|
229
|
+
- devices: all
|
|
230
|
+
mig-enabled: true
|
|
231
|
+
mig-devices:
|
|
232
|
+
"1g.45gb": 4
|
|
233
|
+
|
|
234
|
+
all-1g.47gb:
|
|
235
|
+
- devices: all
|
|
236
|
+
mig-enabled: true
|
|
237
|
+
mig-devices:
|
|
238
|
+
"1g.47gb": 4
|
|
239
|
+
|
|
240
|
+
all-2g.24gb:
|
|
241
|
+
- devices: all
|
|
242
|
+
mig-enabled: true
|
|
243
|
+
mig-devices:
|
|
244
|
+
"2g.24gb": 3
|
|
245
|
+
|
|
246
|
+
all-2g.45gb:
|
|
247
|
+
- devices: all
|
|
248
|
+
mig-enabled: true
|
|
249
|
+
mig-devices:
|
|
250
|
+
"2g.45gb": 3
|
|
251
|
+
|
|
252
|
+
all-2g.47gb:
|
|
253
|
+
- devices: all
|
|
254
|
+
mig-enabled: true
|
|
255
|
+
mig-devices:
|
|
256
|
+
"2g.47gb": 3
|
|
257
|
+
|
|
258
|
+
# H100 NVL, H800 NVL
|
|
259
|
+
all-3g.47gb:
|
|
260
|
+
- devices: all
|
|
261
|
+
mig-enabled: true
|
|
262
|
+
mig-devices:
|
|
263
|
+
"3g.47gb": 2
|
|
264
|
+
|
|
265
|
+
all-4g.47gb:
|
|
266
|
+
- devices: all
|
|
267
|
+
mig-enabled: true
|
|
268
|
+
mig-devices:
|
|
269
|
+
"4g.47gb": 1
|
|
270
|
+
|
|
271
|
+
all-7g.94gb:
|
|
272
|
+
- devices: all
|
|
273
|
+
mig-enabled: true
|
|
274
|
+
mig-devices:
|
|
275
|
+
"7g.94gb": 1
|
|
276
|
+
|
|
277
|
+
# H100-96GB, PG506-96GB, GH200
|
|
278
|
+
all-3g.48gb:
|
|
279
|
+
- devices: all
|
|
280
|
+
mig-enabled: true
|
|
281
|
+
mig-devices:
|
|
282
|
+
"3g.48gb": 2
|
|
283
|
+
|
|
284
|
+
all-3g.90gb:
|
|
285
|
+
- devices: all
|
|
286
|
+
mig-enabled: true
|
|
287
|
+
mig-devices:
|
|
288
|
+
"3g.90gb": 2
|
|
289
|
+
|
|
290
|
+
all-3g.93gb:
|
|
291
|
+
- devices: all
|
|
292
|
+
mig-enabled: true
|
|
293
|
+
mig-devices:
|
|
294
|
+
"3g.93gb": 2
|
|
295
|
+
|
|
296
|
+
all-3g.95gb:
|
|
297
|
+
- devices: all
|
|
298
|
+
mig-enabled: true
|
|
299
|
+
mig-devices:
|
|
300
|
+
"3g.95gb": 2
|
|
301
|
+
|
|
302
|
+
all-4g.48gb:
|
|
303
|
+
- devices: all
|
|
304
|
+
mig-enabled: true
|
|
305
|
+
mig-devices:
|
|
306
|
+
"4g.48gb": 1
|
|
307
|
+
|
|
308
|
+
all-4g.90gb:
|
|
309
|
+
- devices: all
|
|
310
|
+
mig-enabled: true
|
|
311
|
+
mig-devices:
|
|
312
|
+
"4g.90gb": 1
|
|
313
|
+
|
|
314
|
+
all-4g.93gb:
|
|
315
|
+
- devices: all
|
|
316
|
+
mig-enabled: true
|
|
317
|
+
mig-devices:
|
|
318
|
+
"4g.93gb": 1
|
|
319
|
+
|
|
320
|
+
all-4g.95gb:
|
|
321
|
+
- devices: all
|
|
322
|
+
mig-enabled: true
|
|
323
|
+
mig-devices:
|
|
324
|
+
"4g.95gb": 1
|
|
325
|
+
|
|
326
|
+
all-7g.96gb:
|
|
327
|
+
- devices: all
|
|
328
|
+
mig-enabled: true
|
|
329
|
+
mig-devices:
|
|
330
|
+
"7g.96gb": 1
|
|
331
|
+
|
|
332
|
+
all-7g.180gb:
|
|
333
|
+
- devices: all
|
|
334
|
+
mig-enabled: true
|
|
335
|
+
mig-devices:
|
|
336
|
+
"7g.180gb": 1
|
|
337
|
+
|
|
338
|
+
all-7g.186gb:
|
|
339
|
+
- devices: all
|
|
340
|
+
mig-enabled: true
|
|
341
|
+
mig-devices:
|
|
342
|
+
"7g.186gb": 1
|
|
343
|
+
|
|
344
|
+
all-7g.189gb:
|
|
345
|
+
- devices: all
|
|
346
|
+
mig-enabled: true
|
|
347
|
+
mig-devices:
|
|
348
|
+
"7g.189gb": 1
|
|
349
|
+
|
|
350
|
+
# GB200 HGX, B200, GH200 144G HBM3e, H200-141GB, H200 NVL, H100-96GB, GH200, H100 NVL, H800 NVL, H100-80GB, H800-80GB, A800-40GB, A800-80GB, A100-40GB, A100-80GB, A30-24GB, PG506-96GB
|
|
351
|
+
all-balanced:
|
|
352
|
+
# GB200 HGX
|
|
353
|
+
- device-filter: ["0x294110DE"]
|
|
354
|
+
devices: all
|
|
355
|
+
mig-enabled: true
|
|
356
|
+
mig-devices:
|
|
357
|
+
"1g.23gb": 2
|
|
358
|
+
"2g.47gb": 1
|
|
359
|
+
"3g.93gb": 1
|
|
360
|
+
|
|
361
|
+
# RTX-PRO-6000-96GB
|
|
362
|
+
- device-filter: ["0x2BB510DE"]
|
|
363
|
+
devices: all
|
|
364
|
+
mig-enabled: true
|
|
365
|
+
mig-devices:
|
|
366
|
+
"1g.24gb": 2
|
|
367
|
+
"2g.48gb": 1
|
|
368
|
+
|
|
369
|
+
# B200
|
|
370
|
+
- device-filter: ["0x290110DE"]
|
|
371
|
+
devices: all
|
|
372
|
+
mig-enabled: true
|
|
373
|
+
mig-devices:
|
|
374
|
+
"1g.23gb": 2
|
|
375
|
+
"2g.45gb": 1
|
|
376
|
+
"3g.90gb": 1
|
|
377
|
+
|
|
378
|
+
# GH200 144G HBM3e
|
|
379
|
+
- device-filter: ["0x234810DE"]
|
|
380
|
+
devices: all
|
|
381
|
+
mig-enabled: true
|
|
382
|
+
mig-devices:
|
|
383
|
+
"1g.18gb": 2
|
|
384
|
+
"2g.36gb": 1
|
|
385
|
+
"3g.72gb": 1
|
|
386
|
+
|
|
387
|
+
# H200 141GB, H200 NVL
|
|
388
|
+
- device-filter: ["0x233510DE", "0x233B10DE"]
|
|
389
|
+
devices: all
|
|
390
|
+
mig-enabled: true
|
|
391
|
+
mig-devices:
|
|
392
|
+
"1g.18gb": 2
|
|
393
|
+
"2g.35gb": 1
|
|
394
|
+
"3g.71gb": 1
|
|
395
|
+
|
|
396
|
+
# H100 NVL, H800 NVL
|
|
397
|
+
- device-filter: ["0x232110DE", "0x233A10DE"]
|
|
398
|
+
devices: all
|
|
399
|
+
mig-enabled: true
|
|
400
|
+
mig-devices:
|
|
401
|
+
"1g.12gb": 2
|
|
402
|
+
"2g.24gb": 1
|
|
403
|
+
"3g.47gb": 1
|
|
404
|
+
|
|
405
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB
|
|
406
|
+
- device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"]
|
|
407
|
+
devices: all
|
|
408
|
+
mig-enabled: true
|
|
409
|
+
mig-devices:
|
|
410
|
+
"1g.10gb": 2
|
|
411
|
+
"2g.20gb": 1
|
|
412
|
+
"3g.40gb": 1
|
|
413
|
+
|
|
414
|
+
# A100-40GB, A800-40GB
|
|
415
|
+
- device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"]
|
|
416
|
+
devices: all
|
|
417
|
+
mig-enabled: true
|
|
418
|
+
mig-devices:
|
|
419
|
+
"1g.5gb": 2
|
|
420
|
+
"2g.10gb": 1
|
|
421
|
+
"3g.20gb": 1
|
|
422
|
+
|
|
423
|
+
# A30-24GB
|
|
424
|
+
- device-filter: "0x20B710DE"
|
|
425
|
+
devices: all
|
|
426
|
+
mig-enabled: true
|
|
427
|
+
mig-devices:
|
|
428
|
+
"1g.6gb": 2
|
|
429
|
+
"2g.12gb": 1
|
|
430
|
+
|
|
431
|
+
# H100-96GB, PG506-96GB, GH200, H20
|
|
432
|
+
- device-filter: ["0x234210DE", "0x233D10DE", "0x20B610DE", "0x232910DE"]
|
|
433
|
+
devices: all
|
|
434
|
+
mig-enabled: true
|
|
435
|
+
mig-devices:
|
|
436
|
+
"1g.12gb": 2
|
|
437
|
+
"2g.24gb": 1
|
|
438
|
+
"3g.48gb": 1
|
|
439
|
+
|
|
440
|
+
# H200-141GB, GH200 144G HBM3e
|
|
441
|
+
all-1g.18gb:
|
|
442
|
+
- devices: all
|
|
443
|
+
mig-enabled: true
|
|
444
|
+
mig-devices:
|
|
445
|
+
"1g.18gb": 7
|
|
446
|
+
|
|
447
|
+
all-1g.18gb.me:
|
|
448
|
+
- devices: all
|
|
449
|
+
mig-enabled: true
|
|
450
|
+
mig-devices:
|
|
451
|
+
"1g.18gb+me": 1
|
|
452
|
+
|
|
453
|
+
# H200-141GB
|
|
454
|
+
all-1g.35gb:
|
|
455
|
+
- devices: all
|
|
456
|
+
mig-enabled: true
|
|
457
|
+
mig-devices:
|
|
458
|
+
"1g.35gb": 4
|
|
459
|
+
|
|
460
|
+
all-2g.35gb:
|
|
461
|
+
- devices: all
|
|
462
|
+
mig-enabled: true
|
|
463
|
+
mig-devices:
|
|
464
|
+
"2g.35gb": 3
|
|
465
|
+
|
|
466
|
+
all-3g.71gb:
|
|
467
|
+
- devices: all
|
|
468
|
+
mig-enabled: true
|
|
469
|
+
mig-devices:
|
|
470
|
+
"3g.71gb": 2
|
|
471
|
+
|
|
472
|
+
all-4g.71gb:
|
|
473
|
+
- devices: all
|
|
474
|
+
mig-enabled: true
|
|
475
|
+
mig-devices:
|
|
476
|
+
"4g.71gb": 1
|
|
477
|
+
|
|
478
|
+
all-7g.141gb:
|
|
479
|
+
- devices: all
|
|
480
|
+
mig-enabled: true
|
|
481
|
+
mig-devices:
|
|
482
|
+
"7g.141gb": 1
|
|
483
|
+
|
|
484
|
+
# GH200 144G HBM3e
|
|
485
|
+
all-1g.36gb:
|
|
486
|
+
- devices: all
|
|
487
|
+
mig-enabled: true
|
|
488
|
+
mig-devices:
|
|
489
|
+
"1g.36gb": 4
|
|
490
|
+
|
|
491
|
+
all-2g.36gb:
|
|
492
|
+
- devices: all
|
|
493
|
+
mig-enabled: true
|
|
494
|
+
mig-devices:
|
|
495
|
+
"2g.36gb": 3
|
|
496
|
+
|
|
497
|
+
all-3g.72gb:
|
|
498
|
+
- devices: all
|
|
499
|
+
mig-enabled: true
|
|
500
|
+
mig-devices:
|
|
501
|
+
"3g.72gb": 2
|
|
502
|
+
|
|
503
|
+
all-4g.72gb:
|
|
504
|
+
- devices: all
|
|
505
|
+
mig-enabled: true
|
|
506
|
+
mig-devices:
|
|
507
|
+
"4g.72gb": 1
|
|
508
|
+
|
|
509
|
+
all-7g.144gb:
|
|
510
|
+
- devices: all
|
|
511
|
+
mig-enabled: true
|
|
512
|
+
mig-devices:
|
|
513
|
+
"7g.144gb": 1
|
|
514
|
+
|
|
515
|
+
# Custom: B200 mixed split — GPUs 0-5 stay full (reservable as --gpu-type b200),
|
|
516
|
+
# GPUs 6-7 partitioned per-GPU into 2x1g.23gb + 1x2g.45gb + 1x3g.90gb.
|
|
517
|
+
# Per node: 6 full + 4 small + 2 medium + 2 large slices.
|
|
518
|
+
b200-6full-2mig-balanced:
|
|
519
|
+
- device-filter: ["0x290110DE"]
|
|
520
|
+
devices: [0, 1, 2, 3, 4, 5]
|
|
521
|
+
mig-enabled: false
|
|
522
|
+
- device-filter: ["0x290110DE"]
|
|
523
|
+
devices: [6, 7]
|
|
524
|
+
mig-enabled: true
|
|
525
|
+
mig-devices:
|
|
526
|
+
"1g.23gb": 2
|
|
527
|
+
"2g.45gb": 1
|
|
528
|
+
"3g.90gb": 1
|
|
@@ -1,75 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
# Post-deploy setup for B200 MIG split (6 full + 2 partitioned per node).
|
|
3
|
-
# Run ONCE after PR #77 is merged, Terraform has been applied, and the new docker/lambda is live.
|
|
4
|
-
|
|
5
|
-
set -e
|
|
6
|
-
|
|
7
|
-
NS=gpu-operator
|
|
8
|
-
CM=default-mig-parted-config
|
|
9
|
-
PROFILE_NAME=b200-6full-2mig-balanced
|
|
10
|
-
|
|
11
|
-
echo "=== Checking current MIG profile in ConfigMap ==="
|
|
12
|
-
if kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' | grep -q "$PROFILE_NAME:"; then
|
|
13
|
-
echo "Profile $PROFILE_NAME already present — skipping ConfigMap edit"
|
|
14
|
-
else
|
|
15
|
-
echo "Profile $PROFILE_NAME missing. Patching ConfigMap..."
|
|
16
|
-
|
|
17
|
-
# Save current ConfigMap content
|
|
18
|
-
kubectl -n "$NS" get configmap "$CM" -o yaml > /tmp/mig-config-backup.yaml
|
|
19
|
-
echo "Backup saved to /tmp/mig-config-backup.yaml"
|
|
20
|
-
|
|
21
|
-
# Append our profile under mig-configs:
|
|
22
|
-
# NOTE: this is a plain heredoc append (cat >>), not a structured edit. ClusterPolicy's controller MAY revert this if it
|
|
23
|
-
# reconciles. If you see the profile disappear, re-run this script. If it keeps reverting,
|
|
24
|
-
# we'll need to fork the ConfigMap (next iteration).
|
|
25
|
-
kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' > /tmp/mig-config.yaml
|
|
26
|
-
|
|
27
|
-
cat >> /tmp/mig-config.yaml <<'EOF'
|
|
28
|
-
|
|
29
|
-
# Mixed B200 split: GPUs 0-5 stay full (reservable as --gpu-type b200), GPUs 6-7 partitioned.
|
|
30
|
-
# Per partitioned GPU: 2x 1g.23gb + 1x 2g.45gb + 1x 3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large.
|
|
31
|
-
b200-6full-2mig-balanced:
|
|
32
|
-
- device-filter: ["0x290110DE"]
|
|
33
|
-
devices: [0, 1, 2, 3, 4, 5]
|
|
34
|
-
mig-enabled: false
|
|
35
|
-
- device-filter: ["0x290110DE"]
|
|
36
|
-
devices: [6, 7]
|
|
37
|
-
mig-enabled: true
|
|
38
|
-
mig-devices:
|
|
39
|
-
"1g.23gb": 2
|
|
40
|
-
"2g.45gb": 1
|
|
41
|
-
"3g.90gb": 1
|
|
42
|
-
EOF
|
|
43
|
-
|
|
44
|
-
# Re-encode and patch
|
|
45
|
-
kubectl -n "$NS" create configmap "$CM" --from-file=config.yaml=/tmp/mig-config.yaml --dry-run=client -o yaml \
|
|
46
|
-
| kubectl -n "$NS" patch configmap "$CM" --patch-file=/dev/stdin
|
|
47
|
-
echo "ConfigMap patched."
|
|
48
|
-
fi
|
|
49
|
-
|
|
50
|
-
echo
|
|
51
|
-
echo "=== Picking a B200 node to label ==="
|
|
52
|
-
NODE=$(kubectl get nodes -l GpuType=b200 -o jsonpath='{.items[0].metadata.name}')
|
|
53
|
-
if [ -z "$NODE" ]; then
|
|
54
|
-
echo "No B200 nodes found. Exiting."
|
|
55
|
-
exit 1
|
|
56
|
-
fi
|
|
57
|
-
echo "Will label: $NODE"
|
|
58
|
-
read -p "Proceed? (y/N): " CONFIRM
|
|
59
|
-
if [ "$CONFIRM" != "y" ]; then
|
|
60
|
-
echo "Aborted."
|
|
61
|
-
exit 0
|
|
62
|
-
fi
|
|
63
|
-
|
|
64
|
-
kubectl label node "$NODE" "nvidia.com/mig.config=$PROFILE_NAME" --overwrite
|
|
65
|
-
echo "Node labelled. nvidia-mig-manager will partition GPUs 6-7 (drains existing pods if any)."
|
|
66
|
-
echo
|
|
67
|
-
echo "Watch progress with:"
|
|
68
|
-
echo " kubectl logs -n gpu-operator -l app=nvidia-mig-manager -f"
|
|
69
|
-
echo " kubectl get node $NODE -o jsonpath='{.status.allocatable}' | jq ."
|
|
70
|
-
echo
|
|
71
|
-
echo "After ~2-5 min, allocatable should show:"
|
|
72
|
-
echo " nvidia.com/gpu: 6"
|
|
73
|
-
echo " nvidia.com/mig-1g.23gb: 4"
|
|
74
|
-
echo " nvidia.com/mig-2g.45gb: 2"
|
|
75
|
-
echo " nvidia.com/mig-3g.90gb: 2"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|