gpu-dev 0.5.12__tar.gz → 0.5.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/PKG-INFO +1 -1
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +2 -1
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/pyproject.toml +1 -1
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/kubernetes.tf +8 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda.tf +1 -1
- gpu_dev-0.5.13/terraform-gpu-devservers/mig-config.tf +55 -0
- gpu_dev-0.5.13/terraform-gpu-devservers/mig-parted-config.yaml +528 -0
- gpu_dev-0.5.12/terraform-gpu-devservers/scripts/b200-mig-setup.sh +0 -75
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/.github/workflows/no-gitlinks.yml +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/.github/workflows/publish.yml +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/.gitignore +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/CLAUDE.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/PROGRESS.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/PR_DESCRIPTION.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/TODO.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/admin/README.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/admin/generate_stats.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/admin/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/README.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/docs/USER_GUIDE.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/docs/devgpu-features.html +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/docs/docker-mark-blue.svg +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/docs/icons8-cursor-ai.svg +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/post.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/setup.cfg +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/README.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/alb.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/availability.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/backend.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/.dockerignore +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/Dockerfile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/bash_profile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/bashrc +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/motd_script +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/profile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/shell_env +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/ssh_config +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/zprofile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/zshrc +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker-build.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/ecr.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/efs.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/eks.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/expiry.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/git-cache.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/main.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/monitoring.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/outputs.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/pyproject.toml +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/queue.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/route53.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/switch-to.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/user-data.sh +0 -0
- {gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/variables.tf +0 -0
|
@@ -47,6 +47,8 @@ terraform-gpu-devservers/git-cache.tf
|
|
|
47
47
|
terraform-gpu-devservers/kubernetes.tf
|
|
48
48
|
terraform-gpu-devservers/lambda.tf
|
|
49
49
|
terraform-gpu-devservers/main.tf
|
|
50
|
+
terraform-gpu-devservers/mig-config.tf
|
|
51
|
+
terraform-gpu-devservers/mig-parted-config.yaml
|
|
50
52
|
terraform-gpu-devservers/monitoring.tf
|
|
51
53
|
terraform-gpu-devservers/outputs.tf
|
|
52
54
|
terraform-gpu-devservers/pyproject.toml
|
|
@@ -101,7 +103,6 @@ terraform-gpu-devservers/migrations/check_snapshots.py
|
|
|
101
103
|
terraform-gpu-devservers/migrations/migrate_disks_to_named.py
|
|
102
104
|
terraform-gpu-devservers/migrations/run_backfill.sh
|
|
103
105
|
terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md
|
|
104
|
-
terraform-gpu-devservers/scripts/b200-mig-setup.sh
|
|
105
106
|
terraform-gpu-devservers/scripts/detect_empty_volumes.sh
|
|
106
107
|
terraform-gpu-devservers/scripts/ec2_avail_probe.sh
|
|
107
108
|
terraform-gpu-devservers/scripts/inspect_user_data.sh
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "gpu-dev"
|
|
7
|
-
version = "0.5.12"
|
|
7
|
+
version = "0.5.13"
|
|
8
8
|
description = "CLI tool for PyTorch GPU developer server reservations"
|
|
9
9
|
authors = [{name = "PyTorch Team"}]
|
|
10
10
|
readme = "cli-tools/gpu-dev-cli/README.md"
|
|
@@ -305,6 +305,14 @@ resource "helm_release" "nvidia_gpu_operator" {
|
|
|
305
305
|
value = "all-disabled"
|
|
306
306
|
}
|
|
307
307
|
|
|
308
|
+
# Read profiles from our forked ConfigMap (managed in mig-config.tf) instead of the
|
|
309
|
+
# operator's auto-created default-mig-parted-config. Lets us add custom mixed profiles
|
|
310
|
+
# like b200-6full-2mig-balanced without ClusterPolicy reconciliation reverting our edits.
|
|
311
|
+
set {
|
|
312
|
+
name = "migManager.config.name"
|
|
313
|
+
value = "gpu-dev-mig-parted-config"
|
|
314
|
+
}
|
|
315
|
+
|
|
308
316
|
set {
|
|
309
317
|
name = "nodeStatusExporter.enabled"
|
|
310
318
|
value = "true"
|
|
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
|
|
|
180
180
|
HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
|
|
181
181
|
SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
|
|
182
182
|
SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
|
|
183
|
-
LAMBDA_VERSION = "0.5.12"
|
|
183
|
+
LAMBDA_VERSION = "0.5.13"
|
|
184
184
|
MIN_CLI_VERSION = "0.5.9"
|
|
185
185
|
DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
|
|
186
186
|
OPERATIONS_TABLE = aws_dynamodb_table.operations.name
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# mig-config.tf — fork the NVIDIA mig-parted-config ConfigMap so we can add custom profiles
|
|
2
|
+
# without fighting NVIDIA ClusterPolicy's reconciliation of its default-mig-parted-config.
|
|
3
|
+
#
|
|
4
|
+
# The vendored mig-parted-config.yaml in this directory mirrors the upstream profiles plus our
|
|
5
|
+
# additions (e.g. b200-6full-2mig-balanced). Helm is told to use this ConfigMap by name via
|
|
6
|
+
# migManager.config.name in kubernetes.tf, so the GPU operator skips creating its default and
|
|
7
|
+
# reads ours instead.
|
|
8
|
+
|
|
9
|
+
resource "kubernetes_config_map" "gpu_dev_mig_parted_config" {
|
|
10
|
+
metadata {
|
|
11
|
+
name = "gpu-dev-mig-parted-config"
|
|
12
|
+
namespace = "gpu-operator"
|
|
13
|
+
labels = {
|
|
14
|
+
"app.kubernetes.io/managed-by" = "terraform"
|
|
15
|
+
"app.kubernetes.io/part-of" = "gpu-dev-servers"
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
data = {
|
|
20
|
+
"config.yaml" = file("${path.module}/mig-parted-config.yaml")
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
# The gpu-operator namespace is created by the helm release; depend on that so this ConfigMap
|
|
24
|
+
# lands AFTER the namespace exists.
|
|
25
|
+
depends_on = [helm_release.nvidia_gpu_operator]
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
# Optional declarative B200 MIG node label. Set b200_mig_node_name in tfvars (or override the
|
|
29
|
+
# variable's default below) to dedicate a specific B200 node to the mixed profile. Empty string
|
|
30
|
+
# means "no node currently labelled" — the existing all-disabled stays in effect.
|
|
31
|
+
variable "b200_mig_node_name" {
|
|
32
|
+
description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to skip."
|
|
33
|
+
type = string
|
|
34
|
+
default = ""
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
resource "kubernetes_labels" "b200_mig_node" {
|
|
38
|
+
count = var.b200_mig_node_name == "" ? 0 : 1
|
|
39
|
+
|
|
40
|
+
api_version = "v1"
|
|
41
|
+
kind = "Node"
|
|
42
|
+
|
|
43
|
+
metadata {
|
|
44
|
+
name = var.b200_mig_node_name
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
labels = {
|
|
48
|
+
"nvidia.com/mig.config" = "b200-6full-2mig-balanced"
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
# Take ownership of the label even if another tool (kubectl, gpu-operator) set it.
|
|
52
|
+
force = true
|
|
53
|
+
|
|
54
|
+
depends_on = [kubernetes_config_map.gpu_dev_mig_parted_config]
|
|
55
|
+
}
|
|
@@ -0,0 +1,528 @@
|
|
|
1
|
+
version: v1
|
|
2
|
+
mig-configs:
|
|
3
|
+
all-disabled:
|
|
4
|
+
- devices: all
|
|
5
|
+
mig-enabled: false
|
|
6
|
+
|
|
7
|
+
all-enabled:
|
|
8
|
+
- devices: all
|
|
9
|
+
mig-enabled: true
|
|
10
|
+
mig-devices: {}
|
|
11
|
+
|
|
12
|
+
# A100-40GB, A800-40GB
|
|
13
|
+
all-1g.5gb:
|
|
14
|
+
- devices: all
|
|
15
|
+
mig-enabled: true
|
|
16
|
+
mig-devices:
|
|
17
|
+
"1g.5gb": 7
|
|
18
|
+
|
|
19
|
+
all-1g.5gb.me:
|
|
20
|
+
- devices: all
|
|
21
|
+
mig-enabled: true
|
|
22
|
+
mig-devices:
|
|
23
|
+
"1g.5gb+me": 1
|
|
24
|
+
|
|
25
|
+
all-2g.10gb:
|
|
26
|
+
- devices: all
|
|
27
|
+
mig-enabled: true
|
|
28
|
+
mig-devices:
|
|
29
|
+
"2g.10gb": 3
|
|
30
|
+
|
|
31
|
+
all-3g.20gb:
|
|
32
|
+
- devices: all
|
|
33
|
+
mig-enabled: true
|
|
34
|
+
mig-devices:
|
|
35
|
+
"3g.20gb": 2
|
|
36
|
+
|
|
37
|
+
all-4g.20gb:
|
|
38
|
+
- devices: all
|
|
39
|
+
mig-enabled: true
|
|
40
|
+
mig-devices:
|
|
41
|
+
"4g.20gb": 1
|
|
42
|
+
|
|
43
|
+
all-7g.40gb:
|
|
44
|
+
- devices: all
|
|
45
|
+
mig-enabled: true
|
|
46
|
+
mig-devices:
|
|
47
|
+
"7g.40gb": 1
|
|
48
|
+
|
|
49
|
+
# RTX-PRO-6000-96GB
|
|
50
|
+
all-1g.24gb.gfx:
|
|
51
|
+
- devices: all
|
|
52
|
+
mig-enabled: true
|
|
53
|
+
mig-devices:
|
|
54
|
+
"1g.24gb+gfx": 4
|
|
55
|
+
|
|
56
|
+
all-1g.24gb.me.all:
|
|
57
|
+
- devices: all
|
|
58
|
+
mig-enabled: true
|
|
59
|
+
mig-devices:
|
|
60
|
+
"1g.24gb+me.all": 1
|
|
61
|
+
|
|
62
|
+
all-1g.24gb-me:
|
|
63
|
+
- devices: all
|
|
64
|
+
mig-enabled: true
|
|
65
|
+
mig-devices:
|
|
66
|
+
"1g.24gb-me": 4
|
|
67
|
+
|
|
68
|
+
all-2g.48gb:
|
|
69
|
+
- devices: all
|
|
70
|
+
mig-enabled: true
|
|
71
|
+
mig-devices:
|
|
72
|
+
"2g.48gb": 2
|
|
73
|
+
|
|
74
|
+
all-2g.48gb.gfx:
|
|
75
|
+
- devices: all
|
|
76
|
+
mig-enabled: true
|
|
77
|
+
mig-devices:
|
|
78
|
+
"2g.48gb+gfx": 2
|
|
79
|
+
|
|
80
|
+
all-2g.48gb.me.all:
|
|
81
|
+
- devices: all
|
|
82
|
+
mig-enabled: true
|
|
83
|
+
mig-devices:
|
|
84
|
+
"2g.48gb+me.all": 1
|
|
85
|
+
|
|
86
|
+
all-2g.48gb-me:
|
|
87
|
+
- devices: all
|
|
88
|
+
mig-enabled: true
|
|
89
|
+
mig-devices:
|
|
90
|
+
"2g.48gb-me": 2
|
|
91
|
+
|
|
92
|
+
all-4g.96gb:
|
|
93
|
+
- devices: all
|
|
94
|
+
mig-enabled: true
|
|
95
|
+
mig-devices:
|
|
96
|
+
"4g.96gb": 1
|
|
97
|
+
|
|
98
|
+
all-4g.96gb.gfx:
|
|
99
|
+
- devices: all
|
|
100
|
+
mig-enabled: true
|
|
101
|
+
mig-devices:
|
|
102
|
+
"4g.96gb+gfx": 1
|
|
103
|
+
|
|
104
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB, A100-40GB, A800-40GB
|
|
105
|
+
all-1g.10gb:
|
|
106
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB
|
|
107
|
+
- device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"]
|
|
108
|
+
devices: all
|
|
109
|
+
mig-enabled: true
|
|
110
|
+
mig-devices:
|
|
111
|
+
"1g.10gb": 7
|
|
112
|
+
|
|
113
|
+
# A100-40GB, A800-40GB
|
|
114
|
+
- device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"]
|
|
115
|
+
devices: all
|
|
116
|
+
mig-enabled: true
|
|
117
|
+
mig-devices:
|
|
118
|
+
"1g.10gb": 4
|
|
119
|
+
|
|
120
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB
|
|
121
|
+
all-1g.10gb.me:
|
|
122
|
+
- devices: all
|
|
123
|
+
mig-enabled: true
|
|
124
|
+
mig-devices:
|
|
125
|
+
"1g.10gb+me": 1
|
|
126
|
+
|
|
127
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB
|
|
128
|
+
all-1g.20gb:
|
|
129
|
+
- devices: all
|
|
130
|
+
mig-enabled: true
|
|
131
|
+
mig-devices:
|
|
132
|
+
"1g.20gb": 4
|
|
133
|
+
|
|
134
|
+
# GB200, B200
|
|
135
|
+
all-1g.23gb:
|
|
136
|
+
- devices: all
|
|
137
|
+
mig-enabled: true
|
|
138
|
+
mig-devices:
|
|
139
|
+
"1g.23gb": 7
|
|
140
|
+
|
|
141
|
+
# GB200, B200
|
|
142
|
+
all-1g.23gb.me:
|
|
143
|
+
- devices: all
|
|
144
|
+
mig-enabled: true
|
|
145
|
+
mig-devices:
|
|
146
|
+
"1g.23gb+me": 1
|
|
147
|
+
|
|
148
|
+
all-1g.24gb.me:
|
|
149
|
+
- devices: all
|
|
150
|
+
mig-enabled: true
|
|
151
|
+
mig-devices:
|
|
152
|
+
"1g.24gb+me": 1
|
|
153
|
+
|
|
154
|
+
all-2g.20gb:
|
|
155
|
+
- devices: all
|
|
156
|
+
mig-enabled: true
|
|
157
|
+
mig-devices:
|
|
158
|
+
"2g.20gb": 3
|
|
159
|
+
|
|
160
|
+
all-3g.40gb:
|
|
161
|
+
- devices: all
|
|
162
|
+
mig-enabled: true
|
|
163
|
+
mig-devices:
|
|
164
|
+
"3g.40gb": 2
|
|
165
|
+
|
|
166
|
+
all-4g.40gb:
|
|
167
|
+
- devices: all
|
|
168
|
+
mig-enabled: true
|
|
169
|
+
mig-devices:
|
|
170
|
+
"4g.40gb": 1
|
|
171
|
+
|
|
172
|
+
all-7g.80gb:
|
|
173
|
+
- devices: all
|
|
174
|
+
mig-enabled: true
|
|
175
|
+
mig-devices:
|
|
176
|
+
"7g.80gb": 1
|
|
177
|
+
|
|
178
|
+
# A30-24GB
|
|
179
|
+
all-1g.6gb:
|
|
180
|
+
- devices: all
|
|
181
|
+
mig-enabled: true
|
|
182
|
+
mig-devices:
|
|
183
|
+
"1g.6gb": 4
|
|
184
|
+
|
|
185
|
+
all-1g.6gb.me:
|
|
186
|
+
- devices: all
|
|
187
|
+
mig-enabled: true
|
|
188
|
+
mig-devices:
|
|
189
|
+
"1g.6gb+me": 1
|
|
190
|
+
|
|
191
|
+
all-2g.12gb:
|
|
192
|
+
- devices: all
|
|
193
|
+
mig-enabled: true
|
|
194
|
+
mig-devices:
|
|
195
|
+
"2g.12gb": 2
|
|
196
|
+
|
|
197
|
+
all-2g.12gb.me:
|
|
198
|
+
- devices: all
|
|
199
|
+
mig-enabled: true
|
|
200
|
+
mig-devices:
|
|
201
|
+
"2g.12gb+me": 1
|
|
202
|
+
|
|
203
|
+
all-4g.24gb:
|
|
204
|
+
- devices: all
|
|
205
|
+
mig-enabled: true
|
|
206
|
+
mig-devices:
|
|
207
|
+
"4g.24gb": 1
|
|
208
|
+
|
|
209
|
+
# H100 NVL, H800 NVL, GH200
|
|
210
|
+
all-1g.12gb:
|
|
211
|
+
- devices: all
|
|
212
|
+
mig-enabled: true
|
|
213
|
+
mig-devices:
|
|
214
|
+
"1g.12gb": 7
|
|
215
|
+
|
|
216
|
+
all-1g.12gb.me:
|
|
217
|
+
- devices: all
|
|
218
|
+
mig-enabled: true
|
|
219
|
+
mig-devices:
|
|
220
|
+
"1g.12gb+me": 1
|
|
221
|
+
|
|
222
|
+
all-1g.24gb:
|
|
223
|
+
- devices: all
|
|
224
|
+
mig-enabled: true
|
|
225
|
+
mig-devices:
|
|
226
|
+
"1g.24gb": 4
|
|
227
|
+
|
|
228
|
+
all-1g.45gb:
|
|
229
|
+
- devices: all
|
|
230
|
+
mig-enabled: true
|
|
231
|
+
mig-devices:
|
|
232
|
+
"1g.45gb": 4
|
|
233
|
+
|
|
234
|
+
all-1g.47gb:
|
|
235
|
+
- devices: all
|
|
236
|
+
mig-enabled: true
|
|
237
|
+
mig-devices:
|
|
238
|
+
"1g.47gb": 4
|
|
239
|
+
|
|
240
|
+
all-2g.24gb:
|
|
241
|
+
- devices: all
|
|
242
|
+
mig-enabled: true
|
|
243
|
+
mig-devices:
|
|
244
|
+
"2g.24gb": 3
|
|
245
|
+
|
|
246
|
+
all-2g.45gb:
|
|
247
|
+
- devices: all
|
|
248
|
+
mig-enabled: true
|
|
249
|
+
mig-devices:
|
|
250
|
+
"2g.45gb": 3
|
|
251
|
+
|
|
252
|
+
all-2g.47gb:
|
|
253
|
+
- devices: all
|
|
254
|
+
mig-enabled: true
|
|
255
|
+
mig-devices:
|
|
256
|
+
"2g.47gb": 3
|
|
257
|
+
|
|
258
|
+
# H100 NVL, H800 NVL
|
|
259
|
+
all-3g.47gb:
|
|
260
|
+
- devices: all
|
|
261
|
+
mig-enabled: true
|
|
262
|
+
mig-devices:
|
|
263
|
+
"3g.47gb": 2
|
|
264
|
+
|
|
265
|
+
all-4g.47gb:
|
|
266
|
+
- devices: all
|
|
267
|
+
mig-enabled: true
|
|
268
|
+
mig-devices:
|
|
269
|
+
"4g.47gb": 1
|
|
270
|
+
|
|
271
|
+
all-7g.94gb:
|
|
272
|
+
- devices: all
|
|
273
|
+
mig-enabled: true
|
|
274
|
+
mig-devices:
|
|
275
|
+
"7g.94gb": 1
|
|
276
|
+
|
|
277
|
+
# H100-96GB, PG506-96GB, GH200
|
|
278
|
+
all-3g.48gb:
|
|
279
|
+
- devices: all
|
|
280
|
+
mig-enabled: true
|
|
281
|
+
mig-devices:
|
|
282
|
+
"3g.48gb": 2
|
|
283
|
+
|
|
284
|
+
all-3g.90gb:
|
|
285
|
+
- devices: all
|
|
286
|
+
mig-enabled: true
|
|
287
|
+
mig-devices:
|
|
288
|
+
"3g.90gb": 2
|
|
289
|
+
|
|
290
|
+
all-3g.93gb:
|
|
291
|
+
- devices: all
|
|
292
|
+
mig-enabled: true
|
|
293
|
+
mig-devices:
|
|
294
|
+
"3g.93gb": 2
|
|
295
|
+
|
|
296
|
+
all-3g.95gb:
|
|
297
|
+
- devices: all
|
|
298
|
+
mig-enabled: true
|
|
299
|
+
mig-devices:
|
|
300
|
+
"3g.95gb": 2
|
|
301
|
+
|
|
302
|
+
all-4g.48gb:
|
|
303
|
+
- devices: all
|
|
304
|
+
mig-enabled: true
|
|
305
|
+
mig-devices:
|
|
306
|
+
"4g.48gb": 1
|
|
307
|
+
|
|
308
|
+
all-4g.90gb:
|
|
309
|
+
- devices: all
|
|
310
|
+
mig-enabled: true
|
|
311
|
+
mig-devices:
|
|
312
|
+
"4g.90gb": 1
|
|
313
|
+
|
|
314
|
+
all-4g.93gb:
|
|
315
|
+
- devices: all
|
|
316
|
+
mig-enabled: true
|
|
317
|
+
mig-devices:
|
|
318
|
+
"4g.93gb": 1
|
|
319
|
+
|
|
320
|
+
all-4g.95gb:
|
|
321
|
+
- devices: all
|
|
322
|
+
mig-enabled: true
|
|
323
|
+
mig-devices:
|
|
324
|
+
"4g.95gb": 1
|
|
325
|
+
|
|
326
|
+
all-7g.96gb:
|
|
327
|
+
- devices: all
|
|
328
|
+
mig-enabled: true
|
|
329
|
+
mig-devices:
|
|
330
|
+
"7g.96gb": 1
|
|
331
|
+
|
|
332
|
+
all-7g.180gb:
|
|
333
|
+
- devices: all
|
|
334
|
+
mig-enabled: true
|
|
335
|
+
mig-devices:
|
|
336
|
+
"7g.180gb": 1
|
|
337
|
+
|
|
338
|
+
all-7g.186gb:
|
|
339
|
+
- devices: all
|
|
340
|
+
mig-enabled: true
|
|
341
|
+
mig-devices:
|
|
342
|
+
"7g.186gb": 1
|
|
343
|
+
|
|
344
|
+
all-7g.189gb:
|
|
345
|
+
- devices: all
|
|
346
|
+
mig-enabled: true
|
|
347
|
+
mig-devices:
|
|
348
|
+
"7g.189gb": 1
|
|
349
|
+
|
|
350
|
+
# GB200 HGX, B200, GH200 144G HBM3e, H200-141GB, H200 NVL, H100-96GB, GH200, H100 NVL, H800 NVL, H100-80GB, H800-80GB, A800-40GB, A800-80GB, A100-40GB, A100-80GB, A30-24GB, PG506-96GB
|
|
351
|
+
all-balanced:
|
|
352
|
+
# GB200 HGX
|
|
353
|
+
- device-filter: ["0x294110DE"]
|
|
354
|
+
devices: all
|
|
355
|
+
mig-enabled: true
|
|
356
|
+
mig-devices:
|
|
357
|
+
"1g.23gb": 2
|
|
358
|
+
"2g.47gb": 1
|
|
359
|
+
"3g.93gb": 1
|
|
360
|
+
|
|
361
|
+
# RTX-PRO-6000-96GB
|
|
362
|
+
- device-filter: ["0x2BB510DE"]
|
|
363
|
+
devices: all
|
|
364
|
+
mig-enabled: true
|
|
365
|
+
mig-devices:
|
|
366
|
+
"1g.24gb": 2
|
|
367
|
+
"2g.48gb": 1
|
|
368
|
+
|
|
369
|
+
# B200
|
|
370
|
+
- device-filter: ["0x290110DE"]
|
|
371
|
+
devices: all
|
|
372
|
+
mig-enabled: true
|
|
373
|
+
mig-devices:
|
|
374
|
+
"1g.23gb": 2
|
|
375
|
+
"2g.45gb": 1
|
|
376
|
+
"3g.90gb": 1
|
|
377
|
+
|
|
378
|
+
# GH200 144G HBM3e
|
|
379
|
+
- device-filter: ["0x234810DE"]
|
|
380
|
+
devices: all
|
|
381
|
+
mig-enabled: true
|
|
382
|
+
mig-devices:
|
|
383
|
+
"1g.18gb": 2
|
|
384
|
+
"2g.36gb": 1
|
|
385
|
+
"3g.72gb": 1
|
|
386
|
+
|
|
387
|
+
# H200 141GB, H200 NVL
|
|
388
|
+
- device-filter: ["0x233510DE", "0x233B10DE"]
|
|
389
|
+
devices: all
|
|
390
|
+
mig-enabled: true
|
|
391
|
+
mig-devices:
|
|
392
|
+
"1g.18gb": 2
|
|
393
|
+
"2g.35gb": 1
|
|
394
|
+
"3g.71gb": 1
|
|
395
|
+
|
|
396
|
+
# H100 NVL, H800 NVL
|
|
397
|
+
- device-filter: ["0x232110DE", "0x233A10DE"]
|
|
398
|
+
devices: all
|
|
399
|
+
mig-enabled: true
|
|
400
|
+
mig-devices:
|
|
401
|
+
"1g.12gb": 2
|
|
402
|
+
"2g.24gb": 1
|
|
403
|
+
"3g.47gb": 1
|
|
404
|
+
|
|
405
|
+
# H100-80GB, H800-80GB, A100-80GB, A800-80GB
|
|
406
|
+
- device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"]
|
|
407
|
+
devices: all
|
|
408
|
+
mig-enabled: true
|
|
409
|
+
mig-devices:
|
|
410
|
+
"1g.10gb": 2
|
|
411
|
+
"2g.20gb": 1
|
|
412
|
+
"3g.40gb": 1
|
|
413
|
+
|
|
414
|
+
# A100-40GB, A800-40GB
|
|
415
|
+
- device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"]
|
|
416
|
+
devices: all
|
|
417
|
+
mig-enabled: true
|
|
418
|
+
mig-devices:
|
|
419
|
+
"1g.5gb": 2
|
|
420
|
+
"2g.10gb": 1
|
|
421
|
+
"3g.20gb": 1
|
|
422
|
+
|
|
423
|
+
# A30-24GB
|
|
424
|
+
- device-filter: "0x20B710DE"
|
|
425
|
+
devices: all
|
|
426
|
+
mig-enabled: true
|
|
427
|
+
mig-devices:
|
|
428
|
+
"1g.6gb": 2
|
|
429
|
+
"2g.12gb": 1
|
|
430
|
+
|
|
431
|
+
# H100-96GB, PG506-96GB, GH200, H20
|
|
432
|
+
- device-filter: ["0x234210DE", "0x233D10DE", "0x20B610DE", "0x232910DE"]
|
|
433
|
+
devices: all
|
|
434
|
+
mig-enabled: true
|
|
435
|
+
mig-devices:
|
|
436
|
+
"1g.12gb": 2
|
|
437
|
+
"2g.24gb": 1
|
|
438
|
+
"3g.48gb": 1
|
|
439
|
+
|
|
440
|
+
# H200-141GB, GH200 144G HBM3e
|
|
441
|
+
all-1g.18gb:
|
|
442
|
+
- devices: all
|
|
443
|
+
mig-enabled: true
|
|
444
|
+
mig-devices:
|
|
445
|
+
"1g.18gb": 7
|
|
446
|
+
|
|
447
|
+
all-1g.18gb.me:
|
|
448
|
+
- devices: all
|
|
449
|
+
mig-enabled: true
|
|
450
|
+
mig-devices:
|
|
451
|
+
"1g.18gb+me": 1
|
|
452
|
+
|
|
453
|
+
# H200-141GB
|
|
454
|
+
all-1g.35gb:
|
|
455
|
+
- devices: all
|
|
456
|
+
mig-enabled: true
|
|
457
|
+
mig-devices:
|
|
458
|
+
"1g.35gb": 4
|
|
459
|
+
|
|
460
|
+
all-2g.35gb:
|
|
461
|
+
- devices: all
|
|
462
|
+
mig-enabled: true
|
|
463
|
+
mig-devices:
|
|
464
|
+
"2g.35gb": 3
|
|
465
|
+
|
|
466
|
+
all-3g.71gb:
|
|
467
|
+
- devices: all
|
|
468
|
+
mig-enabled: true
|
|
469
|
+
mig-devices:
|
|
470
|
+
"3g.71gb": 2
|
|
471
|
+
|
|
472
|
+
all-4g.71gb:
|
|
473
|
+
- devices: all
|
|
474
|
+
mig-enabled: true
|
|
475
|
+
mig-devices:
|
|
476
|
+
"4g.71gb": 1
|
|
477
|
+
|
|
478
|
+
all-7g.141gb:
|
|
479
|
+
- devices: all
|
|
480
|
+
mig-enabled: true
|
|
481
|
+
mig-devices:
|
|
482
|
+
"7g.141gb": 1
|
|
483
|
+
|
|
484
|
+
# GH200 144G HBM3e
|
|
485
|
+
all-1g.36gb:
|
|
486
|
+
- devices: all
|
|
487
|
+
mig-enabled: true
|
|
488
|
+
mig-devices:
|
|
489
|
+
"1g.36gb": 4
|
|
490
|
+
|
|
491
|
+
all-2g.36gb:
|
|
492
|
+
- devices: all
|
|
493
|
+
mig-enabled: true
|
|
494
|
+
mig-devices:
|
|
495
|
+
"2g.36gb": 3
|
|
496
|
+
|
|
497
|
+
all-3g.72gb:
|
|
498
|
+
- devices: all
|
|
499
|
+
mig-enabled: true
|
|
500
|
+
mig-devices:
|
|
501
|
+
"3g.72gb": 2
|
|
502
|
+
|
|
503
|
+
all-4g.72gb:
|
|
504
|
+
- devices: all
|
|
505
|
+
mig-enabled: true
|
|
506
|
+
mig-devices:
|
|
507
|
+
"4g.72gb": 1
|
|
508
|
+
|
|
509
|
+
all-7g.144gb:
|
|
510
|
+
- devices: all
|
|
511
|
+
mig-enabled: true
|
|
512
|
+
mig-devices:
|
|
513
|
+
"7g.144gb": 1
|
|
514
|
+
|
|
515
|
+
# Custom: B200 mixed split — GPUs 0-5 stay full (reservable as --gpu-type b200),
|
|
516
|
+
# GPUs 6-7 partitioned per-GPU into 2x1g.23gb + 1x2g.45gb + 1x3g.90gb.
|
|
517
|
+
# Per node: 6 full + 4 small + 2 medium + 2 large slices.
|
|
518
|
+
b200-6full-2mig-balanced:
|
|
519
|
+
- device-filter: ["0x290110DE"]
|
|
520
|
+
devices: [0, 1, 2, 3, 4, 5]
|
|
521
|
+
mig-enabled: false
|
|
522
|
+
- device-filter: ["0x290110DE"]
|
|
523
|
+
devices: [6, 7]
|
|
524
|
+
mig-enabled: true
|
|
525
|
+
mig-devices:
|
|
526
|
+
"1g.23gb": 2
|
|
527
|
+
"2g.45gb": 1
|
|
528
|
+
"3g.90gb": 1
|
|
@@ -1,75 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
# Post-deploy setup for B200 MIG split (6 full + 2 partitioned per node).
|
|
3
|
-
# Run ONCE after PR #77 is merged + tf applied + the new docker/lambda is live.
|
|
4
|
-
|
|
5
|
-
set -e
|
|
6
|
-
|
|
7
|
-
NS=gpu-operator
|
|
8
|
-
CM=default-mig-parted-config
|
|
9
|
-
PROFILE_NAME=b200-6full-2mig-balanced
|
|
10
|
-
|
|
11
|
-
echo "=== Checking current MIG profile in ConfigMap ==="
|
|
12
|
-
if kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' | grep -q "$PROFILE_NAME:"; then
|
|
13
|
-
echo "Profile $PROFILE_NAME already present — skipping ConfigMap edit"
|
|
14
|
-
else
|
|
15
|
-
echo "Profile $PROFILE_NAME missing. Patching ConfigMap..."
|
|
16
|
-
|
|
17
|
-
# Save current ConfigMap content
|
|
18
|
-
kubectl -n "$NS" get configmap "$CM" -o yaml > /tmp/mig-config-backup.yaml
|
|
19
|
-
echo "Backup saved to /tmp/mig-config-backup.yaml"
|
|
20
|
-
|
|
21
|
-
# Append our profile under mig-configs:
|
|
22
|
-
# NOTE: this is a sed-driven append. ClusterPolicy's controller MAY revert this if it
|
|
23
|
-
# reconciles. If you see the profile disappear, re-run this script. If it keeps reverting,
|
|
24
|
-
# we'll need to fork the ConfigMap (next iteration).
|
|
25
|
-
kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' > /tmp/mig-config.yaml
|
|
26
|
-
|
|
27
|
-
cat >> /tmp/mig-config.yaml <<'EOF'
|
|
28
|
-
|
|
29
|
-
# Mixed B200 split: GPUs 0-5 stay full (reservable as --gpu-type b200), GPUs 6-7 partitioned.
|
|
30
|
-
# Per partitioned GPU: 2x 1g.23gb + 1x 2g.45gb + 1x 3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large.
|
|
31
|
-
b200-6full-2mig-balanced:
|
|
32
|
-
- device-filter: ["0x290110DE"]
|
|
33
|
-
devices: [0, 1, 2, 3, 4, 5]
|
|
34
|
-
mig-enabled: false
|
|
35
|
-
- device-filter: ["0x290110DE"]
|
|
36
|
-
devices: [6, 7]
|
|
37
|
-
mig-enabled: true
|
|
38
|
-
mig-devices:
|
|
39
|
-
"1g.23gb": 2
|
|
40
|
-
"2g.45gb": 1
|
|
41
|
-
"3g.90gb": 1
|
|
42
|
-
EOF
|
|
43
|
-
|
|
44
|
-
# Re-encode and patch
|
|
45
|
-
kubectl -n "$NS" create configmap "$CM" --from-file=config.yaml=/tmp/mig-config.yaml --dry-run=client -o yaml \
|
|
46
|
-
| kubectl -n "$NS" patch configmap "$CM" --patch-file=/dev/stdin
|
|
47
|
-
echo "ConfigMap patched."
|
|
48
|
-
fi
|
|
49
|
-
|
|
50
|
-
echo
|
|
51
|
-
echo "=== Picking a B200 node to label ==="
|
|
52
|
-
NODE=$(kubectl get nodes -l GpuType=b200 -o jsonpath='{.items[0].metadata.name}')
|
|
53
|
-
if [ -z "$NODE" ]; then
|
|
54
|
-
echo "No B200 nodes found. Exiting."
|
|
55
|
-
exit 1
|
|
56
|
-
fi
|
|
57
|
-
echo "Will label: $NODE"
|
|
58
|
-
read -p "Proceed? (y/N): " CONFIRM
|
|
59
|
-
if [ "$CONFIRM" != "y" ]; then
|
|
60
|
-
echo "Aborted."
|
|
61
|
-
exit 0
|
|
62
|
-
fi
|
|
63
|
-
|
|
64
|
-
kubectl label node "$NODE" "nvidia.com/mig.config=$PROFILE_NAME" --overwrite
|
|
65
|
-
echo "Node labelled. nvidia-mig-manager will partition GPUs 6-7 (drains existing pods if any)."
|
|
66
|
-
echo
|
|
67
|
-
echo "Watch progress with:"
|
|
68
|
-
echo " kubectl logs -n gpu-operator -l app=nvidia-mig-manager -f"
|
|
69
|
-
echo " kubectl get node $NODE -o jsonpath='{.status.allocatable}' | jq ."
|
|
70
|
-
echo
|
|
71
|
-
echo "After ~2-5 min, allocatable should show:"
|
|
72
|
-
echo " nvidia.com/gpu: 6"
|
|
73
|
-
echo " nvidia.com/mig-1g.23gb: 4"
|
|
74
|
-
echo " nvidia.com/mig-2g.45gb: 2"
|
|
75
|
-
echo " nvidia.com/mig-3g.90gb: 2"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/docker/setup-dotfiles-persistence
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/availability_updater/index.py
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py
RENAMED
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_expiry/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/index.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh
RENAMED
|
File without changes
|
|
File without changes
|
{gpu_dev-0.5.12 → gpu_dev-0.5.13}/terraform-gpu-devservers/templates/user-data-self-managed.sh
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|