gpu-dev 0.5.12__tar.gz → 0.5.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/PKG-INFO +1 -1
  2. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO +1 -1
  3. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt +2 -1
  4. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py +8 -9
  5. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py +2 -2
  6. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/pyproject.toml +1 -1
  7. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/kubernetes.tf +8 -0
  8. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda.tf +1 -1
  9. gpu_dev-0.5.14/terraform-gpu-devservers/mig-config.tf +72 -0
  10. gpu_dev-0.5.14/terraform-gpu-devservers/mig-parted-config.yaml +528 -0
  11. gpu_dev-0.5.12/terraform-gpu-devservers/scripts/b200-mig-setup.sh +0 -75
  12. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/.github/workflows/no-gitlinks.yml +0 -0
  13. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/.github/workflows/publish.yml +0 -0
  14. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/.gitignore +0 -0
  15. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/CLAUDE.md +0 -0
  16. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/PROGRESS.md +0 -0
  17. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/PR_DESCRIPTION.md +0 -0
  18. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/TODO.md +0 -0
  19. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/admin/README.md +0 -0
  20. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/admin/generate_stats.py +0 -0
  21. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/admin/requirements.txt +0 -0
  22. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/README.md +0 -0
  23. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/ZERO_CONFIG_SETUP.md +0 -0
  24. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/dependency_links.txt +0 -0
  25. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/entry_points.txt +0 -0
  26. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/requires.txt +0 -0
  27. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/top_level.txt +0 -0
  28. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/__init__.py +0 -0
  29. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/auth.py +0 -0
  30. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/config.py +0 -0
  31. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/disks.py +0 -0
  32. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/name_generator.py +0 -0
  33. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py +0 -0
  34. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/gpu_dev_cli/ssh_proxy.py +0 -0
  35. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/gpu-dev-cli/minimal-iam-policy.json +0 -0
  36. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/cli-tools/scripts/clear_stale_disk_locks.py +0 -0
  37. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/docs/USER_GUIDE.md +0 -0
  38. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/docs/devgpu-features.html +0 -0
  39. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/docs/docker-mark-blue.svg +0 -0
  40. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/docs/icons8-cursor-ai.svg +0 -0
  41. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/post.md +0 -0
  42. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/setup.cfg +0 -0
  43. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/.claude/skills/deploy.md +0 -0
  44. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/.terraform.lock.hcl +0 -0
  45. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/README.md +0 -0
  46. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/alb.tf +0 -0
  47. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/availability.tf +0 -0
  48. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/backend.tf +0 -0
  49. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/.dockerignore +0 -0
  50. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/Dockerfile +0 -0
  51. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/backup-dotfiles +0 -0
  52. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/bash_profile +0 -0
  53. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/bashrc +0 -0
  54. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/bashrc_ext +0 -0
  55. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/build-with-efa.sh +0 -0
  56. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/dotfiles-shutdown-handler +0 -0
  57. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/list-dotfile-versions +0 -0
  58. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/motd_script +0 -0
  59. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/nproc_wrapper +0 -0
  60. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/profile +0 -0
  61. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/restore-dotfiles +0 -0
  62. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/restore-dotfiles-version +0 -0
  63. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/setup-dotfiles-persistence +0 -0
  64. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/shell_env +0 -0
  65. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/ssh_config +0 -0
  66. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/zprofile +0 -0
  67. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/zshrc +0 -0
  68. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker/zshrc_ext +0 -0
  69. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker-build.tf +0 -0
  70. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker-example/Dockerfile +0 -0
  71. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/docker-example/hello.txt +0 -0
  72. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ecr.tf +0 -0
  73. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/efs.tf +0 -0
  74. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/eks.tf +0 -0
  75. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/expiry.tf +0 -0
  76. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/git-cache.tf +0 -0
  77. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/availability_updater/index.py +0 -0
  78. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/availability_updater/requirements.txt +0 -0
  79. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/migration/tag_largest_snapshots.py +0 -0
  80. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_expiry/index.py +0 -0
  81. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_expiry/requirements.txt +0 -0
  82. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/buildkit_job.py +0 -0
  83. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/index.py +0 -0
  84. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/reservation_processor/requirements.txt +0 -0
  85. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/__init__.py +0 -0
  86. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/alb_utils.py +0 -0
  87. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/dns_utils.py +0 -0
  88. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/k8s_client.py +0 -0
  89. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/k8s_resource_tracker.py +0 -0
  90. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/requirements.txt +0 -0
  91. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/lambda/shared/snapshot_utils.py +0 -0
  92. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/main.tf +0 -0
  93. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py +0 -0
  94. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/backfill_snapshot_contents.py.bak +0 -0
  95. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/check_snapshots.py +0 -0
  96. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/migrate_disks_to_named.py +0 -0
  97. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/migrations/run_backfill.sh +0 -0
  98. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/monitoring.tf +0 -0
  99. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/outputs.tf +0 -0
  100. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/pyproject.toml +0 -0
  101. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/queue.tf +0 -0
  102. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/route53.tf +0 -0
  103. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/s3-disk-contents.tf +0 -0
  104. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md +0 -0
  105. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/detect_empty_volumes.sh +0 -0
  106. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/ec2_avail_probe.sh +0 -0
  107. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/scripts/inspect_user_data.sh +0 -0
  108. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy/Dockerfile +0 -0
  109. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy/proxy.py +0 -0
  110. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy/requirements.txt +0 -0
  111. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy-service.tf +0 -0
  112. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/ssh-proxy.tf +0 -0
  113. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/switch-to.sh +0 -0
  114. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/al2023-cpu-user-data.sh +0 -0
  115. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/al2023-user-data.sh +0 -0
  116. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/user-data-self-managed.sh +0 -0
  117. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/templates/user-data.sh +0 -0
  118. {gpu_dev-0.5.12 → gpu_dev-0.5.14}/terraform-gpu-devservers/variables.tf +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.12
3
+ Version: 0.5.14
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gpu-dev
3
- Version: 0.5.12
3
+ Version: 0.5.14
4
4
  Summary: CLI tool for PyTorch GPU developer server reservations
5
5
  Author: PyTorch Team
6
6
  Requires-Python: >=3.10
@@ -47,6 +47,8 @@ terraform-gpu-devservers/git-cache.tf
47
47
  terraform-gpu-devservers/kubernetes.tf
48
48
  terraform-gpu-devservers/lambda.tf
49
49
  terraform-gpu-devservers/main.tf
50
+ terraform-gpu-devservers/mig-config.tf
51
+ terraform-gpu-devservers/mig-parted-config.yaml
50
52
  terraform-gpu-devservers/monitoring.tf
51
53
  terraform-gpu-devservers/outputs.tf
52
54
  terraform-gpu-devservers/pyproject.toml
@@ -101,7 +103,6 @@ terraform-gpu-devservers/migrations/check_snapshots.py
101
103
  terraform-gpu-devservers/migrations/migrate_disks_to_named.py
102
104
  terraform-gpu-devservers/migrations/run_backfill.sh
103
105
  terraform-gpu-devservers/scripts/CLEANUP_GUIDE.md
104
- terraform-gpu-devservers/scripts/b200-mig-setup.sh
105
106
  terraform-gpu-devservers/scripts/detect_empty_volumes.sh
106
107
  terraform-gpu-devservers/scripts/ec2_avail_probe.sh
107
108
  terraform-gpu-devservers/scripts/inspect_user_data.sh
@@ -688,6 +688,7 @@ def reserve(
688
688
  # and total wall-clock time drops from sum to max(each).
689
689
  from concurrent.futures import ThreadPoolExecutor
690
690
  config = load_config()
691
+ reservation_mgr = ReservationManager(config)
691
692
 
692
693
  with Live(
693
694
  Spinner("dots", text="🚀 Loading…"), console=console
@@ -704,9 +705,7 @@ def reserve(
704
705
  else:
705
706
  f_ssh = ex.submit(validate_ssh_key_matches_github_user, config, None)
706
707
  ssh_result = None
707
- f_avail = ex.submit(
708
- lambda: ReservationManager(config).get_gpu_availability_by_type()
709
- )
708
+ f_avail = ex.submit(reservation_mgr.get_gpu_availability_by_type)
710
709
 
711
710
  # Surface auth failure first (most actionable).
712
711
  try:
@@ -2496,10 +2495,10 @@ def _show_availability() -> None:
2496
2495
  table = Table(
2497
2496
  title="GPU Availability by Type (numbers are GPUs, not nodes)")
2498
2497
  table.add_column("GPU Type", style="cyan")
2499
- table.add_column("Available", style="green")
2500
- table.add_column("Max Reservable", style="bright_green")
2498
+ table.add_column("Avail", style="green")
2499
+ table.add_column("Max\nReservable", style="bright_green")
2501
2500
  table.add_column("Total", style="blue")
2502
- table.add_column("Queue Length", style="yellow")
2501
+ table.add_column("Queue\nLength", style="yellow")
2503
2502
  table.add_column("Architecture", style="dim")
2504
2503
  table.add_column("Est. Wait Time", style="magenta")
2505
2504
 
@@ -2657,10 +2656,10 @@ def _show_availability_watch(interval: int) -> None:
2657
2656
  table = Table(
2658
2657
  title="GPU Availability by Type (numbers are GPUs, not nodes)")
2659
2658
  table.add_column("GPU Type", style="cyan")
2660
- table.add_column("Available", style="green")
2661
- table.add_column("Max Reservable", style="blue")
2659
+ table.add_column("Avail", style="green")
2660
+ table.add_column("Max\nReservable", style="blue")
2662
2661
  table.add_column("Total", style="blue")
2663
- table.add_column("Queue Length", style="yellow")
2662
+ table.add_column("Queue\nLength", style="yellow")
2664
2663
  table.add_column("Architecture", style="dim")
2665
2664
  table.add_column("Est. Wait Time", style="magenta")
2666
2665
 
@@ -88,9 +88,9 @@ def select_gpu_type_interactive(
88
88
  console.print("\n[cyan]🖥️ GPU Availability:[/cyan]")
89
89
  table = Table()
90
90
  table.add_column("GPU Type", style="cyan")
91
- table.add_column("Available", style="green")
91
+ table.add_column("Avail", style="green")
92
92
  table.add_column("Total", style="blue")
93
- table.add_column("Queue Length", style="yellow")
93
+ table.add_column("Queue\nLength", style="yellow")
94
94
  table.add_column("Est. Wait Time", style="magenta")
95
95
 
96
96
  choices = []
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "gpu-dev"
7
- version = "0.5.12"
7
+ version = "0.5.14"
8
8
  description = "CLI tool for PyTorch GPU developer server reservations"
9
9
  authors = [{name = "PyTorch Team"}]
10
10
  readme = "cli-tools/gpu-dev-cli/README.md"
@@ -305,6 +305,14 @@ resource "helm_release" "nvidia_gpu_operator" {
305
305
  value = "all-disabled"
306
306
  }
307
307
 
308
+ # Read profiles from our forked ConfigMap (managed in mig-config.tf) instead of the
309
+ # operator's auto-created default-mig-parted-config. Lets us add custom mixed profiles
310
+ # like b200-6full-2mig-balanced without ClusterPolicy reconciliation reverting our edits.
311
+ set {
312
+ name = "migManager.config.name"
313
+ value = "gpu-dev-mig-parted-config"
314
+ }
315
+
308
316
  set {
309
317
  name = "nodeStatusExporter.enabled"
310
318
  value = "true"
@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
180
180
  HOSTED_ZONE_ID = local.effective_domain_name != "" ? local.hosted_zone_id : ""
181
181
  SSH_DOMAIN_MAPPINGS_TABLE = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
182
182
  SSL_CERTIFICATE_ARN = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
183
- LAMBDA_VERSION = "0.5.12"
183
+ LAMBDA_VERSION = "0.5.13"
184
184
  MIN_CLI_VERSION = "0.5.9"
185
185
  DISK_CONTENTS_BUCKET = aws_s3_bucket.disk_contents.bucket
186
186
  OPERATIONS_TABLE = aws_dynamodb_table.operations.name
@@ -0,0 +1,72 @@
1
+ # mig-config.tf — fork the NVIDIA mig-parted-config ConfigMap so we can add custom profiles
2
+ # without fighting NVIDIA ClusterPolicy's reconciliation of its default-mig-parted-config.
3
+ #
4
+ # The vendored mig-parted-config.yaml in this directory mirrors the upstream profiles plus our
5
+ # additions (e.g. b200-6full-2mig-balanced). Helm is told to use this ConfigMap by name via
6
+ # migManager.config.name in kubernetes.tf, so the GPU operator skips creating its default and
7
+ # reads ours instead.
8
+
9
+ resource "kubernetes_config_map" "gpu_dev_mig_parted_config" {
10
+ metadata {
11
+ name = "gpu-dev-mig-parted-config"
12
+ namespace = "gpu-operator"
13
+ labels = {
14
+ "app.kubernetes.io/managed-by" = "terraform"
15
+ "app.kubernetes.io/part-of" = "gpu-dev-servers"
16
+ }
17
+ }
18
+
19
+ data = {
20
+ "config.yaml" = file("${path.module}/mig-parted-config.yaml")
21
+ }
22
+
23
+ # The gpu-operator namespace is created by the helm release; depend on that so this ConfigMap
24
+ # lands AFTER the namespace exists.
25
+ depends_on = [helm_release.nvidia_gpu_operator]
26
+ }
27
+
28
+ # Declarative B200 MIG node label. Set b200_mig_node_name (per workspace via the locals lookup
29
+ # below, or override via tfvars / -var) to dedicate a specific B200 node to the mixed profile.
30
+ # Empty string means "no node labelled" — every B200 stays full.
31
+ #
32
+ # Future cleanup: when we split a B200 CR into two ASGs (one with mig_profile, one without),
33
+ # the user_data path will set this label at boot for any instance in the MIG-dedicated ASG —
34
+ # matching the H100 cr3 pattern. Until then, this declarative label pins the role to a hostname.
35
+ locals {
36
+ # Workspace-scoped defaults so the resource is a no-op in non-prod and no apply ever tries to
37
+ # label a node that doesn't exist.
38
+ default_b200_mig_node_by_workspace = {
39
+ prod = "ip-10-0-67-125.us-east-2.compute.internal"
40
+ }
41
+ b200_mig_node_effective = (
42
+ var.b200_mig_node_name != ""
43
+ ? var.b200_mig_node_name
44
+ : lookup(local.default_b200_mig_node_by_workspace, terraform.workspace, "")
45
+ )
46
+ }
47
+
48
+ variable "b200_mig_node_name" {
49
+ description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to use the per-workspace default in mig-config.tf."
50
+ type = string
51
+ default = ""
52
+ }
53
+
54
+ resource "kubernetes_labels" "b200_mig_node" {
55
+ count = local.b200_mig_node_effective == "" ? 0 : 1
56
+
57
+ api_version = "v1"
58
+ kind = "Node"
59
+
60
+ metadata {
61
+ name = local.b200_mig_node_effective
62
+ }
63
+
64
+ labels = {
65
+ "nvidia.com/mig.config" = "b200-6full-2mig-balanced"
66
+ }
67
+
68
+ # Take ownership of the label even if another tool (kubectl, gpu-operator) set it.
69
+ force = true
70
+
71
+ depends_on = [kubernetes_config_map.gpu_dev_mig_parted_config]
72
+ }
@@ -0,0 +1,528 @@
1
+ version: v1
2
+ mig-configs:
3
+ all-disabled:
4
+ - devices: all
5
+ mig-enabled: false
6
+
7
+ all-enabled:
8
+ - devices: all
9
+ mig-enabled: true
10
+ mig-devices: {}
11
+
12
+ # A100-40GB, A800-40GB
13
+ all-1g.5gb:
14
+ - devices: all
15
+ mig-enabled: true
16
+ mig-devices:
17
+ "1g.5gb": 7
18
+
19
+ all-1g.5gb.me:
20
+ - devices: all
21
+ mig-enabled: true
22
+ mig-devices:
23
+ "1g.5gb+me": 1
24
+
25
+ all-2g.10gb:
26
+ - devices: all
27
+ mig-enabled: true
28
+ mig-devices:
29
+ "2g.10gb": 3
30
+
31
+ all-3g.20gb:
32
+ - devices: all
33
+ mig-enabled: true
34
+ mig-devices:
35
+ "3g.20gb": 2
36
+
37
+ all-4g.20gb:
38
+ - devices: all
39
+ mig-enabled: true
40
+ mig-devices:
41
+ "4g.20gb": 1
42
+
43
+ all-7g.40gb:
44
+ - devices: all
45
+ mig-enabled: true
46
+ mig-devices:
47
+ "7g.40gb": 1
48
+
49
+ # RTX-PRO-6000-96GB
50
+ all-1g.24gb.gfx:
51
+ - devices: all
52
+ mig-enabled: true
53
+ mig-devices:
54
+ "1g.24gb+gfx": 4
55
+
56
+ all-1g.24gb.me.all:
57
+ - devices: all
58
+ mig-enabled: true
59
+ mig-devices:
60
+ "1g.24gb+me.all": 1
61
+
62
+ all-1g.24gb-me:
63
+ - devices: all
64
+ mig-enabled: true
65
+ mig-devices:
66
+ "1g.24gb-me": 4
67
+
68
+ all-2g.48gb:
69
+ - devices: all
70
+ mig-enabled: true
71
+ mig-devices:
72
+ "2g.48gb": 2
73
+
74
+ all-2g.48gb.gfx:
75
+ - devices: all
76
+ mig-enabled: true
77
+ mig-devices:
78
+ "2g.48gb+gfx": 2
79
+
80
+ all-2g.48gb.me.all:
81
+ - devices: all
82
+ mig-enabled: true
83
+ mig-devices:
84
+ "2g.48gb+me.all": 1
85
+
86
+ all-2g.48gb-me:
87
+ - devices: all
88
+ mig-enabled: true
89
+ mig-devices:
90
+ "2g.48gb-me": 2
91
+
92
+ all-4g.96gb:
93
+ - devices: all
94
+ mig-enabled: true
95
+ mig-devices:
96
+ "4g.96gb": 1
97
+
98
+ all-4g.96gb.gfx:
99
+ - devices: all
100
+ mig-enabled: true
101
+ mig-devices:
102
+ "4g.96gb+gfx": 1
103
+
104
+ # H100-80GB, H800-80GB, A100-80GB, A800-80GB, A100-40GB, A800-40GB
105
+ all-1g.10gb:
106
+ # H100-80GB, H800-80GB, A100-80GB, A800-80GB
107
+ - device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"]
108
+ devices: all
109
+ mig-enabled: true
110
+ mig-devices:
111
+ "1g.10gb": 7
112
+
113
+ # A100-40GB, A800-40GB
114
+ - device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"]
115
+ devices: all
116
+ mig-enabled: true
117
+ mig-devices:
118
+ "1g.10gb": 4
119
+
120
+ # H100-80GB, H800-80GB, A100-80GB, A800-80GB
121
+ all-1g.10gb.me:
122
+ - devices: all
123
+ mig-enabled: true
124
+ mig-devices:
125
+ "1g.10gb+me": 1
126
+
127
+ # H100-80GB, H800-80GB, A100-80GB, A800-80GB
128
+ all-1g.20gb:
129
+ - devices: all
130
+ mig-enabled: true
131
+ mig-devices:
132
+ "1g.20gb": 4
133
+
134
+ # GB200, B200
135
+ all-1g.23gb:
136
+ - devices: all
137
+ mig-enabled: true
138
+ mig-devices:
139
+ "1g.23gb": 7
140
+
141
+ # GB200, B200
142
+ all-1g.23gb.me:
143
+ - devices: all
144
+ mig-enabled: true
145
+ mig-devices:
146
+ "1g.23gb+me": 1
147
+
148
+ all-1g.24gb.me:
149
+ - devices: all
150
+ mig-enabled: true
151
+ mig-devices:
152
+ "1g.24gb+me": 1
153
+
154
+ all-2g.20gb:
155
+ - devices: all
156
+ mig-enabled: true
157
+ mig-devices:
158
+ "2g.20gb": 3
159
+
160
+ all-3g.40gb:
161
+ - devices: all
162
+ mig-enabled: true
163
+ mig-devices:
164
+ "3g.40gb": 2
165
+
166
+ all-4g.40gb:
167
+ - devices: all
168
+ mig-enabled: true
169
+ mig-devices:
170
+ "4g.40gb": 1
171
+
172
+ all-7g.80gb:
173
+ - devices: all
174
+ mig-enabled: true
175
+ mig-devices:
176
+ "7g.80gb": 1
177
+
178
+ # A30-24GB
179
+ all-1g.6gb:
180
+ - devices: all
181
+ mig-enabled: true
182
+ mig-devices:
183
+ "1g.6gb": 4
184
+
185
+ all-1g.6gb.me:
186
+ - devices: all
187
+ mig-enabled: true
188
+ mig-devices:
189
+ "1g.6gb+me": 1
190
+
191
+ all-2g.12gb:
192
+ - devices: all
193
+ mig-enabled: true
194
+ mig-devices:
195
+ "2g.12gb": 2
196
+
197
+ all-2g.12gb.me:
198
+ - devices: all
199
+ mig-enabled: true
200
+ mig-devices:
201
+ "2g.12gb+me": 1
202
+
203
+ all-4g.24gb:
204
+ - devices: all
205
+ mig-enabled: true
206
+ mig-devices:
207
+ "4g.24gb": 1
208
+
209
+ # H100 NVL, H800 NVL, GH200
210
+ all-1g.12gb:
211
+ - devices: all
212
+ mig-enabled: true
213
+ mig-devices:
214
+ "1g.12gb": 7
215
+
216
+ all-1g.12gb.me:
217
+ - devices: all
218
+ mig-enabled: true
219
+ mig-devices:
220
+ "1g.12gb+me": 1
221
+
222
+ all-1g.24gb:
223
+ - devices: all
224
+ mig-enabled: true
225
+ mig-devices:
226
+ "1g.24gb": 4
227
+
228
+ all-1g.45gb:
229
+ - devices: all
230
+ mig-enabled: true
231
+ mig-devices:
232
+ "1g.45gb": 4
233
+
234
+ all-1g.47gb:
235
+ - devices: all
236
+ mig-enabled: true
237
+ mig-devices:
238
+ "1g.47gb": 4
239
+
240
+ all-2g.24gb:
241
+ - devices: all
242
+ mig-enabled: true
243
+ mig-devices:
244
+ "2g.24gb": 3
245
+
246
+ all-2g.45gb:
247
+ - devices: all
248
+ mig-enabled: true
249
+ mig-devices:
250
+ "2g.45gb": 3
251
+
252
+ all-2g.47gb:
253
+ - devices: all
254
+ mig-enabled: true
255
+ mig-devices:
256
+ "2g.47gb": 3
257
+
258
+ # H100 NVL, H800 NVL
259
+ all-3g.47gb:
260
+ - devices: all
261
+ mig-enabled: true
262
+ mig-devices:
263
+ "3g.47gb": 2
264
+
265
+ all-4g.47gb:
266
+ - devices: all
267
+ mig-enabled: true
268
+ mig-devices:
269
+ "4g.47gb": 1
270
+
271
+ all-7g.94gb:
272
+ - devices: all
273
+ mig-enabled: true
274
+ mig-devices:
275
+ "7g.94gb": 1
276
+
277
+ # H100-96GB, PG506-96GB, GH200
278
+ all-3g.48gb:
279
+ - devices: all
280
+ mig-enabled: true
281
+ mig-devices:
282
+ "3g.48gb": 2
283
+
284
+ all-3g.90gb:
285
+ - devices: all
286
+ mig-enabled: true
287
+ mig-devices:
288
+ "3g.90gb": 2
289
+
290
+ all-3g.93gb:
291
+ - devices: all
292
+ mig-enabled: true
293
+ mig-devices:
294
+ "3g.93gb": 2
295
+
296
+ all-3g.95gb:
297
+ - devices: all
298
+ mig-enabled: true
299
+ mig-devices:
300
+ "3g.95gb": 2
301
+
302
+ all-4g.48gb:
303
+ - devices: all
304
+ mig-enabled: true
305
+ mig-devices:
306
+ "4g.48gb": 1
307
+
308
+ all-4g.90gb:
309
+ - devices: all
310
+ mig-enabled: true
311
+ mig-devices:
312
+ "4g.90gb": 1
313
+
314
+ all-4g.93gb:
315
+ - devices: all
316
+ mig-enabled: true
317
+ mig-devices:
318
+ "4g.93gb": 1
319
+
320
+ all-4g.95gb:
321
+ - devices: all
322
+ mig-enabled: true
323
+ mig-devices:
324
+ "4g.95gb": 1
325
+
326
+ all-7g.96gb:
327
+ - devices: all
328
+ mig-enabled: true
329
+ mig-devices:
330
+ "7g.96gb": 1
331
+
332
+ all-7g.180gb:
333
+ - devices: all
334
+ mig-enabled: true
335
+ mig-devices:
336
+ "7g.180gb": 1
337
+
338
+ all-7g.186gb:
339
+ - devices: all
340
+ mig-enabled: true
341
+ mig-devices:
342
+ "7g.186gb": 1
343
+
344
+ all-7g.189gb:
345
+ - devices: all
346
+ mig-enabled: true
347
+ mig-devices:
348
+ "7g.189gb": 1
349
+
350
+ # GB200 HGX, B200, GH200 144G HBM3e, H200-141GB, H200 NVL, H100-96GB, GH200, H100 NVL, H800 NVL, H100-80GB, H800-80GB, A800-40GB, A800-80GB, A100-40GB, A100-80GB, A30-24GB, PG506-96GB
351
+ all-balanced:
352
+ # GB200 HGX
353
+ - device-filter: ["0x294110DE"]
354
+ devices: all
355
+ mig-enabled: true
356
+ mig-devices:
357
+ "1g.23gb": 2
358
+ "2g.47gb": 1
359
+ "3g.93gb": 1
360
+
361
+ # RTX-PRO-6000-96GB
362
+ - device-filter: ["0x2BB510DE"]
363
+ devices: all
364
+ mig-enabled: true
365
+ mig-devices:
366
+ "1g.24gb": 2
367
+ "2g.48gb": 1
368
+
369
+ # B200
370
+ - device-filter: ["0x290110DE"]
371
+ devices: all
372
+ mig-enabled: true
373
+ mig-devices:
374
+ "1g.23gb": 2
375
+ "2g.45gb": 1
376
+ "3g.90gb": 1
377
+
378
+ # GH200 144G HBM3e
379
+ - device-filter: ["0x234810DE"]
380
+ devices: all
381
+ mig-enabled: true
382
+ mig-devices:
383
+ "1g.18gb": 2
384
+ "2g.36gb": 1
385
+ "3g.72gb": 1
386
+
387
+ # H200 141GB, H200 NVL
388
+ - device-filter: ["0x233510DE", "0x233B10DE"]
389
+ devices: all
390
+ mig-enabled: true
391
+ mig-devices:
392
+ "1g.18gb": 2
393
+ "2g.35gb": 1
394
+ "3g.71gb": 1
395
+
396
+ # H100 NVL, H800 NVL
397
+ - device-filter: ["0x232110DE", "0x233A10DE"]
398
+ devices: all
399
+ mig-enabled: true
400
+ mig-devices:
401
+ "1g.12gb": 2
402
+ "2g.24gb": 1
403
+ "3g.47gb": 1
404
+
405
+ # H100-80GB, H800-80GB, A100-80GB, A800-80GB
406
+ - device-filter: ["0x233010DE", "0x233110DE", "0x232210DE", "0x20B210DE", "0x20B510DE", "0x20F310DE", "0x20F510DE", "0x232410DE"]
407
+ devices: all
408
+ mig-enabled: true
409
+ mig-devices:
410
+ "1g.10gb": 2
411
+ "2g.20gb": 1
412
+ "3g.40gb": 1
413
+
414
+ # A100-40GB, A800-40GB
415
+ - device-filter: ["0x20B010DE", "0x20B110DE", "0x20F110DE", "0x20F610DE"]
416
+ devices: all
417
+ mig-enabled: true
418
+ mig-devices:
419
+ "1g.5gb": 2
420
+ "2g.10gb": 1
421
+ "3g.20gb": 1
422
+
423
+ # A30-24GB
424
+ - device-filter: "0x20B710DE"
425
+ devices: all
426
+ mig-enabled: true
427
+ mig-devices:
428
+ "1g.6gb": 2
429
+ "2g.12gb": 1
430
+
431
+ # H100-96GB, PG506-96GB, GH200, H20
432
+ - device-filter: ["0x234210DE", "0x233D10DE", "0x20B610DE", "0x232910DE"]
433
+ devices: all
434
+ mig-enabled: true
435
+ mig-devices:
436
+ "1g.12gb": 2
437
+ "2g.24gb": 1
438
+ "3g.48gb": 1
439
+
440
+ # H200-141GB, GH200 144G HBM3e
441
+ all-1g.18gb:
442
+ - devices: all
443
+ mig-enabled: true
444
+ mig-devices:
445
+ "1g.18gb": 7
446
+
447
+ all-1g.18gb.me:
448
+ - devices: all
449
+ mig-enabled: true
450
+ mig-devices:
451
+ "1g.18gb+me": 1
452
+
453
+ # H200-141GB
454
+ all-1g.35gb:
455
+ - devices: all
456
+ mig-enabled: true
457
+ mig-devices:
458
+ "1g.35gb": 4
459
+
460
+ all-2g.35gb:
461
+ - devices: all
462
+ mig-enabled: true
463
+ mig-devices:
464
+ "2g.35gb": 3
465
+
466
+ all-3g.71gb:
467
+ - devices: all
468
+ mig-enabled: true
469
+ mig-devices:
470
+ "3g.71gb": 2
471
+
472
+ all-4g.71gb:
473
+ - devices: all
474
+ mig-enabled: true
475
+ mig-devices:
476
+ "4g.71gb": 1
477
+
478
+ all-7g.141gb:
479
+ - devices: all
480
+ mig-enabled: true
481
+ mig-devices:
482
+ "7g.141gb": 1
483
+
484
+ # GH200 144G HBM3e
485
+ all-1g.36gb:
486
+ - devices: all
487
+ mig-enabled: true
488
+ mig-devices:
489
+ "1g.36gb": 4
490
+
491
+ all-2g.36gb:
492
+ - devices: all
493
+ mig-enabled: true
494
+ mig-devices:
495
+ "2g.36gb": 3
496
+
497
+ all-3g.72gb:
498
+ - devices: all
499
+ mig-enabled: true
500
+ mig-devices:
501
+ "3g.72gb": 2
502
+
503
+ all-4g.72gb:
504
+ - devices: all
505
+ mig-enabled: true
506
+ mig-devices:
507
+ "4g.72gb": 1
508
+
509
+ all-7g.144gb:
510
+ - devices: all
511
+ mig-enabled: true
512
+ mig-devices:
513
+ "7g.144gb": 1
514
+
515
+ # Custom: B200 mixed split — GPUs 0-5 stay full (reservable as --gpu-type b200),
516
+ # GPUs 6-7 partitioned per-GPU into 2x1g.23gb + 1x2g.45gb + 1x3g.90gb.
517
+ # Per node: 6 full + 4 small + 2 medium + 2 large slices.
518
+ b200-6full-2mig-balanced:
519
+ - device-filter: ["0x290110DE"]
520
+ devices: [0, 1, 2, 3, 4, 5]
521
+ mig-enabled: false
522
+ - device-filter: ["0x290110DE"]
523
+ devices: [6, 7]
524
+ mig-enabled: true
525
+ mig-devices:
526
+ "1g.23gb": 2
527
+ "2g.45gb": 1
528
+ "3g.90gb": 1
@@ -1,75 +0,0 @@
1
- #!/bin/bash
2
- # Post-deploy setup for B200 MIG split (6 full + 2 partitioned per node).
3
- # Run ONCE after PR #77 is merged + tf applied + the new docker/lambda is live.
4
-
5
- set -e
6
-
7
- NS=gpu-operator
8
- CM=default-mig-parted-config
9
- PROFILE_NAME=b200-6full-2mig-balanced
10
-
11
- echo "=== Checking current MIG profile in ConfigMap ==="
12
- if kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' | grep -q "$PROFILE_NAME:"; then
13
- echo "Profile $PROFILE_NAME already present — skipping ConfigMap edit"
14
- else
15
- echo "Profile $PROFILE_NAME missing. Patching ConfigMap..."
16
-
17
- # Save current ConfigMap content
18
- kubectl -n "$NS" get configmap "$CM" -o yaml > /tmp/mig-config-backup.yaml
19
- echo "Backup saved to /tmp/mig-config-backup.yaml"
20
-
21
- # Append our profile under mig-configs:
22
- # NOTE: this is a sed-driven append. ClusterPolicy's controller MAY revert this if it
23
- # reconciles. If you see the profile disappear, re-run this script. If it keeps reverting,
24
- # we'll need to fork the ConfigMap (next iteration).
25
- kubectl -n "$NS" get configmap "$CM" -o jsonpath='{.data.config\.yaml}' > /tmp/mig-config.yaml
26
-
27
- cat >> /tmp/mig-config.yaml <<'EOF'
28
-
29
- # Mixed B200 split: GPUs 0-5 stay full (reservable as --gpu-type b200), GPUs 6-7 partitioned.
30
- # Per partitioned GPU: 2x 1g.23gb + 1x 2g.45gb + 1x 3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large.
31
- b200-6full-2mig-balanced:
32
- - device-filter: ["0x290110DE"]
33
- devices: [0, 1, 2, 3, 4, 5]
34
- mig-enabled: false
35
- - device-filter: ["0x290110DE"]
36
- devices: [6, 7]
37
- mig-enabled: true
38
- mig-devices:
39
- "1g.23gb": 2
40
- "2g.45gb": 1
41
- "3g.90gb": 1
42
- EOF
43
-
44
- # Re-encode and patch
45
- kubectl -n "$NS" create configmap "$CM" --from-file=config.yaml=/tmp/mig-config.yaml --dry-run=client -o yaml \
46
- | kubectl -n "$NS" patch configmap "$CM" --patch-file=/dev/stdin
47
- echo "ConfigMap patched."
48
- fi
49
-
50
- echo
51
- echo "=== Picking a B200 node to label ==="
52
- NODE=$(kubectl get nodes -l GpuType=b200 -o jsonpath='{.items[0].metadata.name}')
53
- if [ -z "$NODE" ]; then
54
- echo "No B200 nodes found. Exiting."
55
- exit 1
56
- fi
57
- echo "Will label: $NODE"
58
- read -p "Proceed? (y/N): " CONFIRM
59
- if [ "$CONFIRM" != "y" ]; then
60
- echo "Aborted."
61
- exit 0
62
- fi
63
-
64
- kubectl label node "$NODE" "nvidia.com/mig.config=$PROFILE_NAME" --overwrite
65
- echo "Node labelled. nvidia-mig-manager will partition GPUs 6-7 (drains existing pods if any)."
66
- echo
67
- echo "Watch progress with:"
68
- echo " kubectl logs -n gpu-operator -l app=nvidia-mig-manager -f"
69
- echo " kubectl get node $NODE -o jsonpath='{.status.allocatable}' | jq ."
70
- echo
71
- echo "After ~2-5 min, allocatable should show:"
72
- echo " nvidia.com/gpu: 6"
73
- echo " nvidia.com/mig-1g.23gb: 4"
74
- echo " nvidia.com/mig-2g.45gb: 2"
75
- echo " nvidia.com/mig-3g.90gb: 2"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes