PyPI - gpu-dev - Versions diffs - 0.5.11__tar.gz → 0.5.13__tar.gz - Mend

gpu-dev 0.5.11__tar.gz → 0.5.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (117) hide show

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/.github/workflows/no-gitlinks.yml RENAMED Viewed

@@ -14,7 +14,7 @@ jobs:
         uses: actions/checkout@v4
       - name: Ensure no gitlinks are tracked
         run: |
-          gitlinks=$(git ls-files -s | awk "$1 == 160000 {print}")
+          gitlinks=$(git ls-files -s | awk '$1 == 160000 {print}')
           if [ -n "$gitlinks" ]; then
             echo "Unexpected gitlinks found:"
             echo "$gitlinks"

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.5.11
+Version: 0.5.13
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gpu-dev
-Version: 0.5.11
+Version: 0.5.13
 Summary: CLI tool for PyTorch GPU developer server reservations
 Author: PyTorch Team
 Requires-Python: >=3.10

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev.egg-info/SOURCES.txt RENAMED Viewed

@@ -47,6 +47,8 @@ terraform-gpu-devservers/git-cache.tf
 terraform-gpu-devservers/kubernetes.tf
 terraform-gpu-devservers/lambda.tf
 terraform-gpu-devservers/main.tf
+terraform-gpu-devservers/mig-config.tf
+terraform-gpu-devservers/mig-parted-config.yaml
 terraform-gpu-devservers/monitoring.tf
 terraform-gpu-devservers/outputs.tf
 terraform-gpu-devservers/pyproject.toml

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/cli.py RENAMED Viewed

@@ -495,9 +495,9 @@ def main(ctx: click.Context) -> None:
     "--gpu-type",
     "-t",
     type=click.Choice(
-        ["b200", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
+        ["b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g", "h200", "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g", "a100", "rtxpro6000", "a10g", "t4", "l4", "t4-small", "cpu-arm", "cpu-x86"], case_sensitive=False
     ),
-    help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices (partial GPU on a single shared node): h100-mig-1g (10 GB / 1/7 H100 compute), h100-mig-2g (20 GB / 2/7 H100), h100-mig-3g (40 GB / 3/7 H100). CPU only: cpu-arm, cpu-x86.",
+    help="GPU type to reserve. Full GPUs: b200, h200, h100, a100, rtxpro6000, a10g, t4, l4, t4-small. H100 MIG slices: h100-mig-1g (10 GB), h100-mig-2g (20 GB), h100-mig-3g (40 GB). B200 MIG slices (on the mixed B200 node): b200-mig-1g (23 GB), b200-mig-2g (45 GB), b200-mig-3g (90 GB). CPU: cpu-arm, cpu-x86.",
 )
 @click.option(
     "--hours",
@@ -656,6 +656,9 @@ def reserve(
             "h100-mig-1g": {"max_gpus": 16, "instance_type": "p5.48xlarge"},
             "h100-mig-2g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
             "h100-mig-3g": {"max_gpus": 8, "instance_type": "p5.48xlarge"},
+            "b200-mig-1g": {"max_gpus": 4, "instance_type": "p6-b200.48xlarge"},
+            "b200-mig-2g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
+            "b200-mig-3g": {"max_gpus": 2, "instance_type": "p6-b200.48xlarge"},
             "h200": {"max_gpus": 8, "instance_type": "p5e.48xlarge"},
             "b200": {"max_gpus": 8, "instance_type": "p6-b200.48xlarge"},
             "cpu-arm": {"max_gpus": 0, "instance_type": "c7g.4xlarge"},
@@ -2454,6 +2457,9 @@ def _show_availability() -> None:
                 "h100-mig-1g": "Hopper (sm90, MIG 10GB)",
                 "h100-mig-2g": "Hopper (sm90, MIG 20GB)",
                 "h100-mig-3g": "Hopper (sm90, MIG 40GB)",
+                "b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
+                "b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
+                "b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
                 "t4": "Turing (sm75)",
                 "cpu-x86": "CPU (x86_64)",
                 "cpu-arm": "CPU (arm64)",
@@ -2462,6 +2468,9 @@ def _show_availability() -> None:
             # Sort order: newest GPU architectures first, then CPUs at the bottom
             arch_priority = {
                 "Blackwell (sm100)": 0,
+                "Blackwell (sm100, MIG 90GB)": 0,
+                "Blackwell (sm100, MIG 45GB)": 0,
+                "Blackwell (sm100, MIG 23GB)": 0,
                 "Blackwell (sm120)": 0,
                 "Hopper (sm90)": 1,
                 "Hopper (sm90, MIG 40GB)": 1,
@@ -2609,6 +2618,9 @@ def _show_availability_watch(interval: int) -> None:
                             "h100-mig-1g": "Hopper (sm90, MIG 10GB)",
                             "h100-mig-2g": "Hopper (sm90, MIG 20GB)",
                             "h100-mig-3g": "Hopper (sm90, MIG 40GB)",
+                            "b200-mig-1g": "Blackwell (sm100, MIG 23GB)",
+                            "b200-mig-2g": "Blackwell (sm100, MIG 45GB)",
+                            "b200-mig-3g": "Blackwell (sm100, MIG 90GB)",
                             "t4": "Turing (sm75)",
                             "cpu-x86": "CPU (x86_64)",
                             "cpu-arm": "CPU (arm64)",
@@ -2617,6 +2629,9 @@ def _show_availability_watch(interval: int) -> None:
                         # Sort order: newest GPU architectures first, then CPUs at the bottom
                         arch_priority = {
                             "Blackwell (sm100)": 0,
+                            "Blackwell (sm100, MIG 90GB)": 0,
+                            "Blackwell (sm100, MIG 45GB)": 0,
+                            "Blackwell (sm100, MIG 23GB)": 0,
                             "Blackwell (sm120)": 0,
                             "Hopper (sm90)": 1,
                             "Hopper (sm90, MIG 40GB)": 1,

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/interactive.py RENAMED Viewed

@@ -64,17 +64,25 @@ def select_gpu_type_interactive(
         if "-mig-" not in gt
     }
-    # Aggregate MIG slice availability so we can hint it on the h100 row of this picker.
-    mig_total_available = sum(
-        int(info.get("available", 0))
-        for gt, info in (availability_info or {}).items()
-        if gt.startswith("h100-mig-")
-    )
-    mig_total_capacity = sum(
-        int(info.get("total", 0))
-        for gt, info in (availability_info or {}).items()
-        if gt.startswith("h100-mig-")
-    )
+    # Aggregate MIG slice availability per parent type, hinted on the h100/b200 rows.
+    def _mig_aggregates(parent: str):
+        avail = sum(
+            int(info.get("available", 0))
+            for gt, info in (availability_info or {}).items()
+            if gt.startswith(f"{parent}-mig-")
+        )
+        cap = sum(
+            int(info.get("total", 0))
+            for gt, info in (availability_info or {}).items()
+            if gt.startswith(f"{parent}-mig-")
+        )
+        return avail, cap
+    h100_mig_avail, h100_mig_capacity = _mig_aggregates("h100")
+    b200_mig_avail, b200_mig_capacity = _mig_aggregates("b200")
+    # Backwards-compat aliases for the existing h100 row code below.
+    mig_total_available = h100_mig_avail
+    mig_total_capacity = h100_mig_capacity
     # Display availability table first
     console.print("\n[cyan]🖥️  GPU Availability:[/cyan]")
@@ -146,6 +154,8 @@ def select_gpu_type_interactive(
                 choice_label += f" - {queue_length} in queue"
             if gpu_type == "h100" and mig_total_capacity > 0:
                 choice_label += f" — also {mig_total_available}/{mig_total_capacity} MIG slices"
+            elif gpu_type == "b200" and b200_mig_capacity > 0:
+                choice_label += f" — also {b200_mig_avail}/{b200_mig_capacity} MIG slices"
             choices.append(questionary.Choice(title=choice_label, value=gpu_type))
@@ -223,27 +233,31 @@ def select_gpu_count_interactive(
     parent_size_etas = parent_info.get("size_etas", {}) or {}
     _now_ts = int(_time.time())
-    # MIG slice submenu: only for h100. Each tuple is (target_gpu_type, gpu_count, gb_label).
+    # MIG slice submenu: h100 (16+8+8 slices/node) or b200 (4+2+2 slices/node).
     mig_options = []
-    if gpu_type == "h100":
-        # Map to internal SKUs; the count menu surfaces 1/2/4 of each slice size.
-        mig_specs = [
-            ("h100-mig-1g", "10GB"),
-            ("h100-mig-2g", "20GB"),
-            ("h100-mig-3g", "40GB"),
-        ]
-        for sku, gb in mig_specs:
-            slice_max = {"h100-mig-1g": 16, "h100-mig-2g": 8, "h100-mig-3g": 8}[sku]
-            free = None
-            if availability_info and sku in availability_info:
-                free = availability_info[sku].get("available", 0)
-            for n in [1, 2, 4]:
-                if n > slice_max:
-                    continue
-                noun = "slice" if n == 1 else "slices"
-                avail_suffix = f"  [{free} free]" if free is not None else ""
-                label = f"{n} × {gb} {noun}{avail_suffix}"
-                mig_options.append((sku, n, label))
+    mig_spec_map = {
+        "h100": [
+            ("h100-mig-1g", "10GB", 16),
+            ("h100-mig-2g", "20GB", 8),
+            ("h100-mig-3g", "40GB", 8),
+        ],
+        "b200": [
+            ("b200-mig-1g", "23GB", 4),
+            ("b200-mig-2g", "45GB", 2),
+            ("b200-mig-3g", "90GB", 2),
+        ],
+    }
+    for sku, gb, slice_max in mig_spec_map.get(gpu_type, []):
+        free = None
+        if availability_info and sku in availability_info:
+            free = availability_info[sku].get("available", 0)
+        for n in [1, 2, 4]:
+            if n > slice_max:
+                continue
+            noun = "slice" if n == 1 else "slices"
+            avail_suffix = f"  [{free} free]" if free is not None else ""
+            label = f"{n} × {gb} {noun}{avail_suffix}"
+            mig_options.append((sku, n, label))
     # Filter single-node by actual max for this GPU type
     valid_counts = [count for count in valid_counts if count <= max_gpus]

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/cli-tools/gpu-dev-cli/gpu_dev_cli/reservations.py RENAMED Viewed

@@ -543,6 +543,9 @@ class ReservationManager:
                 "h100-mig-1g": {"max_gpus": 16},
                 "h100-mig-2g": {"max_gpus": 8},
                 "h100-mig-3g": {"max_gpus": 8},
+                "b200-mig-1g": {"max_gpus": 4},
+                "b200-mig-2g": {"max_gpus": 2},
+                "b200-mig-3g": {"max_gpus": 2},
                 "h200": {"max_gpus": 8},
                 "b200": {"max_gpus": 8},
             }

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "gpu-dev"
-version = "0.5.11"
+version = "0.5.13"
 description = "CLI tool for PyTorch GPU developer server reservations"
 authors = [{name = "PyTorch Team"}]
 readme = "cli-tools/gpu-dev-cli/README.md"

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/kubernetes.tf RENAMED Viewed

@@ -305,6 +305,14 @@ resource "helm_release" "nvidia_gpu_operator" {
     value = "all-disabled"
   }
+  # Read profiles from our forked ConfigMap (managed in mig-config.tf) instead of the
+  # operator's auto-created default-mig-parted-config. Lets us add custom mixed profiles
+  # like b200-6full-2mig-balanced without ClusterPolicy reconciliation reverting our edits.
+  set {
+    name  = "migManager.config.name"
+    value = "gpu-dev-mig-parted-config"
+  }
   set {
     name  = "nodeStatusExporter.enabled"
     value = "true"

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda/reservation_processor/index.py RENAMED Viewed

@@ -71,6 +71,10 @@ GPU_CONFIG = {
     "h100-mig-1g": {"instance_type": "p5.48xlarge", "max_gpus": 16, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.10gb", "node_gpu_type": "h100"},
     "h100-mig-2g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.20gb", "node_gpu_type": "h100"},
     "h100-mig-3g": {"instance_type": "p5.48xlarge", "max_gpus": 8, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.40gb", "node_gpu_type": "h100"},
+    # B200 MIG slices on the b200-6full-2mig-balanced node (6 full GPUs + 2 partitioned per node).
+    "b200-mig-1g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 4, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-1g.23gb", "node_gpu_type": "b200"},
+    "b200-mig-2g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-2g.45gb", "node_gpu_type": "b200"},
+    "b200-mig-3g": {"instance_type": "p6-b200.48xlarge", "max_gpus": 2, "cpus": 192, "memory_gb": 2048, "efa_count": 0, "k8s_resource": "nvidia.com/mig-3g.90gb", "node_gpu_type": "b200"},
     "t4-small": {"instance_type": "g4dn.2xlarge", "max_gpus": 1, "cpus": 8, "memory_gb": 32, "efa_count": 0},
     "g5g": {"instance_type": "g5g.2xlarge", "max_gpus": 2, "cpus": 8, "memory_gb": 32, "efa_count": 0},
     "a100": {"instance_type": "p4d.24xlarge", "max_gpus": 8, "cpus": 96, "memory_gb": 1152, "efa_count": 4},
@@ -2167,7 +2171,8 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
     # Validate GPU type
     valid_gpu_types = ["t4", "l4", "a10g", "rtxpro6000", "t4-small", "a100",
                        "h100", "h100-mig-1g", "h100-mig-2g", "h100-mig-3g",
-                       "h200", "b200", "cpu-arm", "cpu-x86"]
+                       "h200", "b200", "b200-mig-1g", "b200-mig-2g", "b200-mig-3g",
+                       "cpu-arm", "cpu-x86"]
     if gpu_type not in valid_gpu_types:
         error_msg = f"Invalid GPU type: {gpu_type}. Must be one of: {', '.join(valid_gpu_types)}"
         logger.error(error_msg)
@@ -2408,6 +2413,9 @@ def update_gpu_availability_table(
             "h100-mig-1g": {"gpus_per_instance": 16},
             "h100-mig-2g": {"gpus_per_instance": 8},
             "h100-mig-3g": {"gpus_per_instance": 8},
+            "b200-mig-1g": {"gpus_per_instance": 4},
+            "b200-mig-2g": {"gpus_per_instance": 2},
+            "b200-mig-3g": {"gpus_per_instance": 2},
             "h200": {"gpus_per_instance": 8},
             "b200": {"gpus_per_instance": 8},
         }

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/lambda.tf RENAMED Viewed

@@ -180,7 +180,7 @@ resource "aws_lambda_function" "reservation_processor" {
       HOSTED_ZONE_ID                     = local.effective_domain_name != "" ? local.hosted_zone_id : ""
       SSH_DOMAIN_MAPPINGS_TABLE          = local.effective_domain_name != "" ? aws_dynamodb_table.ssh_domain_mappings.name : ""
       SSL_CERTIFICATE_ARN                = local.effective_domain_name != "" ? aws_acm_certificate.wildcard[0].arn : ""
-      LAMBDA_VERSION                     = "0.5.11"
+      LAMBDA_VERSION                     = "0.5.13"
       MIN_CLI_VERSION                    = "0.5.9"
       DISK_CONTENTS_BUCKET               = aws_s3_bucket.disk_contents.bucket
       OPERATIONS_TABLE                   = aws_dynamodb_table.operations.name

{gpu_dev-0.5.11 → gpu_dev-0.5.13}/terraform-gpu-devservers/main.tf RENAMED Viewed

@@ -255,6 +255,46 @@ locals {
           k8s_resource        = "nvidia.com/mig-3g.40gb"
           node_gpu_type       = "h100"
         }
+        # B200 MIG slices — virtual SKUs backed by ONE B200 node labelled with the custom
+        # mig_profile "b200-6full-2mig-balanced": GPUs 0-5 stay as full B200 (still reservable
+        # via --gpu-type b200), GPUs 6-7 get partitioned per-GPU into 2x1g.23gb + 1x2g.45gb +
+        # 1x3g.90gb. Per node: 6 full + 4 small + 2 medium + 2 large slices.
+        "b200-mig-1g" = {
+          instance_type       = null
+          instance_types      = null
+          instance_count      = 0
+          gpus_per_instance   = 4 # 2 partitioned GPUs * 2 slices each
+          use_placement_group = false
+          architecture        = "x86_64"
+          efa_network_cards   = 0
+          virtual             = true
+          k8s_resource        = "nvidia.com/mig-1g.23gb"
+          node_gpu_type       = "b200"
+        }
+        "b200-mig-2g" = {
+          instance_type       = null
+          instance_types      = null
+          instance_count      = 0
+          gpus_per_instance   = 2 # 2 partitioned GPUs * 1 slice each
+          use_placement_group = false
+          architecture        = "x86_64"
+          efa_network_cards   = 0
+          virtual             = true
+          k8s_resource        = "nvidia.com/mig-2g.45gb"
+          node_gpu_type       = "b200"
+        }
+        "b200-mig-3g" = {
+          instance_type       = null
+          instance_types      = null
+          instance_count      = 0
+          gpus_per_instance   = 2 # 2 partitioned GPUs * 1 slice each
+          use_placement_group = false
+          architecture        = "x86_64"
+          efa_network_cards   = 0
+          virtual             = true
+          k8s_resource        = "nvidia.com/mig-3g.90gb"
+          node_gpu_type       = "b200"
+        }
         "cpu-arm" = {
           instance_type       = "c7g.8xlarge"
           instance_types      = null

gpu_dev-0.5.13/terraform-gpu-devservers/mig-config.tf ADDED Viewed

@@ -0,0 +1,55 @@
+# mig-config.tf — fork the NVIDIA mig-parted-config ConfigMap so we can add custom profiles
+# without fighting NVIDIA ClusterPolicy's reconciliation of its default-mig-parted-config.
+#
+# The vendored mig-parted-config.yaml in this directory mirrors the upstream profiles plus our
+# additions (e.g. b200-6full-2mig-balanced). Helm is told to use this ConfigMap by name via
+# migManager.config.name in kubernetes.tf, so the GPU operator skips creating its default and
+# reads ours instead.
+resource "kubernetes_config_map" "gpu_dev_mig_parted_config" {
+  metadata {
+    name      = "gpu-dev-mig-parted-config"
+    namespace = "gpu-operator"
+    labels = {
+      "app.kubernetes.io/managed-by" = "terraform"
+      "app.kubernetes.io/part-of"    = "gpu-dev-servers"
+    }
+  }
+  data = {
+    "config.yaml" = file("${path.module}/mig-parted-config.yaml")
+  }
+  # The gpu-operator namespace is created by the helm release; depend on that so this ConfigMap
+  # lands AFTER the namespace exists.
+  depends_on = [helm_release.nvidia_gpu_operator]
+}
+# Optional declarative B200 MIG node label. Set b200_mig_node_name in tfvars (or override the
+# variable's default below) to dedicate a specific B200 node to the mixed profile. Empty string
+# means "no node currently labelled" — the existing all-disabled stays in effect.
+variable "b200_mig_node_name" {
+  description = "Hostname of the B200 node to label with nvidia.com/mig.config=b200-6full-2mig-balanced. Leave empty to skip."
+  type        = string
+  default     = ""
+}
+resource "kubernetes_labels" "b200_mig_node" {
+  count = var.b200_mig_node_name == "" ? 0 : 1
+  api_version = "v1"
+  kind        = "Node"
+  metadata {
+    name = var.b200_mig_node_name
+  }
+  labels = {
+    "nvidia.com/mig.config" = "b200-6full-2mig-balanced"
+  }
+  # Take ownership of the label even if another tool (kubectl, gpu-operator) set it.
+  force = true
+  depends_on = [kubernetes_config_map.gpu_dev_mig_parted_config]
+}

gpu-dev 0.5.11__tar.gz → 0.5.13__tar.gz

gpu-dev 0.5.11__tar.gz → 0.5.13__tar.gz