PyPI - xpk - Versions diffs - 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl - Mend

xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)

integration/README.md +19 -0
xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
xpk/blueprints/a3mega/storage_crd.yaml +52 -0
xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
xpk/blueprints/a4/storage_crd.yaml +52 -0
xpk/commands/cluster.py +33 -12
xpk/commands/cluster_gcluster_test.py +5 -1
xpk/commands/cluster_test.py +125 -0
xpk/commands/config.py +3 -3
xpk/commands/inspector.py +5 -3
xpk/commands/kind.py +2 -0
xpk/commands/managed_ml_diagnostics.py +249 -0
xpk/commands/managed_ml_diagnostics_test.py +146 -0
xpk/commands/workload.py +124 -139
xpk/commands/workload_test.py +160 -118
xpk/core/blueprint/blueprint_generator.py +3 -0
xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
xpk/core/blueprint/testing/data/a4.yaml +185 -0
xpk/core/capacity.py +2 -0
xpk/core/cluster.py +18 -47
xpk/core/cluster_test.py +76 -1
xpk/core/config.py +81 -7
xpk/core/config_test.py +67 -11
xpk/core/docker_container.py +3 -1
xpk/core/docker_image.py +10 -6
xpk/core/docker_resources.py +1 -10
xpk/core/kjob.py +17 -16
xpk/core/kueue_manager.py +13 -19
xpk/core/kueue_manager_test.py +27 -1
xpk/core/nap.py +13 -14
xpk/core/nodepool.py +17 -15
xpk/core/nodepool_test.py +25 -4
xpk/core/pathways.py +23 -0
xpk/core/pathways_test.py +57 -0
xpk/core/resources.py +84 -27
xpk/core/scheduling.py +128 -132
xpk/core/scheduling_test.py +215 -2
xpk/core/system_characteristics.py +179 -0
xpk/core/system_characteristics_test.py +49 -1
xpk/core/telemetry.py +4 -4
xpk/core/telemetry_test.py +9 -9
xpk/core/vertex.py +4 -3
xpk/core/workload_decorators/tcpx_decorator.py +5 -1
xpk/main.py +2 -0
xpk/parser/cluster.py +22 -88
xpk/parser/cluster_test.py +41 -0
xpk/parser/common.py +84 -0
xpk/parser/storage.py +10 -0
xpk/parser/storage_test.py +47 -0
xpk/parser/workload.py +14 -41
xpk/parser/workload_test.py +2 -48
xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
xpk/utils/feature_flags.py +3 -0
xpk/utils/validation.py +2 -2
xpk-0.16.0.dist-info/METADATA +127 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
xpk-0.15.0.dist-info/METADATA +0 -1666
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0

integration/README.md ADDED Viewed

@@ -0,0 +1,19 @@
+This folder contains integration tests.
+To run them, the following environment variables must be set:
+```bash
+export PROJECT_ID=...
+export REGION=...
+export ZONE=...
+export AUTH_CIDR=...
+export DEPLOYMENT_DIR=...
+export CLUSTER_NAME=...
+export GCLOUD_CFG_PATH=...
+```
+To run tests:
+```bash
+pytest src/integration
+```

xpk/blueprints/a3mega/config-map.yaml.tftpl ADDED Viewed

@@ -0,0 +1,15 @@
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: ${resource_config_name}
+data:
+  h100-mega-80gb-8: "${num_nodes}"
+---
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: ${cluster_config_name}
+data:
+  capacity_type: "${capacity_type}"
+  reservation_id: "${reservation}"
+  provisioner: gcluster

xpk/blueprints/a3mega/storage_crd.yaml ADDED Viewed

@@ -0,0 +1,52 @@
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: storages.xpk.x-k8s.io
+spec:
+  group: xpk.x-k8s.io
+  versions:
+    - name: v1
+      served: true
+      storage: true
+      schema:
+        openAPIV3Schema:
+          type: object
+          properties:
+            spec:
+              type: object
+              properties:
+                type:
+                  type: string
+                cluster:
+                  type: string
+                auto_mount:
+                  type: boolean
+                mount_point:
+                  type: string
+                readonly:
+                  type: boolean
+                manifest:
+                  type: string
+                pv:
+                  type: string
+                pvc:
+                  type: string
+              required:
+                - type
+                - cluster
+                - auto_mount
+                - mount_point
+                - readonly
+                - manifest
+                - pvc
+                - pv
+          x-kubernetes-validations:
+            - message: Value is immutable
+              rule: self == oldSelf
+  scope: Cluster
+  names:
+    plural: storages
+    singular: storage
+    kind: Storage
+    shortNames:
+      - stg

xpk/blueprints/a3ultra/config-map.yaml.tftpl ADDED Viewed

@@ -0,0 +1,15 @@
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: ${resource_config_name}
+data:
+  h200-141gb-8: "${num_nodes}"
+---
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: ${cluster_config_name}
+data:
+  capacity_type: "${capacity_type}"
+  reservation_id: "${reservation}"
+  provisioner: gcluster

xpk/blueprints/a3ultra/mlgru-disable.yaml ADDED Viewed

@@ -0,0 +1,59 @@
+# Copyright 2024 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: disable-mglru
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      app: disable-mglru
+  template:
+    metadata:
+      labels:
+        app: disable-mglru
+    spec:
+      hostNetwork: true
+      tolerations:
+      - operator: "Exists"
+        key: nvidia.com/gpu
+      containers:
+      - name: disable-mglru
+        image: alpine:latest
+        command: ["/bin/sh"]
+        securityContext:
+          privileged: true
+        args:
+        - -c
+        - |
+          echo n | tee /sys/kernel/mm/lru_gen/enabled
+          sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
+          sleep infinity
+        volumeMounts:
+        - name: sys-kernel-mm-lru-gen
+          mountPath: /sys/kernel/mm/lru_gen
+      # Expose the host's /sys/kernel/mm/lru_gen in the container so the write to lru_gen/enabled reaches the host.
+      volumes:
+      - name: sys-kernel-mm-lru-gen
+        hostPath:
+          path: /sys/kernel/mm/lru_gen

xpk/blueprints/a3ultra/nccl-installer.yaml ADDED Viewed

@@ -0,0 +1,95 @@
+# Copyright 2024 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nccl-rdma-installer
+  namespace: kube-system
+  labels:
+    k8s-app: nccl-rdma-installer
+spec:
+  selector:
+    matchLabels:
+      k8s-app: nccl-rdma-installer
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        name: nccl-rdma-installer
+        k8s-app: nccl-rdma-installer
+    spec:
+      priorityClassName: system-node-critical
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: cloud.google.com/gke-accelerator
+                operator: In
+                values:
+                - nvidia-h200-141gb
+      tolerations:
+      - operator: "Exists"
+      hostNetwork: true
+      hostPID: true
+      volumes:
+      - name: library-dir-host
+        hostPath:
+          path: /home/kubernetes/bin/nvidia/lib64
+          type: DirectoryOrCreate
+      - name: gib
+        hostPath:
+          path: /home/kubernetes/bin/gib
+      initContainers:
+      - name: disable-log-martian
+        image: alpine:latest
+        command: ["/bin/sh"]
+        securityContext:
+          privileged: true
+        args:
+        - -c
+        - |
+          sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
+      - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3
+        name: nccl-rdma-installer
+        resources:
+          requests:
+            cpu: 150m
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: library-dir-host
+          mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
+        - name: gib
+          mountPath: /usr/local/home/kubernetes/bin/gib
+        command: ["/bin/sh", "-c"]
+        args:
+        - |
+          set -ex
+          /scripts/container_entry.sh install --install-nccl
+          cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
+          cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
+          echo "installation finishes"
+      containers:
+      - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
+        name: pause

xpk/blueprints/a3ultra/storage_crd.yaml ADDED Viewed

@@ -0,0 +1,52 @@
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: storages.xpk.x-k8s.io
+spec:
+  group: xpk.x-k8s.io
+  versions:
+    - name: v1
+      served: true
+      storage: true
+      schema:
+        openAPIV3Schema:
+          type: object
+          properties:
+            spec:
+              type: object
+              properties:
+                type:
+                  type: string
+                cluster:
+                  type: string
+                auto_mount:
+                  type: boolean
+                mount_point:
+                  type: string
+                readonly:
+                  type: boolean
+                manifest:
+                  type: string
+                pv:
+                  type: string
+                pvc:
+                  type: string
+              required:
+                - type
+                - cluster
+                - auto_mount
+                - mount_point
+                - readonly
+                - manifest
+                - pvc
+                - pv
+          x-kubernetes-validations:
+            - message: Value is immutable
+              rule: self == oldSelf
+  scope: Cluster
+  names:
+    plural: storages
+    singular: storage
+    kind: Storage
+    shortNames:
+      - stg

xpk/blueprints/a4/config-map.yaml.tftpl ADDED Viewed

@@ -0,0 +1,15 @@
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: ${resource_config_name}
+data:
+  b200-8: "${num_nodes}"
+---
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: ${cluster_config_name}
+data:
+  capacity_type: "${capacity_type}"
+  reservation_id: "${reservation}"
+  provisioner: gcluster

xpk/blueprints/a4/nccl-rdma-installer-a4.yaml ADDED Viewed

@@ -0,0 +1,66 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nccl-rdma-installer
+  namespace: kube-system
+  labels:
+    k8s-app: nccl-rdma-installer
+spec:
+  selector:
+    matchLabels:
+      k8s-app: nccl-rdma-installer
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        name: nccl-rdma-installer
+        k8s-app: nccl-rdma-installer
+    spec:
+      priorityClassName: system-node-critical
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: cloud.google.com/gke-accelerator
+                    operator: In
+                    values:
+                      - nvidia-b200
+      tolerations:
+        - operator: "Exists"
+      hostNetwork: true
+      hostPID: true
+      volumes:
+        - name: library-dir-host
+          hostPath:
+            path: /home/kubernetes/bin/nvidia/lib64
+            type: DirectoryOrCreate
+        - name: gib
+          hostPath:
+            path: /home/kubernetes/bin/gib
+      initContainers:
+        - image: us-docker.pkg.dev/kernel-net-team/clouda4-nccl-dev/nccl-plugin-gib-diagnostic:v1.0.3-b200
+          name: nccl-rdma-installer
+          resources:
+            requests:
+              cpu: 150m
+          securityContext:
+            privileged: true
+          volumeMounts:
+            - name: library-dir-host
+              mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
+            - name: gib
+              mountPath: /usr/local/home/kubernetes/bin/gib
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -ex
+              /scripts/container_entry.sh install --install-nccl
+              cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
+              cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
+              # ibv_devinfo || exit 1
+              echo "installation finishes"
+      containers:
+        - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
+          name: pause

xpk/blueprints/a4/storage_crd.yaml ADDED Viewed

@@ -0,0 +1,52 @@
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: storages.xpk.x-k8s.io
+spec:
+  group: xpk.x-k8s.io
+  versions:
+    - name: v1
+      served: true
+      storage: true
+      schema:
+        openAPIV3Schema:
+          type: object
+          properties:
+            spec:
+              type: object
+              properties:
+                type:
+                  type: string
+                cluster:
+                  type: string
+                auto_mount:
+                  type: boolean
+                mount_point:
+                  type: string
+                readonly:
+                  type: boolean
+                manifest:
+                  type: string
+                pv:
+                  type: string
+                pvc:
+                  type: string
+              required:
+                - type
+                - cluster
+                - auto_mount
+                - mount_point
+                - readonly
+                - manifest
+                - pvc
+                - pv
+          x-kubernetes-validations:
+            - message: Value is immutable
+              rule: self == oldSelf
+  scope: Cluster
+  names:
+    plural: storages
+    singular: storage
+    kind: Storage
+    shortNames:
+      - stg

xpk/commands/cluster.py CHANGED Viewed

@@ -18,7 +18,8 @@ from tabulate import tabulate
 from ..utils.feature_flags import FeatureFlags
 from ..utils.versions import ReleaseChannel
-from ..core.capacity import H100_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE, get_reservation_deployment_type
+from ..core.pathways import get_pathways_machine_types
+from ..core.capacity import H100_DEVICE_TYPE, get_reservation_deployment_type
 from ..core.cluster import (
     get_all_clusters_programmatic,
     get_cluster_credentials,
@@ -27,7 +28,6 @@ from ..core.cluster import (
     set_jobset_on_cluster,
     set_pathways_job_on_cluster,
     setup_k8s_env,
-    disable_mglru_on_cluster,
     count_nodes_on_cluster,
     update_cluster_with_gcpfilestore_driver_if_necessary,
     update_cluster_with_gcsfuse_driver_if_necessary,
@@ -84,6 +84,7 @@ from jinja2 import Environment, FileSystemLoader
 from ..utils.templates import get_templates_absolute_path
 import shutil
 import os
+from .managed_ml_diagnostics import install_mldiagnostics_prerequisites
 CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
@@ -210,6 +211,25 @@ def _validate_cluster_create_args(args, system: SystemCharacteristics):
   if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing:
     validate_sub_slicing_system(system)
     _validate_sub_slicing_reservation(args)
+  if args.enable_pathways:
+    _validate_pathways_machine(args)
+def _validate_pathways_machine(args):
+  return_code, result = get_pathways_machine_types(
+      project=args.project, zone=args.zone
+  )
+  if return_code != 0:
+    xpk_print('Error: Unable to retrieve available pathways machine types')
+    xpk_exit(1)
+  if args.pathways_gce_machine_type not in result:
+    xpk_print(
+        'Error: Invalid --pathways-gce-machine-type. Specify machine type that'
+        ' has at least 100GB of memory and at least 49 CPUs.'
+    )
+    xpk_print(f'Available machine types: {", ".join(result)}')
+    xpk_exit(1)
 def _validate_sub_slicing_reservation(args):
@@ -261,11 +281,10 @@ def cluster_create(args) -> None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
-  _validate_cluster_create_args(args, system)
   xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
   add_zone_and_project(args)
+  _validate_cluster_create_args(args, system)
   _log_cluster_create_telemetry(args)
   release_channel = (
@@ -422,6 +441,13 @@ def cluster_create(args) -> None:
       # pylint: disable=line-too-long
       f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
   )
+  if args.managed_mldiagnostics:
+    return_code = install_mldiagnostics_prerequisites()
+    if return_code != 0:
+      xpk_print('Installation of MLDiagnostics failed.')
+      xpk_exit(return_code)
   xpk_exit(0)
@@ -979,7 +1005,7 @@ def update_coredns() -> int:
   # 6. Scale up coredns and verify readiness
   scale_up_coredns(replicas=15)
-  verify_coredns_readiness(timeout=120)
+  verify_coredns_readiness()
   xpk_print('The CoreDNS setup process has been completed.')
@@ -1220,7 +1246,8 @@ def run_gke_cluster_create_command(
   if args.enable_lustre_csi_driver:
     addons.append('LustreCsiDriver')
-    command += ' --enable-legacy-lustre-port'
+    if args.enable_legacy_lustre_port:
+      command += ' --enable-legacy-lustre-port'
   if hasattr(args, 'enable_mtc') and args.enable_mtc:
     addons.append('HighScaleCheckpointing')
@@ -1336,12 +1363,6 @@ def prepare_gpus(system: SystemCharacteristics):
     if install_nri_code != 0:
       xpk_exit(install_nri_code)
-  if system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
-    xpk_print('Disabling MGLRU')
-    err_code = disable_mglru_on_cluster()
-    if err_code > 0:
-      xpk_exit(err_code)
 def _log_cluster_create_telemetry(args) -> None:
   if FeatureFlags.TELEMETRY_ENABLED:

xpk/commands/cluster_gcluster_test.py CHANGED Viewed

@@ -20,7 +20,7 @@ import pytest
 from xpk.commands.cluster_gcluster import cluster_create
 from xpk.core.kueue_manager import KueueConfig
-from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics
+from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
 from xpk.utils.versions import ReleaseChannel
@@ -97,6 +97,8 @@ def test_install_kueue_standard(
       accelerator_type=AcceleratorType.GPU,
       device_type="h100-mega-80gb-8",
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+      gpu_config=GpuConfig(requires_topology=True),
   )
   mock_cluster_create_deps["get_system_characteristics"].return_value = (
       mock_system,
@@ -148,6 +150,8 @@ def test_install_kueue_with_autoprovisioning(
       accelerator_type=AcceleratorType.GPU,
       device_type="h100-mega-80gb-8",
       supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+      gpu_config=GpuConfig(requires_topology=True),
   )
   mock_cluster_create_deps["get_system_characteristics"].return_value = (
       mock_system,

xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl