PyPI - xpk - Versions diffs - 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

xpk/commands/batch.py +2 -3
xpk/commands/cluster.py +225 -73
xpk/commands/common.py +33 -1
xpk/commands/kjob_common.py +10 -1
xpk/commands/run.py +2 -3
xpk/commands/storage.py +14 -3
xpk/commands/workload.py +17 -15
xpk/core/blueprint/blueprint_generator.py +18 -18
xpk/core/cluster.py +119 -8
xpk/core/config.py +1 -1
xpk/core/filestore.py +2 -6
xpk/core/gcsfuse.py +22 -4
xpk/core/kjob.py +20 -13
xpk/core/kueue.py +30 -0
xpk/core/mtc.py +195 -0
xpk/core/network.py +23 -1
xpk/core/pathways.py +1 -1
xpk/core/resources.py +21 -0
xpk/core/workload.py +1 -1
xpk/core/workload_decorators/rdma_decorator.py +6 -10
xpk/core/workload_decorators/tcpx_decorator.py +179 -0
xpk/core/workload_decorators/tcpxo_decorator.py +15 -14
xpk/parser/cluster.py +573 -389
xpk/parser/storage.py +11 -2
xpk/utils/kubectl.py +4 -1
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/METADATA +134 -91
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/RECORD +31 -29
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0

xpk/core/blueprint/blueprint_generator.py CHANGED Viewed

@@ -52,20 +52,6 @@ cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
 cluster_toolkit_version = "v1.48.0"
-def get_subnetworks_for_a3mega(cluster_name: str) -> list[str]:
-  return [f"{cluster_name}-gpunet-{i}-subnet" for i in range(8)]
-def get_subnetworks_for_a3ultra(cluster_name: str) -> list[str]:
-  return [f"{cluster_name}-sub-1"] + [
-      f"{cluster_name}-rdma-sub-{i}" for i in range(8)
-  ]
-def get_subnetworks_for_a4() -> list[str]:
-  return ["gvnic-1"] + [f"rdma-{i}" for i in range(8)]
 class BlueprintGeneratorOutput:
   """BlueprintGeneratorOutput is a class containing fields with output blueprint file path and path to blueprint dependencies.
   Atributes:
@@ -194,13 +180,17 @@ class BlueprintGenerator:
     a3_megagpu_pool_0 = DeploymentModule(
         id="a3_megagpu_pool_0",
         source="modules/compute/gke-node-pool",
-        use=["gke_cluster", gpu_subnets_name, "group_placement_0"],
+        use=["gke_cluster", gpu_subnets_name],
         settings={
             "name": f"{cluster_name}-a3-megagpu-pool-0",
             "machine_type": system.gce_machine_type,
             "static_node_count": num_nodes,
             "zones": [zone],
-            "host_maintenance_interval": "PERIODIC",
+            "host_maintenance_interval": (
+                None
+                if capacity_type == CapacityType.RESERVATION
+                else "PERIODIC"
+            ),
             "reservation_affinity": self._getblock_reservation_affinity(
                 reservation
             ),
@@ -211,6 +201,9 @@ class BlueprintGenerator:
         },
         outputs=["instructions"],
     )
+    set_placement_policy = capacity_type != CapacityType.SPOT
+    tas_name = "topologyName: 'gke-default'" if set_placement_policy else ""
     num_chips = num_nodes * system.chips_per_vm
     workload = DeploymentModule(
         id="workload_component_install",
@@ -221,7 +214,10 @@ class BlueprintGenerator:
                 "install": True,
                 "version": "v0.10.0",  # TAS feature-gates is enabled in CT
                 "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {"num_chips": num_chips},
+                "config_template_vars": {
+                    "num_chips": num_chips,
+                    "tas_name": tas_name,
+                },
             },
             "jobset": {"install": True, "version": "v0.7.2"},
             "apply_manifests": [{
@@ -257,12 +253,16 @@ class BlueprintGenerator:
             primary_vpc,
             gpunets,
             gke_cluster,
-            group_placement_0,
             a3_megagpu_pool_0,
             workload,
             workload_configmap,
         ],
     )
+    if set_placement_policy:
+      a3_megagpu_pool_0.use.append(group_placement_0.id)
+      primary_group.modules.append(group_placement_0)
     a3_mega_blueprint = Blueprint(
         terraform_backend_defaults=self._getblock_terraform_backend(
             gcs_bucket, cluster_name, prefix

xpk/core/cluster.py CHANGED Viewed

@@ -14,28 +14,37 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
+import yaml
 from google.api_core.exceptions import PermissionDenied
 from google.cloud import resourcemanager_v3
 from kubernetes import client as k8s_client
 from kubernetes import config
 from kubernetes.client.exceptions import ApiException
-from .resources import get_cluster_system_characteristics
 from ..utils.console import xpk_exit, xpk_print
-from .capacity import H100_DEVICE_TYPE
+from .capacity import B200_DEVICE_TYPE, H100_DEVICE_TYPE, H200_DEVICE_TYPE
 from .commands import (
     run_command_for_value,
     run_command_with_updates,
     run_command_with_updates_retry,
 )
-from .gcloud_context import add_zone_and_project, get_gke_server_config, zone_to_region
+from .gcloud_context import (
+    add_zone_and_project,
+    get_gke_server_config,
+    zone_to_region,
+)
 from .nodepool import upgrade_gke_nodepools_version
+from .resources import get_cluster_system_characteristics
 from .system_characteristics import SystemCharacteristics
 JOBSET_VERSION = 'v0.8.0'
-PATHWAYS_JOB_VERSION = 'v0.1.0'
-INSTALLER_NCC_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
-INSTALLER_NCC_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
+PATHWAYS_JOB_VERSION = 'v0.1.1'
+INSTALLER_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-tcpx-installer.yaml'
+INSTALLER_NCCL_TCPXO = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpxo/nccl-tcpxo-installer.yaml'
+INSTALLER_NCCL_RDMA = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-rdma/nccl-rdma-installer.yaml'
+CONFIG_NCCL_TCPX = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/gpudirect-tcpx/nccl-config.yaml'
+NRI_DEVICE_INJECTOR = 'https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nri_device_injector/nri-device-injector.yaml'
+MGLRU_DISABLE = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/heads/main/examples/gke-a3-ultragpu/mglru-disable.yaml'
 DEFAULT_NAMESPACE = 'default'
 XPK_SA = 'xpk-sa'
@@ -112,9 +121,11 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
     0 if successful and 1 otherwise.
   """
   if system.device_type == H100_DEVICE_TYPE:
-    command = f'kubectl apply -f {INSTALLER_NCC_TCPX}'
+    command = f'kubectl apply -f {INSTALLER_NCCL_TCPX}'
+  elif system.device_type in [H200_DEVICE_TYPE, B200_DEVICE_TYPE]:
+    command = f'kubectl apply -f {INSTALLER_NCCL_RDMA}'
   else:
-    command = f'kubectl apply -f {INSTALLER_NCC_TCPXO}'
+    command = f'kubectl apply -f {INSTALLER_NCCL_TCPXO}'
   return_code = run_command_with_updates(
       command, 'Install NCCL Plugin On Cluster', args
@@ -126,9 +137,108 @@ def install_nccl_on_cluster(args, system: SystemCharacteristics) -> int:
     )
     return 1
+  if system.device_type == H100_DEVICE_TYPE:
+    command = f'kubectl apply -f {CONFIG_NCCL_TCPX}'
+    return_code = run_command_with_updates(
+        command, 'Install NCCL Config On Cluster', args
+    )
+    if return_code != 0:
+      xpk_print(
+          f'Install NCCL Config On Cluster request returned ERROR {return_code}'
+      )
+      return 1
+  return 0
+def disable_mglru_on_cluster(args) -> int:
+  """Disable MGLRU on the cluster.
+  Applies the manifest referenced by MGLRU_DISABLE with `kubectl apply`.
+  Args:
+    args: user provided arguments for running the command.
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  command = f'kubectl apply -f {MGLRU_DISABLE}'
+  return_code = run_command_with_updates(
+      command, 'Disable MGLRU On Cluster', args
+  )
+  if return_code != 0:
+    # Report the failing return code, as the sibling install_* helpers do.
+    xpk_print(
+        f'Disable MGLRU On Cluster request returned ERROR {return_code}'
+    )
+    return 1
   return 0
+def install_nri_on_cluster(args) -> int:
+  """Install NRI Device Injector on the cluster.
+  Applies the manifest referenced by NRI_DEVICE_INJECTOR with `kubectl apply`.
+  Args:
+    args: user provided arguments for running the command.
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  command = f'kubectl apply -f {NRI_DEVICE_INJECTOR}'
+  return_code = run_command_with_updates(
+      command, 'Install NRI Device Injector On Cluster', args
+  )
+  if return_code != 0:
+    xpk_print(
+        'Install NRI Device Injector On Cluster request returned ERROR'
+        f' {return_code}'
+    )
+    return 1
+  return 0
+def get_cluster_nodes_info(args) -> list[dict]:
+  """Get the descriptions of the cluster's nodes.
+  Runs `kubectl get nodes -o yaml` and parses its output. If the command
+  fails, xpk_exit is called with the command's error code.
+  Args:
+    args: user provided arguments for running the command.
+  Returns:
+    List of node objects taken from the `items` entry of the parsed YAML;
+    an empty list when the output is empty or has no items.
+  """
+  xpk_print("Getting cluster's info...")
+  command = 'kubectl get nodes -o yaml'
+  err_code, val = run_command_for_value(
+      command=command,
+      task='Get cluster nodes info',
+      global_args=args,
+  )
+  if err_code != 0:
+    xpk_exit(err_code)
+  # yaml.safe_load returns None for empty input, so guard before indexing.
+  data = yaml.safe_load(val) or {}
+  return data.get('items') or []  # pytype: disable=bad-return-type
+def count_nodes_on_cluster(args, system: SystemCharacteristics) -> int:
+  """Count cluster nodes whose accelerator label matches the given system.
+  Args:
+    args: user provided arguments for running the command.
+    system: system characteristics. Only H200_DEVICE_TYPE is supported; for
+      any other device type an error is printed and xpk_exit(1) is called.
+  Returns:
+    Number of nodes whose 'cloud.google.com/gke-accelerator' label equals
+    system.gke_accelerator.
+  """
+  # Validate the device type first so an unsupported system does not trigger
+  # a needless kubectl query.
+  if system.device_type != H200_DEVICE_TYPE:
+    xpk_print(
+        'Automatic node detection is not supported for device type:'
+        f' {system.device_type}'
+    )
+    xpk_exit(1)
+  accelerator_label = 'cloud.google.com/gke-accelerator'
+  num_nodes = 0
+  for node in get_cluster_nodes_info(args):
+    # Nodes without a labels map are skipped instead of raising KeyError.
+    labels = (node.get('metadata') or {}).get('labels') or {}
+    if (
+        accelerator_label in labels
+        and labels[accelerator_label] == system.gke_accelerator
+    ):
+      num_nodes += 1
+  return num_nodes
 def get_cluster_network(args) -> str:
   xpk_print("Getting cluster's VPC network...")
   cluster_network_cmd = (
@@ -621,6 +731,7 @@ def get_cluster_credentials(args) -> None:
   command = (
       'gcloud container clusters get-credentials'
       f' {args.cluster} --region={zone_to_region(args.zone)}'
+      ' --dns-endpoint'
       f' --project={args.project} &&'
       ' kubectl config view && kubectl config set-context --current'
       ' --namespace=default'

xpk/core/config.py CHANGED Viewed

@@ -24,7 +24,7 @@ from ..utils.console import xpk_print
 from .system_characteristics import AcceleratorType, SystemCharacteristics
 # This is the version for XPK PyPI package
-__version__ = 'v0.8.0'
+__version__ = 'v0.9.0'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

xpk/core/filestore.py CHANGED Viewed

@@ -200,9 +200,7 @@ class FilestoreClient:
     ] = f"projects/{self.project}/global/networks/{network}"
     return data
-  def create_pv(
-      self, name: str, vol: str, access_mode: str, mount_options: str
-  ) -> dict:
+  def create_pv(self, name: str, vol: str, access_mode: str) -> dict:
     """Create a yaml representing filestore PersistentVolume."""
     data = templates.load(FS_PV_PATH)
     data["metadata"]["name"] = get_pv_name(name)
@@ -217,7 +215,6 @@ class FilestoreClient:
         0
     ].ip_addresses[0]
     data["spec"]["csi"]["volumeAttributes"]["volume"] = vol
-    data["spec"]["mountOptions"] = mount_options.split(",")
     return data
   def create_pvc(self, name: str, access_mode: str) -> dict:
@@ -238,10 +235,9 @@ class FilestoreClient:
       vol: str,
       access_mode: str,
       network: str,
-      mount_options: str,
   ) -> list[dict]:
     self.load_instance()
-    pv = self.create_pv(name, vol, access_mode, mount_options)
+    pv = self.create_pv(name, vol, access_mode)
     pvc = self.create_pvc(name, access_mode)
     sc = self.create_sc(name, network)
     return [pv, pvc, sc]

xpk/core/gcsfuse.py CHANGED Viewed

@@ -20,11 +20,21 @@ FUSE_PV_PATH = "/../templates/fuse-pv.yaml"
 FUSE_PVC_PATH = "/../templates/fuse-pvc.yaml"
-def create_pv(name: str, size: int, bucket: str, mount_options: str) -> dict:
+def create_pv(
+    name: str,
+    size: int,
+    bucket: str,
+    mount_options: str,
+    prefetch_metadata: bool,
+) -> dict:
   data = templates.load(FUSE_PV_PATH)
   data["metadata"]["name"] = f"{name}-pv"
   data["spec"]["capacity"]["storage"] = f"{size}Gi"
   data["spec"]["csi"]["volumeHandle"] = bucket
+  if prefetch_metadata:
+    data["spec"]["csi"]["volumeAttributes"][
+        "gcsfuseMetadataPrefetchOnMount"
+    ] = "true"
   data["spec"]["mountOptions"] = mount_options.split(",")
   return data
@@ -38,16 +48,24 @@ def create_pvc(name: str, size: int) -> dict:
 def manifest(
-    name: str, bucket: str, size: int, mount_options: str
+    name: str,
+    bucket: str,
+    size: int,
+    mount_options: str,
+    prefetch_metadata: bool,
 ) -> list[dict]:
-  """Creates GCS FUSE manifest file.
+  """Creates GCS FUSE storage manifest file.
   Args:
       name (str): base name of the volumes
       bucket (str): name of the storage bucket
       size (str): size of the storage (in GB)
+      prefetch_metadata (bool): if set, then enables metadata pre-population when mounting the volume
       mount_options (str): comma-separated list of mountOptions for PersistentVolume
+  Returns:
+      list[dict]: list of manifests
   """
-  pv = create_pv(name, size, bucket, mount_options)
+  pv = create_pv(name, size, bucket, mount_options, prefetch_metadata)
   pvc = create_pvc(name, size)
   return [pv, pvc]

xpk/core/kjob.py CHANGED Viewed

@@ -22,16 +22,9 @@ from kubernetes import client as k8s_client
 from kubernetes.client import ApiClient
 from kubernetes.client.rest import ApiException
-from ..core.blueprint.blueprint_generator import (
-    get_subnetworks_for_a3mega,
-    get_subnetworks_for_a3ultra,
-    get_subnetworks_for_a4,
-)
-from ..core.capacity import H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
-from ..core.storage import GCS_FUSE_ANNOTATIONS, PARALLELSTORE_ANNOTATIONS
-from ..core.workload_decorators import rdma_decorator, tcpxo_decorator
 from ..utils import templates
 from ..utils.console import xpk_exit, xpk_print
+from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
 from .commands import (
     run_command_for_value,
@@ -46,12 +39,24 @@ from .config import (
     KJOB_SHELL_WORKING_DIRECTORY,
     XpkConfig,
 )
+from .network import get_cluster_subnetworks
 from .resources import (
     AcceleratorType,
     SystemCharacteristics,
     get_cluster_system_characteristics,
 )
-from .storage import get_auto_mount_gcsfuse_storages, get_auto_mount_storages, get_auto_mount_parallelstore_storages
+from .storage import (
+    GCS_FUSE_ANNOTATIONS,
+    PARALLELSTORE_ANNOTATIONS,
+    get_auto_mount_gcsfuse_storages,
+    get_auto_mount_parallelstore_storages,
+    get_auto_mount_storages,
+)
+from .workload_decorators import (
+    rdma_decorator,
+    tcpx_decorator,
+    tcpxo_decorator,
+)
 from .workload_decorators.tcpxo_decorator import get_tcpxo_deamon_entry
 KJOB_API_GROUP_NAME = "kjobctl.x-k8s.io"
@@ -164,8 +169,8 @@ Kueue_TAS_annotation = "kueue.x-k8s.io/podset-preferred-topology=cloud.google.co
 default_interface_annotation = "networking.gke.io/default-interface=eth0"
-def get_a4_pod_template_annotations() -> tuple[str, str]:
-  sub_networks = get_subnetworks_for_a4()
+def get_a4_pod_template_annotations(args) -> tuple[str, str]:
+  sub_networks = get_cluster_subnetworks(args)
   interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
       sub_networks
   )
@@ -177,7 +182,7 @@ def get_a4_pod_template_annotations() -> tuple[str, str]:
 def get_a3ultra_pod_template_annotations(args: Namespace) -> tuple[str, str]:
-  sub_networks = get_subnetworks_for_a3ultra(args.cluster)
+  sub_networks = get_cluster_subnetworks(args)
   interfaces_key, interfaces_value = rdma_decorator.get_interfaces_entry(
       sub_networks
   )
@@ -192,7 +197,7 @@ def get_a3mega_pod_template_annotations(
     args: Namespace,
 ) -> tuple[str, str, str]:
   """Adds or updates annotations in the Pod template."""
-  sub_networks = get_subnetworks_for_a3mega(args.cluster)
+  sub_networks = get_cluster_subnetworks(args)
   tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
   interfaces_key, interfaces_value = tcpxo_decorator.get_interfaces_entry(
       sub_networks
@@ -267,6 +272,8 @@ def create_app_profile_instance(
 def decorate_job_template_with_gpu(yml_string: str, gpu_type: str) -> str:
   job_spec = yaml.safe_load(yml_string)["template"]
+  if gpu_type == H100_DEVICE_TYPE:
+    job_spec = tcpx_decorator.decorate_kjob_template(job_spec)
   if gpu_type == H100_MEGA_DEVICE_TYPE:
     job_spec = tcpxo_decorator.decorate_kjob_template(job_spec)
   if gpu_type == H200_DEVICE_TYPE:

xpk/core/kueue.py CHANGED Viewed

@@ -21,6 +21,7 @@ from packaging.version import Version
 from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
+from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .commands import (
     run_command_for_value,
     run_command_with_updates,
@@ -45,6 +46,19 @@ WAIT_FOR_KUEUE_TIMEOUT = '5m'
 packaging.version.VERSION_PATTERN = r'^v\d+\.\d+\.\d+$'
+topology_yaml = """apiVersion: kueue.x-k8s.io/v1alpha1
+kind: Topology
+metadata:
+  name: "gke-default"
+spec:
+  levels:
+  - nodeLabel: "cloud.google.com/gce-topology-block"
+  - nodeLabel: "cloud.google.com/gce-topology-subblock"
+  - nodeLabel: "cloud.google.com/gce-topology-host"
+  - nodeLabel: "kubernetes.io/hostname"
+---
+"""
 cluster_set_crd_yaml = """apiVersion: kueue.x-k8s.io/v1beta1
 kind: ResourceFlavor
 metadata:
@@ -53,6 +67,7 @@ spec:
   nodeLabels:
     {accelerator_label}
     {machine_label}
+  {topology_label}
 ---
 {pw_resource_flavors}
 apiVersion: kueue.x-k8s.io/v1beta1
@@ -300,6 +315,14 @@ def install_kueue_crs(
       resource_type=resource_type,
       total_chips=total_chips,
   )
+  topology_label = ''
+  if system.device_type in [
+      H100_MEGA_DEVICE_TYPE,
+      H200_DEVICE_TYPE,
+      B200_DEVICE_TYPE,
+  ]:
+    topology_label = 'topologyName: "gke-default"'
   yml_string = cluster_set_crd_yaml.format(
       system=system,
       cluster_hardware_name=cluster_hardware_name,
@@ -309,6 +332,7 @@ def install_kueue_crs(
       machine_label=create_machine_label(
           system.accelerator_type, system, autoprovisioning_enabled
       ),
+      topology_label=topology_label,
       covered_resources_config=covered_resources_config,
       resource_type=AcceleratorTypeToAcceleratorCharacteristics[
           system.accelerator_type
@@ -318,6 +342,12 @@ def install_kueue_crs(
       cluster_queue_name=CLUSTER_QUEUE_NAME,
       local_queue_name=LOCAL_QUEUE_NAME,
   )
+  if system.device_type in [
+      H100_MEGA_DEVICE_TYPE,
+      H200_DEVICE_TYPE,
+      B200_DEVICE_TYPE,
+  ]:
+    yml_string = topology_yaml + yml_string
   tmp = write_tmp_file(yml_string)
   command = f'kubectl apply -f {str(tmp.file.name)}'

xpk/core/mtc.py ADDED Viewed

@@ -0,0 +1,195 @@
+"""
+Copyright 2024 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import requests
+import yaml
+from ..core.cluster import JOBSET_VERSION
+from ..core.cluster import setup_k8s_env
+from ..utils import templates
+from ..utils.console import xpk_exit
+from ..utils.console import xpk_print
+from ..utils.kubectl import apply_kubectl_manifest
+MTC_CPC_PATH = "/../templates/mtc-cpc.yaml"
+def create_mtc_cpc(
+    mtc_gcs_bucket: str,
+    mtc_machine_type: str,
+    mtc_toleration_key: str,
+    mtc_ramdisk_size: str,
+) -> dict:
+  """Build the MTC Checkpoint Configuration manifest from its template.
+  Loads the template at MTC_CPC_PATH and fills in the supplied values.
+  Args:
+    mtc_gcs_bucket: stored as spec.cloudStorageBucketName.
+    mtc_machine_type: stored under the 'node.kubernetes.io/instance-type'
+      key of spec.nodeSelector.
+    mtc_toleration_key: stored as the key of the first entry in
+      spec.tolerations.
+    mtc_ramdisk_size: stored as spec.inMemoryVolumeSize.
+  Returns:
+    The populated template data.
+  """
+  manifest = templates.load(MTC_CPC_PATH)
+  spec = manifest["spec"]
+  spec["cloudStorageBucketName"] = mtc_gcs_bucket
+  node_selector = spec["nodeSelector"]
+  node_selector["node.kubernetes.io/instance-type"] = mtc_machine_type
+  first_toleration = spec["tolerations"][0]
+  first_toleration["key"] = mtc_toleration_key
+  spec["inMemoryVolumeSize"] = mtc_ramdisk_size
+  return manifest
+def install_mtc_on_cluster(args, system) -> int:
+  """Install MTC on the cluster.
+  Validates the MTC arguments, applies the jobset manifest returned by
+  update_jobset_manifest (when there is one), then applies the MTC
+  Checkpoint Configuration.
+  Args:
+    args: user provided arguments for running the command. Reads
+      mtc_gcs_bucket, mtc_ramdisk_size and mtc_toleration_key; the bucket
+      (leading "gs://" stripped) and the toleration key (defaulted to
+      "google.com/tpu") are updated in place on args.
+    system: system related information; its gce_machine_type is passed as
+      the MTC machine type.
+  Returns:
+    return code of the command.
+  """
+  if args.mtc_gcs_bucket is None:
+    xpk_print("MTC GCS bucket is required.")
+    xpk_exit(1)
+  gcs_scheme = "gs://"
+  if args.mtc_gcs_bucket.startswith(gcs_scheme):
+    # Strip only the leading scheme; str.replace would remove every
+    # occurrence of "gs://" in the value.
+    args.mtc_gcs_bucket = args.mtc_gcs_bucket[len(gcs_scheme) :]
+  if args.mtc_ramdisk_size is None:
+    xpk_print("MTC ramdisk size is required.")
+    xpk_exit(1)
+  if args.mtc_toleration_key is None:
+    args.mtc_toleration_key = "google.com/tpu"
+  k8s_api_client = setup_k8s_env(args)
+  jobset_manifest = update_jobset_manifest()
+  if jobset_manifest is None:
+    # Nothing to apply: skip the jobset step instead of applying [None].
+    xpk_print(
+        "Updated jobset manifest is empty, not updating the jobset controller."
+    )
+  else:
+    xpk_print("Applying Jobset with MTC Configuration")
+    return_code = apply_kubectl_manifest(k8s_api_client, [jobset_manifest])
+    if return_code != 0:
+      return return_code
+  mtc_checkpoint_configuration_crd_data = create_mtc_cpc(
+      args.mtc_gcs_bucket,
+      system.gce_machine_type,
+      args.mtc_toleration_key,
+      args.mtc_ramdisk_size,
+  )
+  xpk_print("Applying MTC Checkpoint Configuration")
+  return_code = apply_kubectl_manifest(
+      k8s_api_client, [mtc_checkpoint_configuration_crd_data]
+  )
+  return return_code
+def update_jobset_manifest():
+  """Update the jobset manifest to increase the resources for the jobset controller manager.
+  Downloads manifests.yaml for JOBSET_VERSION, finds the apps/v1 Deployment
+  named "jobset-controller-manager" and, for its "manager" container, raises
+  the CPU request, memory request and memory limit to at least 1000m, 1Gi
+  and 2Gi respectively. On a download error a message is printed and
+  xpk_exit(1) is called.
+  Returns:
+    The updated Deployment document when at least one value was raised,
+    otherwise None.
+  """
+  manifest_url = f"https://github.com/kubernetes-sigs/jobset/releases/download/{JOBSET_VERSION}/manifests.yaml"
+  manifest_content = None
+  # Fetch the manifest content
+  try:
+    response = requests.get(manifest_url, timeout=10)
+    response.raise_for_status()  # Raise an exception for HTTP errors
+    manifest_content = response.text
+  except requests.exceptions.Timeout as e:
+    xpk_print(
+        f"Error: Request to {manifest_url} timed out after 10 seconds: {e}"
+    )
+    xpk_exit(1)
+  except requests.exceptions.RequestException as e:
+    xpk_print(f"Error fetching manifest from {manifest_url}: {e}")
+    xpk_exit(1)
+  if manifest_content is None:
+    xpk_print("Manifest content not found.")
+    xpk_exit(1)
+  # Load all YAML documents from the manifest
+  yaml_data_list = list(yaml.safe_load_all(manifest_content))
+  # (section, resource, minimum value, value assumed when the entry is absent)
+  minimums = (
+      ("requests", "cpu", "1000m", "0m"),
+      ("requests", "memory", "1Gi", "0Mi"),
+      ("limits", "memory", "2Gi", "0Mi"),
+  )
+  update_manifest = False
+  for yaml_data in yaml_data_list:
+    is_controller_deployment = (
+        yaml_data
+        and yaml_data.get("apiVersion") == "apps/v1"
+        and yaml_data.get("kind") == "Deployment"
+        and yaml_data.get("metadata", {}).get("name")
+        == "jobset-controller-manager"
+    )
+    if not is_controller_deployment:
+      continue
+    # Found the Deployment, now modify the resources
+    containers = yaml_data["spec"]["template"]["spec"]["containers"]
+    for container in containers:
+      if container["name"] != "manager":
+        continue
+      # Create missing resources/requests/limits sections rather than
+      # raising KeyError when the container does not declare them.
+      resources = container.get("resources") or {}
+      container["resources"] = resources
+      for section, resource, minimum, absent in minimums:
+        values = resources.get(section) or {}
+        resources[section] = values
+        current = values.get(resource, absent)
+        if parse_resource_value(current) < parse_resource_value(minimum):
+          values[resource] = minimum
+          update_manifest = True
+      break
+    if update_manifest:
+      xpk_print("Jobset controller updation required.")
+      return yaml_data
+  xpk_print("Jobset controller no updation required.")
+  return None
+def parse_resource_value(value) -> int:
+  """Convert a Kubernetes-style resource quantity to an integer for comparison.
+  A trailing "m" is dropped and the number returned unchanged. Binary memory
+  suffixes are expressed in Mi: "Ki" is floor-divided by 1024, "Mi" is
+  returned unchanged, "Gi" is multiplied by 1024 and "Ti" by 1024 * 1024.
+  A value without a recognized suffix is passed to int() as-is.
+  NOTE(review): a bare number is not rescaled, so it is only comparable with
+  values written in the same unit (cpu "2" parses to 2, not 2000) — confirm
+  callers never mix the two forms.
+  Args:
+    value: the quantity, e.g. "1000m", "512Mi" or "2Gi". Non-string values
+      (a YAML loader may yield an int for `cpu: 1`) are converted with str().
+  Returns:
+    The parsed integer.
+  Raises:
+    ValueError: if the numeric part is not an integer.
+  """
+  text = str(value).strip()
+  if text.endswith("m"):
+    return int(text[:-1])
+  if text.endswith("Ki"):
+    return int(text[:-2]) // 1024
+  if text.endswith("Mi"):
+    return int(text[:-2])
+  if text.endswith("Gi"):
+    return int(text[:-2]) * 1024
+  if text.endswith("Ti"):
+    return int(text[:-2]) * 1024 * 1024
+  return int(text)

xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl