xpk 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +270 -8
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +3 -3
- xpk/commands/info.py +12 -12
- xpk/commands/job.py +12 -10
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +1 -1
- xpk/commands/workload.py +12 -6
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +5 -3
- xpk/core/cluster.py +9 -7
- xpk/core/cluster_private.py +5 -1
- xpk/core/commands.py +3 -3
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +1 -1
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +2 -1
- xpk/core/kueue.py +12 -4
- xpk/core/nap.py +20 -6
- xpk/core/nodepool.py +52 -19
- xpk/core/nodepool_test.py +82 -0
- xpk/core/resources.py +1 -7
- xpk/core/scheduling.py +1 -1
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +267 -1081
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +9 -10
- xpk/parser/cluster.py +67 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/METADATA +4 -1
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/RECORD +49 -44
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/WHEEL +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.10.1.dist-info → xpk-0.12.0.dist-info}/top_level.txt +0 -0
xpk/core/nodepool.py
CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from typing import List
 from ..utils.console import get_user_input, xpk_print
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,
@@ -32,6 +33,8 @@ from .resources import (
     create_or_update_cluster_configmap,
 )
 from .system_characteristics import AcceleratorType
+from functools import reduce
+from operator import mul
 
 CLOUD_PLATFORM_AUTH_SCOPE_URL = (
     '"https://www.googleapis.com/auth/cloud-platform"'
@@ -88,20 +91,26 @@ def run_gke_node_pool_create_command(
     xpk_print('Parsing capacity arguments failed!')
     return return_code
 
-  [14 removed lines; their content is not captured in this diff view]
+  desired_node_pool_count = (
+      1
+      if system.accelerator_type == AcceleratorType['GPU']
+      else args.num_slices
+  )
+  message = (
+      (
+          f'Creating 1 node pool with {args.num_nodes} nodes of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+      if system.accelerator_type == AcceleratorType['GPU']
+      else (
+          f'Creating {args.num_slices} node pool or pools of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+  )
+  xpk_print(message)
+  desired_node_pool_names = get_desired_node_pool_names(
+      existing_node_pool_names, args.cluster, desired_node_pool_count
+  )
 
   node_pools_to_remain = []
   delete_commands = []
@@ -275,20 +284,24 @@ def run_gke_node_pool_create_command(
         f' --host-maintenance-interval={args.host_maintenance_interval}'
         f' {capacity_args}'
         ' --enable-gvnic'
-        f' {args.custom_nodepool_arguments}'
     )
     if system.accelerator_type == AcceleratorType['TPU']:
       command += f' --node-version={gke_node_pool_version}'
+      topology_product = reduce(
+          mul, (int(x) for x in system.topology.split('x')), 1
+      )
      if capacity_type == CapacityType.FLEX_START:
         command += ' --num-nodes=0'
-      else:
+      elif topology_product > 1:
         command += f' --num-nodes={system.vms_per_slice}'
-        command += ' --placement-type=COMPACT --max-pods-per-node 15'
       command += (
          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
      )
-  [2 removed lines; their content is not captured in this diff view]
+
+      if topology_product > 1:
+        command += ' --placement-type=COMPACT --max-pods-per-node 15'
+        command += f' --tpu-topology={system.topology}'
+      command += f' {args.custom_tpu_nodepool_arguments}'
     elif system.accelerator_type == AcceleratorType['GPU']:
       subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
       if capacity_type == CapacityType.FLEX_START:
@@ -319,6 +332,8 @@ def run_gke_node_pool_create_command(
     if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
       command += ' --workload-metadata=GKE_METADATA'
 
+    command += f' {args.custom_nodepool_arguments}'
+
     task = f'NodepoolCreate-{node_pool_name}'
     create_commands.append(command)
     create_task_names.append(task)
@@ -594,3 +609,21 @@ def get_nodepool_workload_metadata_mode(
     return 1, None
 
   return 0, nodepool_WI_mode.strip()
+
+
+def get_desired_node_pool_names(
+    existing_node_pool_names: List[str],
+    cluster_name: str,
+    desired_node_pool_count: int,
+) -> List[str]:
+  cluster_node_pools = [
+      np
+      for np in existing_node_pool_names
+      if np.startswith(f'{cluster_name}-np-')
+  ]
+  result = set(cluster_node_pools[:desired_node_pool_count])
+  i = 0
+  while len(result) < desired_node_pool_count:
+    result.add(f'{cluster_name}-np-{i}')
+    i += 1
+  return list(result)
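For readers unfamiliar with `reduce(mul, ...)`: the TPU branch above multiplies the axes of the topology string to decide whether a node pool spans more than one host, and only then applies compact placement and `--tpu-topology`. A minimal standalone sketch of that computation, with made-up topology values:

```python
# Sketch of the topology-product check added above. The topology strings
# here ('2x2x4', '1x1') are illustrative examples, not xpk defaults.
from functools import reduce
from operator import mul


def topology_product(topology: str) -> int:
  # '2x2x4' -> 2 * 2 * 4 = 16; a single-host '1x1' topology yields 1.
  return reduce(mul, (int(x) for x in topology.split('x')), 1)


assert topology_product('2x2x4') == 16
assert topology_product('1x1') == 1  # no COMPACT placement, no --tpu-topology
```

Seeding `reduce` with `1` keeps the product well defined even for a single-axis topology.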
xpk/core/nodepool_test.py
ADDED
@@ -0,0 +1,82 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from xpk.core.nodepool import get_desired_node_pool_names
+
+CLUSTER_NAME = "running-cucumber"
+
+
+def node_pool_name(number: int) -> str:
+  return f"{CLUSTER_NAME}-np-{number}"
+
+
+def test_compute_desired_node_pool_names_with_desired_larger_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_desired_smaller_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(1)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=1,
+  )
+
+  expected_result = [node_pool_name(0)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=3,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_unknown_node_pools():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[
+          "unknown-node-pool",
+          node_pool_name(0),
+          node_pool_name(3),
+      ],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
xpk/core/resources.py
CHANGED
@@ -108,13 +108,7 @@ def create_cluster_configmaps(
   device_type = system.device_type
   if system.accelerator_type == AcceleratorType['GPU']:
     resources_data = f'{device_type}: "{int(args.num_nodes)}"'
-  elif (
-      not args.enable_pathways
-      and args.enable_autoprovisioning
-      and autoprovisioning_config
-  ):
-    # Currently autoprovisioning is not supported with Pathways.
-    # Auto provisioning will have variable topologies for a gke accelerator type.
+  elif args.enable_autoprovisioning and autoprovisioning_config:
     resources_data = (
         f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
     )
xpk/core/scheduling.py
CHANGED
@@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
     xpk_print(
-        f'
+        f'GKE Accelerator Type Check: {args.workload} is requesting'
         f' {system.gke_accelerator} but cluster only contains'
         f' {cluster_config_map.keys()}. '
     )
xpk/core/storage.py
CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
 import os
 from argparse import Namespace
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, cast
 
 import ruamel.yaml
 from google.cloud import storage as gcp_storage
@@ -95,17 +95,17 @@ class Storage:
     Args:
       data: A dictionary containing the Storage resource definition.
     """
-    metadata
+    metadata = data.get("metadata", {})
     self.name = metadata.get("name")
     spec = data.get("spec", {})
-    self.type
-    self.auto_mount
-    self.mount_point
-    self.readonly
-    self.manifest
-    self.pvc
-    self.pv
-    self.bucket
+    self.type = spec.get("type")
+    self.auto_mount = spec.get("auto_mount")
+    self.mount_point = spec.get("mount_point")
+    self.readonly = spec.get("readonly")
+    self.manifest = spec.get("manifest")
+    self.pvc = spec.get("pvc")
+    self.pv = spec.get("pv")
+    self.bucket = self._get_bucket()
@@ -117,9 +117,9 @@ class Storage:
     return [
         self.name,
         self.type,
-        self.auto_mount,
+        str(self.auto_mount),
         self.mount_point,
-        self.readonly,
+        str(self.readonly),
         self.manifest,
     ]
 
@@ -133,7 +133,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.csi.volume_handle
+      return cast(str, pv.spec.csi.volume_handle)
     except ApiException as e:
       xpk_print(
           f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
@@ -150,7 +150,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.mount_options
+      return cast(list[str], pv.spec.mount_options)
     except ApiException as e:
       xpk_print(
          f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
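The new `cast()` calls change nothing at runtime: `typing.cast` simply returns its second argument while telling a static type checker to treat the value as the stated type, which is useful here because the Kubernetes client models fields like `volume_handle` and `mount_options` as optional. A minimal sketch of the idiom (the handle value below is invented):

```python
# Why cast() is used above: it narrows the declared type for static
# checkers without performing any runtime check or conversion.
from typing import Optional, cast


def volume_handle_from(handle: Optional[str]) -> str:
  # The client types such fields as Optional; cast() asserts to the
  # checker that a concrete str is expected here. No validation occurs.
  return cast(str, handle)


print(volume_handle_from('projects/_/buckets/demo-bucket'))  # made-up handle
```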