xpk-0.14.4-py3-none-any.whl → xpk-0.16.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integration/README.md +19 -0
- integration/gcluster_a3mega_test.py +11 -0
- integration/gcluster_a3ultra_test.py +11 -0
- integration/gcluster_a4_test.py +11 -0
- xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3mega/storage_crd.yaml +52 -0
- xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
- xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
- xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
- xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
- xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
- xpk/blueprints/a4/storage_crd.yaml +52 -0
- xpk/commands/cluster.py +89 -32
- xpk/commands/cluster_gcluster.py +25 -5
- xpk/commands/cluster_gcluster_test.py +16 -3
- xpk/commands/cluster_test.py +353 -7
- xpk/commands/config.py +3 -5
- xpk/commands/inspector.py +5 -3
- xpk/commands/kind.py +3 -1
- xpk/commands/managed_ml_diagnostics.py +249 -0
- xpk/commands/managed_ml_diagnostics_test.py +146 -0
- xpk/commands/storage.py +8 -10
- xpk/commands/workload.py +143 -142
- xpk/commands/workload_test.py +160 -118
- xpk/core/blueprint/blueprint_generator.py +73 -33
- xpk/core/blueprint/blueprint_test.py +9 -0
- xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
- xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
- xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
- xpk/core/blueprint/testing/data/a4.yaml +185 -0
- xpk/core/capacity.py +48 -8
- xpk/core/capacity_test.py +32 -1
- xpk/core/cluster.py +55 -104
- xpk/core/cluster_test.py +170 -0
- xpk/core/commands.py +4 -10
- xpk/core/config.py +88 -7
- xpk/core/config_test.py +67 -11
- xpk/core/docker_container.py +3 -1
- xpk/core/docker_image.py +10 -6
- xpk/core/docker_resources.py +1 -10
- xpk/core/gcloud_context.py +18 -12
- xpk/core/gcloud_context_test.py +111 -1
- xpk/core/kjob.py +17 -19
- xpk/core/kueue_manager.py +205 -51
- xpk/core/kueue_manager_test.py +158 -4
- xpk/core/nap.py +13 -14
- xpk/core/nodepool.py +37 -43
- xpk/core/nodepool_test.py +42 -19
- xpk/core/pathways.py +23 -0
- xpk/core/pathways_test.py +57 -0
- xpk/core/resources.py +84 -27
- xpk/core/scheduling.py +144 -133
- xpk/core/scheduling_test.py +298 -6
- xpk/core/system_characteristics.py +256 -19
- xpk/core/system_characteristics_test.py +128 -5
- xpk/core/telemetry.py +263 -0
- xpk/core/telemetry_test.py +211 -0
- xpk/core/vertex.py +4 -3
- xpk/core/workload_decorators/tcpx_decorator.py +5 -1
- xpk/main.py +33 -13
- xpk/parser/cluster.py +40 -67
- xpk/parser/cluster_test.py +83 -3
- xpk/parser/common.py +84 -0
- xpk/parser/storage.py +10 -0
- xpk/parser/storage_test.py +47 -0
- xpk/parser/workload.py +14 -29
- xpk/parser/workload_test.py +3 -49
- xpk/telemetry_uploader.py +29 -0
- xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
- xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
- xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
- xpk/utils/console.py +41 -10
- xpk/utils/console_test.py +106 -0
- xpk/utils/feature_flags.py +10 -1
- xpk/utils/file.py +4 -1
- xpk/utils/topology.py +4 -0
- xpk/utils/user_agent.py +35 -0
- xpk/utils/user_agent_test.py +44 -0
- xpk/utils/user_input.py +48 -0
- xpk/utils/user_input_test.py +92 -0
- xpk/utils/validation.py +2 -13
- xpk/utils/versions.py +31 -0
- xpk-0.16.0.dist-info/METADATA +127 -0
- xpk-0.16.0.dist-info/RECORD +168 -0
- xpk-0.14.4.dist-info/METADATA +0 -1645
- xpk-0.14.4.dist-info/RECORD +0 -139
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.14.4.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0
xpk/core/scheduling.py
CHANGED

```diff
@@ -14,59 +14,63 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+from enum import Enum
+
+from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled
+from ..utils.feature_flags import FeatureFlags
+from ..utils.topology import get_slice_topology_level
 from ..utils.console import xpk_print
+from ..utils.topology import is_topology_valid
 from ..utils.execution_context import is_dry_run
 from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
-from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
 from .system_characteristics import (
+    SUB_SLICING_TOPOLOGIES,
     AcceleratorType,
-    AcceleratorTypeToAcceleratorCharacteristics,
     SystemCharacteristics,
+    create_accelerator_label,
+    create_machine_label,
 )
+from packaging.version import Version
 
+_SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
 
-def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
-  """Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster).
 
-
-
-
+class WorkloadScheduling(Enum):
+  UNAVAILABLE = 0
+  AVAILABLE = 1
+  SUB_SLICING_AVAILABLE = 2
+
+
+def check_if_workload_can_schedule(
+    args,
+    workload_system: SystemCharacteristics,
+    cluster_system: SystemCharacteristics | None,
+    resources_config_map: dict[str, str] | None,
+) -> WorkloadScheduling:
+  """Check if workload can schedule based on the cluster resources (tpu_type and maximum VM in cluster).
 
   Returns:
-    returns
+    returns WorkloadScheduling describing scheduling option.
   """
-
-
+  if is_dry_run() and not cluster_system:
+    xpk_print('Skipping workload scheduling validation in dry run.')
+    return WorkloadScheduling.AVAILABLE
 
-
-  if cluster_config_map is None:
+  if resources_config_map is None:
     xpk_print(
-
-
+        "Skipping workload scheduling validation, because there's no Resources"
+        ' ConfigMap in the cluster.'
     )
-    return
-
-  if is_dry_run():
-    return True
+    return WorkloadScheduling.AVAILABLE
 
-
-  missing_gke_accelerator_type = False
-  if not cluster_config_map.get(system.gke_accelerator):
-    xpk_print(
-        f'GKE Accelerator Type Check: {args.workload} is requesting'
-        f' {system.gke_accelerator} but cluster only contains'
-        f' {cluster_config_map.keys()}. '
-    )
-    missing_gke_accelerator_type = True
-  elif (
-      cluster_config_map[system.gke_accelerator]
-      == AUTOPROVISIONING_CONFIG_VALUE
-  ):
+  if _is_cluster_set_up_for_nap(workload_system, resources_config_map):
     # Run total chip check when in autoprovisioning mode.
     max_chips_in_cluster = int(
-
+        resources_config_map[AUTOPROVISIONING_CONFIG_MAXIMUM_KEY]
+    )
+    num_chips_in_workload = get_total_chips_requested_from_args(
+        args, workload_system
     )
-    num_chips_in_workload = get_total_chips_requested_from_args(args, system)
 
     if num_chips_in_workload > max_chips_in_cluster:
       xpk_print(
@@ -75,44 +79,100 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
           ' Resize the cluster to support more chips with'
           ' `xpk cluster create --autoprovisioning-max-chips=X ...`'
      )
-      return
-    return
+      return WorkloadScheduling.UNAVAILABLE
+    return WorkloadScheduling.AVAILABLE
+
+  if workload_system.device_type in resources_config_map:
+    if _check_workload_size_fits(
+        args,
+        workload_system,
+        max_vm_in_cluster=int(
+            resources_config_map[workload_system.device_type]
+        ),
+    ):
+      return WorkloadScheduling.AVAILABLE
+    else:
+      return WorkloadScheduling.UNAVAILABLE
+
+  if _check_sub_slicing_availability(
+      workload_system=workload_system, cluster_system=cluster_system
+  ):
+    assert cluster_system
+    if _check_workload_size_fits(
+        args,
+        workload_system,
+        max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
+    ):
+      return WorkloadScheduling.SUB_SLICING_AVAILABLE
+    else:
+      return WorkloadScheduling.UNAVAILABLE
+
+  xpk_print(
+      'Workload scheduling validation failed. XPK will not create the workload'
+      f' {args.workload}.'
+  )
+  return WorkloadScheduling.UNAVAILABLE
 
-  # Check for device type
-  missing_device_type = False
-  device_type = system.device_type
-  if device_type not in cluster_config_map:
-    xpk_print(
-        f'Device Type Check: {args.workload} is requesting {device_type} but '
-        f'cluster only contains {cluster_config_map.keys()}. '
-    )
-    missing_device_type = True
 
-
+def _is_cluster_set_up_for_nap(
+    workload_system: SystemCharacteristics, resources_config_map: dict[str, str]
+) -> bool:
+  return (
+      resources_config_map.get(workload_system.gke_accelerator, None)
+      == AUTOPROVISIONING_CONFIG_VALUE
+  )
+
+
+def _check_workload_size_fits(
+    args,
+    workload_system: SystemCharacteristics,
+    max_vm_in_cluster: int,
+) -> bool:
+  if workload_system.accelerator_type == AcceleratorType.GPU:
+    vm_required_by_workload = args.num_nodes
+  else:
+    vm_required_by_workload = args.num_slices * workload_system.vms_per_slice
+
+  if vm_required_by_workload > max_vm_in_cluster:
     xpk_print(
-        '
-        f'
+        f'{args.workload} is requesting {args.num_slices} slice/slices of'
+        f' {workload_system.device_type}, which is'
+        f' {vm_required_by_workload} VMs, but the cluster only contains'
+        f' {max_vm_in_cluster} VMs of {workload_system.device_type}. XPK will'
+        ' not create this workload.'
     )
     return False
-  else:
-    # Check if the size of the workload will fit in the cluster.
-    max_vm_in_cluster = int(cluster_config_map[device_type])
-    if system.accelerator_type == AcceleratorType.GPU:
-      vm_required_by_workload = args.num_nodes
-    else:
-      vm_required_by_workload = args.num_slices * system.vms_per_slice
-    if vm_required_by_workload > max_vm_in_cluster:
-      xpk_print(
-          f'{args.workload} is requesting {args.num_slices} slice/slices of'
-          f' {device_type}, which is {vm_required_by_workload} VMs, but the'
-          f' cluster only contains {max_vm_in_cluster} VMs of {device_type}.'
-          ' XPK will not create this workload.'
-      )
-      return False
-
   return True
 
 
+def _check_sub_slicing_availability(
+    workload_system: SystemCharacteristics,
+    cluster_system: SystemCharacteristics | None,
+) -> bool:
+  if (
+      (not FeatureFlags.SUB_SLICING_ENABLED)
+      or (not cluster_system)
+      or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
+      or (not cluster_system.supports_sub_slicing)
+      or (workload_system.topology not in SUB_SLICING_TOPOLOGIES)
+  ):
+    return False
+
+  return_code, sub_slicing_enabled = has_sub_slicing_enabled()
+  if return_code != 0 or not sub_slicing_enabled:
+    return False
+
+  return_code, current_version = get_installed_kueue_version(
+      dry_run_version=Version('0.13')
+  )
+
+  return (
+      return_code == 0
+      and current_version is not None
+      and current_version >= _SUB_SLICING_MINIMUM_KUEUE_VERSION
+  )
+
+
 def get_total_chips_requested_from_args(
     args, system: SystemCharacteristics
 ) -> int:
@@ -133,7 +193,7 @@ def get_total_chips_requested_from_args(
   return int(num_chips)
 
 
-def get_cpu_affinity(accelerator_type) -> str:
+def get_cpu_affinity(accelerator_type: AcceleratorType) -> str:
   """Generate affinity rules for CPU nodepools, so that workload pods are
   not scheduled on the default pool machines.
   Args:
@@ -197,10 +257,8 @@ def get_gpu_scheduler(
   """
   gpu_scheduler = gpu_scheduler_yaml.format(
       scheduler_name=args.scheduler,
-      accelerator_label=create_accelerator_label(
-
-      ),
-      machine_label=create_machine_label(system.accelerator_type, system),
+      accelerator_label=create_accelerator_label(system),
+      machine_label=create_machine_label(system),
       node_pool_name=f'{args.cluster}-np-0',
       autoprovisioning_args=autoprovisioning_args,
   )
@@ -215,74 +273,14 @@ def get_gpu_scheduler(
   return gpu_scheduler, return_code
 
 
-def
-
-
-  Args:
-    accelerator_type: type of accelerator.
-    system: system characteristics.
-
-  Returns:
-    The accelerator label.
-  """
-  if accelerator_type == AcceleratorType.CPU:
-    return ''
-  return (
-      f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].accelerator_label}:'
-      f' {system.gke_accelerator}'
-  )
-
-
-def create_tpu_machine_type(accelerator_type, system) -> str:
-  """Generates TPU machine type..
-
-  Args:
-    accelerator_type: type of accelerator.
-    system: system characteristics.
-
-  Returns:
-    The accelerator label.
-  """
-  if accelerator_type == AcceleratorType.TPU:
+def create_tpu_machine_type(system: SystemCharacteristics) -> str:
+  if system.accelerator_type == AcceleratorType.TPU:
     return f'{system.gce_machine_type}'
   return ''
 
 
-def
-
-) -> str:
-  """Generates machine label.
-
-  Args:
-    accelerator_type: type of accelerator.
-    system: system characteristics.
-    autoprovisioning_enabled: describes autoprovisioning enablement.
-
-  Returns:
-    The machine label.
-  """
-  if accelerator_type == AcceleratorType.TPU and not autoprovisioning_enabled:
-    return (
-        f'{AcceleratorTypeToAcceleratorCharacteristics[accelerator_type].machine_label}:'
-        f' {system.topology}'
-    )
-  return ''
-
-
-def create_tpu_topology(
-    accelerator_type, system, autoprovisioning_enabled: bool = False
-) -> str:
-  """Generates TPU topology.
-
-  Args:
-    accelerator_type: type of accelerator.
-    system: system characteristics.
-    autoprovisioning_enabled: describes autoprovisioning enablement.
-
-  Returns:
-    The machine label.
-  """
-  if accelerator_type == AcceleratorType.TPU and not autoprovisioning_enabled:
+def create_tpu_topology(system: SystemCharacteristics) -> str:
+  if system.accelerator_type == AcceleratorType.TPU:
     return f'{system.topology}'
   return ''
 
@@ -299,7 +297,20 @@ def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
   return [
       (
          'kueue.x-k8s.io/podset-required-topology:'
-          f' "
+          f' "{get_slice_topology_level(sub_slicing_topology)}"'
       ),
       f'cloud.google.com/gke-tpu-slice-topology: {sub_slicing_topology}',
   ]
+
+
+def create_placement_policy_label(system: SystemCharacteristics) -> str:
+  name = get_placement_policy_name(system)
+  return f'cloud.google.com/placement-policy-name: {name}'
+
+
+def get_placement_policy_name(system: SystemCharacteristics) -> str:
+  return f'{system.device_type}-{system.topology}-placement-policy'
+
+
+def is_placement_policy_supported(system: SystemCharacteristics) -> bool:
+  return system.requires_workload_policy and is_topology_valid(system.topology)
```
xpk/core/scheduling_test.py
CHANGED

```diff
@@ -14,18 +14,310 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-from
+from argparse import Namespace
+from dataclasses import dataclass
+import dataclasses
+import pytest
+from pytest_mock import MockerFixture
+from xpk.core.capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
+from xpk.core.testing.commands_tester import CommandsTester
+from xpk.utils.feature_flags import FeatureFlags
+from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, get_placement_policy_name, is_placement_policy_supported
+from .system_characteristics import SystemCharacteristics, AcceleratorType, DockerPlatform, get_system_characteristics_by_device_type
 
 
-def
-
+def _get_system_characteristics_or_die(
+    device_type: str,
+) -> SystemCharacteristics:
+  system = get_system_characteristics_by_device_type(device_type)[0]
+  assert system
+  return system
+
 
-
+@pytest.fixture(autouse=True)
+def commands_tester(mocker: MockerFixture) -> CommandsTester:
+  return CommandsTester(
+      mocker=mocker,
+      run_command_for_value_path='xpk.core.kueue_manager.run_command_for_value',
+  )
+
+
+def test_create_sub_slicing_annotations_returns_valid_annotations():
+  result = create_sub_slicing_annotations(sub_slicing_topology='2x4')
 
   assert result == [
       (
           'kueue.x-k8s.io/podset-required-topology:'
-          ' "google.com/gke-tpu-slice-
+          ' "cloud.google.com/gke-tpu-slice-2x4-id"'
       ),
-      'cloud.google.com/gke-tpu-slice-topology:
+      'cloud.google.com/gke-tpu-slice-topology: 2x4',
   ]
+
+
+def test_create_placement_policy_label_returns_valid_label():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=False,
+      topology='1x1x1',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+  )
+  label = create_placement_policy_label(system_characteristics)
+  assert (
+      label
+      == 'cloud.google.com/placement-policy-name: tpu7x-1x1x1-placement-policy'
+  )
+
+
+def test_get_placement_policy_name_returns_valid_name():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=False,
+      topology='1x1x1',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+  )
+  name = get_placement_policy_name(system_characteristics)
+  assert name == 'tpu7x-1x1x1-placement-policy'
+
+
+def test_is_placement_policy_supported_returns_true_for_system_characteristics_supporting_workload_policy_and_having_valid_topology():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=True,
+      topology='1x1x1',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+  )
+  assert is_placement_policy_supported(system_characteristics) is True
+
+
+def test_is_placement_policy_supported_returns_false_for_system_characteristics_not_supporting_workload_policy_and_having_valid_topology():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=False,
+      topology='1x1x1',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+  )
+  assert is_placement_policy_supported(system_characteristics) is False
+
+
+def test_is_placement_policy_supported_returns_false_for_system_characteristics_supporting_workload_policy_and_having_invalid_topology():
+  system_characteristics = SystemCharacteristics(
+      chips_per_vm=1,
+      gce_machine_type='tpu7x-standard-1t',
+      gke_accelerator='tpu7x',
+      requires_workload_policy=True,
+      topology='aaa',
+      vms_per_slice=1,
+      device_type='tpu7x',
+      accelerator_type=AcceleratorType.TPU,
+      supports_sub_slicing=False,
+      docker_platform=DockerPlatform.ARM,
+  )
+  assert is_placement_policy_supported(system_characteristics) is False
+
+
+@dataclass(frozen=True)
+class SchedulingTestCase:
+  workload_system: SystemCharacteristics
+  num_slices: int = 1
+  cluster_system: SystemCharacteristics | None = None
+  resources_config_map: dict[str, str] | None = None
+  sub_slicing_feature_enabled: bool = False
+  kueue_version: str | None = None
+  sub_slicing_topology_set: bool = False
+
+
+SUB_SLICING_CASE = SchedulingTestCase(
+    workload_system=_get_system_characteristics_or_die('v6e-8'),
+    cluster_system=_get_system_characteristics_or_die('v6e-16'),
+    resources_config_map={'v6e-16': '8'},
+    sub_slicing_feature_enabled=True,
+    kueue_version='0.13.0',
+    sub_slicing_topology_set=True,
+    num_slices=1,
+)
+
+NAP_CASE = SchedulingTestCase(
+    workload_system=_get_system_characteristics_or_die('v6e-8'),
+    cluster_system=None,
+    resources_config_map={
+        'tpu-v6e-slice': AUTOPROVISIONING_CONFIG_VALUE,
+        AUTOPROVISIONING_CONFIG_MAXIMUM_KEY: '10',
+    },
+)
+
+
+@pytest.mark.parametrize(
+    'title, case, expected',
+    [
+        (
+            'No resources config map',
+            SchedulingTestCase(
+                workload_system=_get_system_characteristics_or_die('v6e-8'),
+                resources_config_map=None,
+            ),
+            WorkloadScheduling.AVAILABLE,
+        ),
+        (
+            'Cluster system matches and workload fits',
+            SchedulingTestCase(
+                workload_system=_get_system_characteristics_or_die('v6e-8'),
+                resources_config_map={'v6e-8': '8'},
+                num_slices=2,
+            ),
+            WorkloadScheduling.AVAILABLE,
+        ),
+        (
+            'Cluster system does not match',
+            SchedulingTestCase(
+                workload_system=_get_system_characteristics_or_die('v6e-8'),
+                resources_config_map={'tpu7x-32': '16'},
+            ),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            'Workload does not fit',
+            SchedulingTestCase(
+                workload_system=_get_system_characteristics_or_die('v6e-8'),
+                resources_config_map={'v6e-8': '8'},
+                num_slices=100,
+            ),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            'Correct NAP',
+            NAP_CASE,
+            WorkloadScheduling.AVAILABLE,
+        ),
+        (
+            'NAP, too big workload',
+            dataclasses.replace(NAP_CASE, num_slices=100),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            'Correct Sub-slicing',
+            SUB_SLICING_CASE,
+            WorkloadScheduling.SUB_SLICING_AVAILABLE,
+        ),
+        (
+            'Sub-slicing, but disabled flag',
+            dataclasses.replace(
+                SUB_SLICING_CASE, sub_slicing_feature_enabled=False
+            ),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            'Sub-slicing, but low Kueue version',
+            dataclasses.replace(SUB_SLICING_CASE, kueue_version='0.12.0'),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            'Sub-slicing, but no sub-slicing-topology',
+            dataclasses.replace(
+                SUB_SLICING_CASE, sub_slicing_topology_set=False
+            ),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            'Sub-slicing, but workload too big',
+            dataclasses.replace(SUB_SLICING_CASE, num_slices=100),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            'Sub-slicing, but cluster system is incorrect',
+            dataclasses.replace(
+                SUB_SLICING_CASE,
+                cluster_system=_get_system_characteristics_or_die('tpu7x-16'),
+            ),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            'Sub-slicing, but workload system is incorrect',
+            dataclasses.replace(
+                SUB_SLICING_CASE,
+                workload_system=_get_system_characteristics_or_die('tpu7x-8'),
+            ),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            'Sub-slicing, but workload topology is incorrect',
+            dataclasses.replace(
+                SUB_SLICING_CASE,
+                workload_system=_get_system_characteristics_or_die('v6e-2x2'),
+            ),
+            WorkloadScheduling.UNAVAILABLE,
+        ),
+        (
+            (
+                'Sub-slicing should be ignored when a given device is already'
+                ' present in the cluster'
+            ),
+            dataclasses.replace(
+                SUB_SLICING_CASE,
+                workload_system=_get_system_characteristics_or_die('v6e-8'),
+                cluster_system=_get_system_characteristics_or_die('v6e-8'),
+                resources_config_map={'v6e-8': '4'},
+            ),
+            WorkloadScheduling.AVAILABLE,
+        ),
+    ],
+)
+def test_check_if_workload_can_schedule(
+    commands_tester: CommandsTester,
+    title: str,
+    case: SchedulingTestCase,
+    expected: WorkloadScheduling,
+):
+  FeatureFlags.SUB_SLICING_ENABLED = case.sub_slicing_feature_enabled
+  commands_tester.set_result_for_command(
+      (
+          0,
+          f'registry.k8s.io/kueue/kueue:v{case.kueue_version}'
+          if case.kueue_version
+          else '',
+      ),
+      'kubectl get deployment',
+      'image',
+  )
+  commands_tester.set_result_for_command(
+      (0, 'sub-slice-topology' if case.sub_slicing_topology_set else ''),
+      'kubectl get topology',
+  )
+  args = Namespace(
+      cluster='test-cluster',
+      workload='test-workload',
+      num_slices=case.num_slices,
+  )
+
+  assert (
+      check_if_workload_can_schedule(
+          args,
+          workload_system=case.workload_system,
+          cluster_system=case.cluster_system,
+          resources_config_map=case.resources_config_map,
+      )
+      == expected
+  )
```