xpk 0.17.3__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +33 -43
- xpk/commands/cluster_gcluster.py +19 -14
- xpk/commands/cluster_gcluster_test.py +2 -0
- xpk/commands/cluster_test.py +1 -21
- xpk/commands/common.py +39 -6
- xpk/commands/common_test.py +170 -0
- xpk/commands/info.py +9 -5
- xpk/commands/inspector.py +33 -4
- xpk/commands/inspector_test.py +142 -0
- xpk/commands/workload.py +32 -11
- xpk/commands/workload_test.py +71 -3
- xpk/core/blueprint/blueprint_generator.py +19 -8
- xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
- xpk/core/blueprint/testing/data/a4.yaml +3 -1
- xpk/core/capacity.py +37 -17
- xpk/core/capacity_test.py +66 -1
- xpk/core/cluster.py +11 -10
- xpk/core/cluster_private.py +3 -3
- xpk/core/cluster_test.py +29 -2
- xpk/core/config.py +5 -2
- xpk/core/docker_container.py +31 -24
- xpk/core/docker_manager.py +4 -4
- xpk/core/docker_resources.py +4 -1
- xpk/core/kueue_manager.py +6 -8
- xpk/core/kueue_manager_test.py +6 -5
- xpk/core/nap.py +14 -3
- xpk/core/nodepool.py +52 -13
- xpk/core/nodepool_test.py +147 -8
- xpk/core/remote_state/fuse_remote_state.py +1 -1
- xpk/core/scheduling.py +32 -4
- xpk/core/scheduling_test.py +39 -2
- xpk/core/system_characteristics.py +44 -0
- xpk/core/system_characteristics_test.py +11 -0
- xpk/core/telemetry.py +11 -1
- xpk/core/telemetry_test.py +39 -0
- xpk/core/testing/commands_tester.py +26 -0
- xpk/core/testing/commands_tester_test.py +20 -1
- xpk/core/workload_decorators/rdma_decorator.py +9 -0
- xpk/parser/cluster.py +11 -1
- xpk/parser/cluster_test.py +59 -1
- xpk/parser/common.py +11 -17
- xpk/parser/core.py +0 -8
- xpk/parser/storage.py +3 -14
- xpk/utils/console.py +1 -1
- xpk/utils/feature_flags.py +8 -4
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/METADATA +50 -23
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/RECORD +51 -60
- xpk-1.1.0.dist-info/top_level.txt +1 -0
- integration/README.md +0 -19
- integration/__init__.py +0 -15
- integration/docker_manager_test.py +0 -102
- integration/gcluster_a3mega_test.py +0 -215
- integration/gcluster_a3ultra_test.py +0 -187
- integration/gcluster_a4_test.py +0 -187
- integration/gcluster_test.py +0 -107
- xpk/commands/kind.py +0 -265
- xpk/parser/kind.py +0 -95
- xpk/utils/user_input.py +0 -48
- xpk/utils/user_input_test.py +0 -92
- xpk-0.17.3.dist-info/top_level.txt +0 -2
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/WHEEL +0 -0
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.17.3.dist-info → xpk-1.1.0.dist-info}/licenses/LICENSE +0 -0
xpk/core/nodepool_test.py
CHANGED

@@ -20,6 +20,7 @@ from xpk.core.nodepool import (
     ensure_resource_policy_exists,
     get_desired_node_pool_names,
     run_gke_node_pool_create_command,
+    _validate_reservation_count,
 )
 from xpk.core.system_characteristics import AcceleratorType, SystemCharacteristics, DockerPlatform, GpuConfig
 from xpk.core.commands import FailedCommand
@@ -103,6 +104,7 @@ def commands_tester(mocker):
   return CommandsTester(
       mocker,
       run_command_for_value_path="xpk.core.nodepool.run_command_for_value",
+      run_command_batch_path="xpk.core.commands.run_command_batch",
   )
 
 
@@ -119,7 +121,7 @@ def test_ensure_resource_policy_exists_with_existing_policy_retrieves_existing_p
 
   assert len(commands_tester.commands_history) == 1
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies describe resource-policy",
+      "gcloud beta compute resource-policies describe resource-policy",
       "--project=test-project",
       "--region=us-central1",
   )
@@ -129,7 +131,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
     commands_tester: CommandsTester,
 ):
   commands_tester.set_result_for_command(
-      (1, ""), "gcloud compute resource-policies describe"
+      (1, ""), "gcloud beta compute resource-policies describe"
   )
 
   ensure_resource_policy_exists(
@@ -142,16 +144,17 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy(
 
   assert len(commands_tester.commands_history) == 2
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies describe"
+      "gcloud beta compute resource-policies describe"
   )
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies create workload-policy
+      "gcloud beta compute resource-policies create workload-policy"
+      " resource-policy",
       "--project=test-project",
       "--region=us-central1",
       "--accelerator-topology=2x2x1",
   )
   commands_tester.assert_command_not_run(
-      "gcloud compute resource-policies create workload-policy",
+      "gcloud beta compute resource-policies create workload-policy",
       "--accelerator-topology-mode",
   )
 
@@ -160,7 +163,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
     commands_tester: CommandsTester,
 ):
   commands_tester.set_result_for_command(
-      (1, ""), "gcloud compute resource-policies describe"
+      (1, ""), "gcloud beta compute resource-policies describe"
   )
 
   ensure_resource_policy_exists(
@@ -172,7 +175,7 @@ def test_ensure_resource_policy_exists_without_existing_policy_creates_policy_fo
   )
 
   commands_tester.assert_command_run(
-      "gcloud compute resource-policies create workload-policy",
+      "gcloud beta compute resource-policies create workload-policy",
       "--accelerator-topology-mode",
   )
 
@@ -182,7 +185,7 @@ def test_ensure_resource_policy_exits_without_existing_policy_throws_when_creati
 ):
   with pytest.raises(RuntimeError):
     commands_tester.set_result_for_command(
-        (1, ""), "gcloud compute resource-policies"
+        (1, ""), "gcloud beta compute resource-policies"
    )
 
    ensure_resource_policy_exists(
@@ -251,6 +254,7 @@ def test_placement_policy_created_for_gpu_with_valid_topology(
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -284,6 +288,7 @@ def test_placement_policy_not_created_for_gpu_with_invalid_topology(
       device_type="h100-80gb-8",
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
       gpu_config=GpuConfig(requires_topology=True),
   )
@@ -320,6 +325,7 @@ def test_placement_policy_created_for_tpu7x_with_valid_topology(
       requires_workload_policy=True,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
 
@@ -354,6 +360,7 @@ def test_placement_policy_not_created_for_non7x_tpu(
       device_type="v6e-4",
       supports_sub_slicing=True,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
       docker_platform=DockerPlatform.ARM,
   )
 
@@ -429,3 +436,135 @@ def test_display_nodepool_creation_ignores_logs_without_errors(
       mock_xpk_print.call_args_list[0].args[0]
       == "Create Nodepools returned ERROR 1"
   )
+
+
+def test_validate_reservation_count_mismatch(mock_xpk_print):
+  result = _validate_reservation_count(
+      ["res1", "res2"], num_node_pools_to_create=3
+  )
+
+  assert result == 1
+  assert mock_xpk_print.call_count == 1
+  assert (
+      "reservations (2) must match the number of NEW nodepools (3)"
+      in mock_xpk_print.call_args_list[0].args[0]
+  )
+
+
+def test_run_gke_node_pool_create_command_multiple_reservations(
+    mocker,
+    commands_tester: CommandsTester,
+):
+  mocker.patch(
+      "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+  )
+  mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
+  args = mocker.Mock(
+      num_slices=2,
+      reservation="res1,res2",
+      tpu_type="v4-8",
+      device_type=None,
+      cluster="test-cluster",
+      project="test-project",
+      zone="us-central1-a",
+      on_demand=False,
+      spot=False,
+      flex=False,
+      enable_workload_identity=False,
+      enable_gcsfuse_csi_driver=False,
+      host_maintenance_interval="AS_NEEDED",
+      custom_nodepool_arguments="",
+  )
+  system = SystemCharacteristics(
+      topology="2x2x1",
+      vms_per_slice=2,
+      gke_accelerator="tpu-v4",
+      gce_machine_type="ct4p-hightpu-4t",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="v4-8",
+      requires_workload_policy=False,
+      supports_sub_slicing=False,
+      supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
+      docker_platform=DockerPlatform.AMD,
+  )
+  commands_tester.set_result_for_command(
+      (0, ""), "gcloud beta container node-pools list"
+  )
+
+  result = run_gke_node_pool_create_command(args, system, "1.2.3")
+
+  assert result == 0
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-0", "--reservation=res1"
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res2"
+  )
+
+
+def test_run_gke_node_pool_create_command_partial_reservations(
+    mocker,
+    commands_tester: CommandsTester,
+):
+  mocker.patch(
+      "xpk.core.nodepool.get_cluster_location", return_value="us-central1"
+  )
+  mocker.patch("xpk.core.nodepool.get_node_pools_to_delete", return_value=[])
+  mocker.patch("xpk.core.capacity.verify_reservations_exist", return_value=0)
+  args = mocker.Mock(
+      num_slices=3,
+      reservation="res1,res2",
+      tpu_type="v4-8",
+      device_type=None,
+      cluster="test-cluster",
+      project="test-project",
+      zone="us-central1-a",
+      on_demand=False,
+      spot=False,
+      flex=False,
+      enable_workload_identity=False,
+      enable_gcsfuse_csi_driver=False,
+      host_maintenance_interval="AS_NEEDED",
+      custom_nodepool_arguments="",
+  )
+  system = SystemCharacteristics(
+      topology="2x2x1",
+      vms_per_slice=2,
+      gke_accelerator="tpu-v4",
+      gce_machine_type="ct4p-hightpu-4t",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="v4-8",
+      requires_workload_policy=False,
+      supports_sub_slicing=False,
+      supports_super_slicing=False,
+      supports_accelerator_network_profile=True,
+      docker_platform=DockerPlatform.AMD,
+  )
+  commands_tester.set_result_for_command(
+      (0, "test-cluster-np-0"), "gcloud beta container node-pools list"
+  )
+  commands_tester.set_result_for_command(
+      (0, "us-central1-a"),
+      "gcloud",
+      "node-pools describe",
+      '--format="value(locations)"',
+  )
+
+  result = run_gke_node_pool_create_command(args, system, "1.2.3")
+
+  assert result == 0
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "--tpu-topology=2x2x1", times=2
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-1", "--reservation=res1"
+  )
+  commands_tester.assert_command_run(
+      "gcloud", "node-pools create", "test-cluster-np-2", "--reservation=res2"
+  )
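The new tests above pin down a reservation-distribution contract without showing the implementation itself: a comma-separated --reservation list is matched one-to-one, in order, against the node pools that still need to be created, so in the partial case the pre-existing test-cluster-np-0 is skipped and res1/res2 land on np-1/np-2. A minimal sketch of what _validate_reservation_count must do to satisfy these assertions; the exact message wording and argument names here are inferred from the test, not copied from the shipped code:

# Minimal sketch of _validate_reservation_count, inferred from the tests above.
from xpk.utils.console import xpk_print


def _validate_reservation_count(
    reservations: list[str], num_node_pools_to_create: int
) -> int:
  """Returns 0 when each NEW node pool gets exactly one reservation, else 1."""
  if len(reservations) != num_node_pools_to_create:
    # The test only checks that the message contains this substring.
    xpk_print(
        f'The number of reservations ({len(reservations)}) must match the'
        f' number of NEW nodepools ({num_node_pools_to_create}).'
    )
    return 1
  return 0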
xpk/core/remote_state/fuse_remote_state.py
CHANGED

@@ -56,7 +56,7 @@ class FuseStateClient(RemoteStateClient):
 
   def upload_state(self) -> None:
     xpk_print(
-        f'Uploading
+        f'Uploading dependencies from directory {self.state_dir} to bucket:'
         f' {self.bucket}. Path within bucket is: {self._get_bucket_path()}'
     )
     upload_directory_to_gcs(
xpk/core/scheduling.py
CHANGED

@@ -18,7 +18,7 @@ from enum import Enum
 
 from .kueue_manager import get_installed_kueue_version, has_sub_slicing_enabled, has_super_slicing_enabled
 from ..utils.feature_flags import FeatureFlags
-from ..utils.topology import get_slice_topology_level
+from ..utils.topology import get_slice_topology_level, parse_topology
 from ..utils.console import xpk_print
 from ..utils.topology import is_topology_valid
 from ..utils.execution_context import is_dry_run
@@ -33,7 +33,11 @@ from .system_characteristics import (
 from packaging.version import Version
 
 _SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
-_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.
+_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.15.2')
+_SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)
+ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
+    'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
+)
 
 
 class WorkloadScheduling(Enum):
@@ -115,7 +119,7 @@ def check_if_workload_can_schedule(
       args,
       workload_system,
       max_vm_in_cluster=int(resources_config_map[cluster_system.device_type]),
-  ):
+  ) and _check_super_slicing_topology(workload_system):
     return WorkloadScheduling.SUPER_SLICING_AVAILABLE
   else:
     return WorkloadScheduling.UNAVAILABLE
@@ -189,7 +193,6 @@ def _check_super_slicing_availability(
     workload_system: SystemCharacteristics,
     cluster_system: SystemCharacteristics,
 ) -> bool:
-  # TODO: b/465447813 - Add super-slicing workload topology validation.
   if (
       (not FeatureFlags.SUPER_SLICING_ENABLED)
       or (workload_system.gke_accelerator != cluster_system.gke_accelerator)
@@ -212,6 +215,27 @@
   )
 
 
+def _check_super_slicing_topology(
+    workload_system: SystemCharacteristics,
+) -> bool:
+  topology = parse_topology(workload_system.topology)
+  result = (
+      all(size % 4 == 0 and size >= 4 for size in topology)
+      and len(topology) == len(_SUPER_SLICING_MAX_TOPOLOGY)
+      and topology[0] <= topology[1] <= topology[2]
+      and all(a <= b for a, b in zip(topology, _SUPER_SLICING_MAX_TOPOLOGY))
+  )
+
+  if not result:
+    xpk_print(
+        'Error: Invalid super-slicing topology. It must adhere to the format of'
+        ' 4i x 4j x 4k, where i <= j <= k, and i, j, k are integers, with a'
+        ' maximum of 16x24x24.'
+    )
+
+  return result
+
+
 def get_total_chips_requested_from_args(
     args, system: SystemCharacteristics
 ) -> int:
@@ -342,6 +366,10 @@ def create_sub_slicing_annotations(sub_slicing_topology: str) -> list[str]:
   ]
 
 
+def create_tpu_slice_topology_annotation(workload_topology: str) -> str:
+  return f'cloud.google.com/gke-tpu-slice-topology: {workload_topology}'
+
+
 def create_placement_policy_label(
     system: SystemCharacteristics, super_slicing: bool
 ) -> str:
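The shape rule that _check_super_slicing_topology enforces is easy to exercise on its own. Below is a self-contained restatement of the predicate from the hunk above; parse_topology is assumed to split a '4x4x8'-style string on 'x' (the real helper lives in xpk.utils.topology and is not part of this diff):

# Standalone restatement of the super-slicing topology predicate above.
_SUPER_SLICING_MAX_TOPOLOGY = (16, 24, 24)


def parse_topology(topology: str) -> tuple[int, ...]:
  # Assumption: mirrors xpk.utils.topology.parse_topology for 'AxBxC' strings.
  return tuple(int(dim) for dim in topology.split('x'))


def is_valid_super_slicing_topology(topology_str: str) -> bool:
  topology = parse_topology(topology_str)
  return (
      # Every axis is a positive multiple of 4 (the 4i x 4j x 4k rule).
      all(size % 4 == 0 and size >= 4 for size in topology)
      # Exactly three axes.
      and len(topology) == len(_SUPER_SLICING_MAX_TOPOLOGY)
      # Non-decreasing: i <= j <= k.
      and topology[0] <= topology[1] <= topology[2]
      # Within the 16x24x24 maximum, axis by axis.
      and all(a <= b for a, b in zip(topology, _SUPER_SLICING_MAX_TOPOLOGY))
  )


assert is_valid_super_slicing_topology('4x4x8')
assert not is_valid_super_slicing_topology('2x2x1')   # not divisible by four
assert not is_valid_super_slicing_topology('4x4x32')  # exceeds the 24 limit

The two failing shapes are exactly the new UNAVAILABLE cases added to scheduling_test.py below ('tpu7x-2x2x1' and 'tpu7x-4x4x32').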
xpk/core/scheduling_test.py
CHANGED

@@ -22,7 +22,7 @@ from pytest_mock import MockerFixture
 from xpk.core.capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
 from xpk.core.testing.commands_tester import CommandsTester
 from xpk.utils.feature_flags import FeatureFlags
-from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, get_placement_policy_name, is_placement_policy_supported
+from .scheduling import WorkloadScheduling, check_if_workload_can_schedule, create_sub_slicing_annotations, create_placement_policy_label, create_tpu_slice_topology_annotation, get_placement_policy_name, is_placement_policy_supported
 from .system_characteristics import SystemCharacteristics, AcceleratorType, DockerPlatform, get_system_characteristics_by_device_type
 
 
@@ -66,6 +66,7 @@ def test_create_placement_policy_label_returns_valid_label():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   label = create_placement_policy_label(
@@ -89,6 +90,7 @@ def test_get_placement_policy_name_returns_valid_name():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   name = get_placement_policy_name(system_characteristics, super_slicing=False)
@@ -107,6 +109,7 @@ def test_get_placement_policy_name_super_slicing_returns_valid_name():
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   name = get_placement_policy_name(system_characteristics, super_slicing=True)
@@ -125,6 +128,7 @@ def test_is_placement_policy_supported_returns_true_for_system_characteristics_s
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is True
@@ -142,6 +146,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is False
@@ -159,6 +164,7 @@ def test_is_placement_policy_supported_returns_false_for_system_characteristics_
       accelerator_type=AcceleratorType.TPU,
       supports_sub_slicing=False,
       supports_super_slicing=False,
+      supports_accelerator_network_profile=False,
       docker_platform=DockerPlatform.ARM,
   )
   assert is_placement_policy_supported(system_characteristics) is False
@@ -202,7 +208,7 @@ SUPER_SLICING_CASE = SchedulingTestCase(
     cluster_system=_get_system_characteristics_or_die('tpu7x-4x4x4'),
     # 5 4x4x4 cubes:
     resources_config_map={'tpu7x-128': str(64 // 4 * 5)},
-    kueue_version='0.
+    kueue_version='0.15.2',
    super_slicing_feature_enabled=True,
    super_slicing_topology_set=True,
    num_slices=1,
@@ -369,6 +375,28 @@ SUPER_SLICING_CASE = SchedulingTestCase(
         ),
         WorkloadScheduling.UNAVAILABLE,
     ),
+    (
+        'Super-slicing, but workload topology is not divisible by four',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die(
+                'tpu7x-2x2x1'
+            ),
+        ),
+        WorkloadScheduling.UNAVAILABLE,
+    ),
+    (
+        'Super-slicing, but workload topology is too big for super-slice',
+        dataclasses.replace(
+            SUPER_SLICING_CASE,
+            workload_system=_get_system_characteristics_or_die(
+                'tpu7x-4x4x32'
+            ),
+            # 10 cubes, to make sure vms fit:
+            resources_config_map={'tpu7x-128': str(64 // 4 * 10)},
+        ),
+        WorkloadScheduling.UNAVAILABLE,
+    ),
     (
         (
             'Super-slicing should be ignored when a given device is already'
@@ -426,3 +454,12 @@ def test_check_if_workload_can_schedule(
       )
       == expected
   )
+
+
+def test_create_tpu_slice_topology_annotation():
+  workload_system = _get_system_characteristics_or_die('tpu7x-4x4x8')
+
+  assert (
+      create_tpu_slice_topology_annotation(workload_system.topology)
+      == 'cloud.google.com/gke-tpu-slice-topology: 4x4x8'
+  )
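For reference, both annotation strings introduced in scheduling.py are already formatted as YAML "key: value" lines, so whatever manifest plumbing consumes them (it is not part of this diff) can splice them under a pod template's metadata.annotations verbatim. A small illustration of how the two compose; only the string handling is shown, the surrounding workload wiring is assumed:

# Illustration only: the two annotation strings from scheduling.py, composed.
ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
    'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
)


def create_tpu_slice_topology_annotation(workload_topology: str) -> str:
  return f'cloud.google.com/gke-tpu-slice-topology: {workload_topology}'


# Rendered as indented YAML lines under metadata.annotations:
for annotation in (
    ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION,
    create_tpu_slice_topology_annotation('4x4x8'),
):
  print(f'  {annotation}')
# prints:
#   alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool
#   cloud.google.com/gke-tpu-slice-topology: 4x4x8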