PyPI - xpk - Versions diffs - 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl - Mend

xpk 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

xpk/commands/cluster.py +48 -5
xpk/commands/cluster_gcluster.py +3 -0
xpk/commands/cluster_gcluster_test.py +2 -0
xpk/commands/cluster_test.py +203 -0
xpk/commands/common.py +6 -0
xpk/commands/kind.py +2 -0
xpk/commands/workload.py +35 -15
xpk/commands/workload_test.py +1 -0
xpk/core/capacity.py +83 -46
xpk/core/capacity_test.py +82 -28
xpk/core/commands.py +39 -12
xpk/core/kueue_manager.py +42 -11
xpk/core/kueue_manager_test.py +83 -3
xpk/core/nap.py +5 -4
xpk/core/nodepool.py +57 -20
xpk/core/nodepool_test.py +152 -23
xpk/core/pathways.py +2 -1
xpk/core/resources.py +3 -3
xpk/core/scheduling.py +54 -10
xpk/core/scheduling_test.py +118 -13
xpk/core/system_characteristics.py +41 -24
xpk/core/system_characteristics_test.py +37 -4
xpk/core/telemetry.py +5 -0
xpk/core/telemetry_test.py +19 -2
xpk/core/updates.py +1 -1
xpk/main.py +2 -1
xpk/parser/cluster.py +34 -2
xpk/parser/cluster_test.py +117 -0
xpk/parser/common.py +32 -0
xpk/parser/common_test.py +49 -0
xpk/templates/kueue_config.yaml.j2 +21 -5
xpk/templates/kueue_super_slicing_topology.yaml.j2 +9 -0
xpk/utils/kueue.py +6 -2
{xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/METADATA +2 -1
{xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/RECORD +39 -37
{xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/WHEEL +0 -0
{xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/entry_points.txt +0 -0
{xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/licenses/LICENSE +0 -0
{xpk-0.16.0.dist-info → xpk-0.17.0.dist-info}/top_level.txt +0 -0

xpk/core/capacity.py CHANGED Viewed

@@ -15,10 +15,12 @@ limitations under the License.
 """
 import enum
+from dataclasses import dataclass
+from .commands import run_command_with_updates, run_command_for_value
+from .system_characteristics import AcceleratorType
 from ..utils.console import xpk_print, xpk_exit
 from ..utils.kueue import is_queued_cluster
-from .commands import run_command_with_updates, run_command_for_value
 AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
 AUTOPROVISIONING_CONFIG_MINIMUM_KEY = 'minimum_chips'
@@ -42,6 +44,14 @@ class CapacityType(enum.Enum):
   FLEX_START = 'flex_start'
+@dataclass
+class Reservation:
+  project: str
+  name: str
+  block_name: str | None = None
+  sub_block_name: str | None = None
 def print_reservations(args) -> int:
   """Print the reservations in the project.
@@ -107,7 +117,7 @@ def get_capacity_type(args) -> tuple[CapacityType, int]:
 def get_reservation_maintenance_interval(
-    reservation: str, zone: str, project: str
+    reservation_path: str, zone: str, project: str
 ) -> str:
   """Get reservation maintenance interval.
@@ -117,12 +127,10 @@ def get_reservation_maintenance_interval(
   Returns:
     0 if successful and 1 otherwise.
   """
-  reservation_project, reservation_name = get_reservation_project_and_name(
-      reservation, project
-  )
+  reservation = parse_reservation(reservation_path, project)
   command = (
-      f'gcloud beta compute reservations describe {reservation_name}'
-      f' --project={reservation_project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
+      f'gcloud beta compute reservations describe {reservation.name}'
+      f' --project={reservation.project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation maintenance interval'
@@ -134,7 +142,7 @@ def get_reservation_maintenance_interval(
 def get_reservation_placement_policy(
-    reservation: str, zone: str, project: str
+    reservation_path: str, zone: str, project: str
 ) -> str:
   """Get reservation placement policy.
@@ -144,12 +152,10 @@ def get_reservation_placement_policy(
   Returns:
     0 if successful and 1 otherwise.
   """
-  reservation_project, reservation_name = get_reservation_project_and_name(
-      reservation, project
-  )
+  reservation = parse_reservation(reservation_path, project)
   command = (
-      f'gcloud beta compute reservations describe {reservation_name}'
-      f' --project={reservation_project} --zone={zone} --format="value(resourcePolicies.policy)"'
+      f'gcloud beta compute reservations describe {reservation.name}'
+      f' --project={reservation.project} --zone={zone} --format="value(resourcePolicies.policy)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation placement policy'
@@ -161,15 +167,13 @@ def get_reservation_placement_policy(
 def get_reservation_deployment_type(
-    reservation: str, zone: str, project: str
+    reservation_path: str, zone: str, project: str
 ) -> str:
   """Get reservation deployment type."""
-  reservation_project, reservation_name = get_reservation_project_and_name(
-      reservation, project
-  )
+  reservation = parse_reservation(reservation_path, project)
   command = (
-      f'gcloud beta compute reservations describe {reservation_name}'
-      f' --project={reservation_project} --zone={zone} --format="value(deploymentType)"'
+      f'gcloud beta compute reservations describe {reservation.name}'
+      f' --project={reservation.project} --zone={zone} --format="value(deploymentType)"'
   )
   return_code, output = run_command_for_value(
       command, 'Get reservation deployment type', dry_run_return_val='DENSE'
@@ -189,12 +193,10 @@ def verify_reservation_exists(args) -> int:
   Returns:
     0 if successful and 1 otherwise.
   """
-  reservation_project, reservation_name = get_reservation_project_and_name(
-      args.reservation, args.project
-  )
+  reservation = parse_reservation(args.reservation, args.project)
   command = (
-      f'gcloud beta compute reservations describe {reservation_name}'
-      f' --project={reservation_project} --zone={args.zone}'
+      f'gcloud beta compute reservations describe {reservation.name}'
+      f' --project={reservation.project} --zone={args.zone}'
   )
   return_code = run_command_with_updates(command, 'Describe reservation')
   if return_code != 0:
@@ -205,7 +207,10 @@ def verify_reservation_exists(args) -> int:
 def get_capacity_arguments_from_capacity_type(
-    args, capacity_type: CapacityType, max_nodes: int
+    args,
+    capacity_type: CapacityType,
+    max_nodes: int,
+    accelerator_type: AcceleratorType,
 ) -> tuple[str, int]:
   """Determine the Nodepool creation capacity arguments needed.
@@ -231,7 +236,7 @@ def get_capacity_arguments_from_capacity_type(
           ' --location-policy=ANY --reservation-affinity=none'
           f' --no-enable-autorepair --max-nodes={max_nodes}'
       )
-      if is_queued_cluster(args.num_slices):
+      if is_queued_cluster(args.num_slices, accelerator_type):
         capacity_args += ' --enable-queued-provisioning'
     case CapacityType.RESERVATION:
       capacity_args = (
@@ -280,27 +285,59 @@ def get_capacity_node_selectors_from_capacity_type(
   return node_selector, return_code
-def get_reservation_project_and_name(
-    reservation_name_or_path: str, cluster_project: str
-) -> tuple[str, str]:
-  """Get the reservation project and name.
+def parse_reservation(
+    reservation_path: str, cluster_project: str
+) -> Reservation:
+  """Parses the reservation details from the reservation path.
+      Also supports reservation blocks and sub-blocks.
+      Assumes cluster project if project is not contained in the path.
-  Args:
-    reservation_name_or_path: either reservation name or reservation path in format
-      projects/RESERVATION_PROJECT_ID/reservations/RESERVATION_NAME
-    cluster_project: the cluster project
+      Args:
+        reservation_path: path to the reservation, reservation block or sub-block in format:
+  `[projects/RESERVATION_PROJECT_ID/reservations/]RESERVATION_NAME[/reservationBlocks/BLOCK_NAME[/reservationSubBlocks/SUB_BLOCK_NAME]]`
+        cluster_project: the cluster project
-  Returns:
-    Tuple with reservation project and reservation name.
+      Returns:
+        Reservation instance containing reservation details.
   """
-  if '/' not in reservation_name_or_path:
-    return cluster_project, reservation_name_or_path
-  reservation_parts = reservation_name_or_path.split('/')
-  if (
-      len(reservation_parts) != 4
-      or reservation_parts[0] != 'projects'
-      or reservation_parts[2] != 'reservations'
-  ):
-    xpk_print('Unable to parse reservation: ', reservation_name_or_path)
+  reservation = _try_parse_reservation(reservation_path, cluster_project)
+  if reservation is None:
+    xpk_print('Unable to parse reservation: ', reservation_path)
     xpk_exit(1)
-  return reservation_parts[1], reservation_parts[3]
+  return reservation
+def _try_parse_reservation(
+    reservation_path: str, cluster_project: str
+) -> Reservation | None:
+  # assume trivial case, path contains just the reservation name
+  reservation = Reservation(
+      project=cluster_project,
+      name=reservation_path,
+      block_name=None,
+      sub_block_name=None,
+  )
+  parts = reservation_path.split('/')
+  if min(map(len, parts)) == 0:  # all parts must be non-empty
+    return None
+  if len(parts) == 1:
+    return reservation  # trivial case
+  if parts[0] == 'projects':
+    reservation.project = parts[1]
+    if len(parts) < 4 or parts[2] != 'reservations':
+      return None
+    parts = parts[3:]  # remove projects/PROJECT/reservations/ prefix
+  if len(parts) not in (1, 3, 5):
+    return None
+  reservation.name = parts[0]
+  if len(parts) >= 3:
+    if parts[1] != 'reservationBlocks':
+      return None
+    reservation.block_name = parts[2]
+    if len(parts) >= 5:
+      if parts[3] != 'reservationSubBlocks':
+        return None
+      reservation.sub_block_name = parts[4]
+  return reservation

xpk/core/capacity_test.py CHANGED Viewed

@@ -16,7 +16,7 @@ limitations under the License.
 import pytest
 from unittest.mock import MagicMock, patch
-from .capacity import get_reservation_deployment_type, get_reservation_project_and_name
+from .capacity import get_reservation_deployment_type, parse_reservation, Reservation
 @patch('xpk.core.capacity.xpk_print')
@@ -28,7 +28,7 @@ def test_get_reservation_deployment_type_exits_with_command_fails(
   )
   with pytest.raises(SystemExit):
     get_reservation_deployment_type(
-        reservation='reservation', zone='zone', project='project'
+        reservation_path='reservation', zone='zone', project='project'
     )
   assert (
@@ -45,37 +45,91 @@ def test_get_reservation_deployment_type_returns_deployment_type_when_command_su
       return_value=(0, 'DENSE'),
   )
   result = get_reservation_deployment_type(
-      reservation='reservation', zone='zone', project='project'
+      reservation_path='reservation', zone='zone', project='project'
   )
   assert result == 'DENSE'
-def test_get_reservation_project_and_name_parses_local_reservation():
-  project, name = get_reservation_project_and_name(
-      'test-reservation', 'cluster-project'
-  )
-  assert project == 'cluster-project'
-  assert name == 'test-reservation'
-def test_get_reservation_project_and_name_parses_shared_reservation():
-  project, name = get_reservation_project_and_name(
-      'projects/reservation-project/reservations/test-reservation',
-      'cluster-project',
-  )
-  assert project == 'reservation-project'
-  assert name == 'test-reservation'
+@pytest.mark.parametrize(
+    argnames='reservation_path,expected_reservation',
+    argvalues=[
+        (
+            'reservation',
+            Reservation(project='cluster-project', name='reservation'),
+        ),
+        (
+            'reservation/reservationBlocks/block',
+            Reservation(
+                project='cluster-project',
+                name='reservation',
+                block_name='block',
+            ),
+        ),
+        (
+            'reservation/reservationBlocks/block/reservationSubBlocks/subblock',
+            Reservation(
+                project='cluster-project',
+                name='reservation',
+                block_name='block',
+                sub_block_name='subblock',
+            ),
+        ),
+        (
+            'projects/p/reservations/reservation',
+            Reservation(project='p', name='reservation'),
+        ),
+        (
+            'projects/p/reservations/reservation/reservationBlocks/block',
+            Reservation(
+                project='p',
+                name='reservation',
+                block_name='block',
+            ),
+        ),
+        (
+            'projects/p/reservations/reservation/reservationBlocks/block/reservationSubBlocks/subblock',
+            Reservation(
+                project='p',
+                name='reservation',
+                block_name='block',
+                sub_block_name='subblock',
+            ),
+        ),
+    ],
+)
+def test_parse_reservation_parses_valid_reservations(
+    reservation_path: str,
+    expected_reservation: Reservation,
+):
+  actual_reservation = parse_reservation(reservation_path, 'cluster-project')
+  assert actual_reservation == expected_reservation
+@pytest.mark.parametrize(
+    argnames='reservation_path',
+    argvalues=[
+        '',
+        '/name',
+        'name/',
+        'name/reservationBlocks/',
+        'name/reservationBlocks/block/reservationSubBlocks/',
+        'name/reservationBlocks/block/reservationSubBlocks/subblock/extra',
+        'name/reservationBlock/block/reservationSubBlocks/subblock',
+        'name/reservationBlocks/block/reservationSubBlock/subblock',
+        'reservations/name',
+        'project/p/reservations/name',
+        'projects/p/reservation/name',
+        'projects/p/reservations',
+        'projects/p/reservations/name/reservationBlocks/block/reservationSubBlocks/subblock/extra',
+        'projects/p/reservations/name/reservationBlocks//reservationSubBlocks/subblock',
+    ],
+)
 @patch('xpk.core.capacity.xpk_print')
-def test_get_reservation_project_and_name_fails_for_invalid_reservation(
-    xpk_print: MagicMock, mocker
+def test_parse_reservation_fails_on_invalid_reservations(
+    xpk_print: MagicMock, reservation_path: str
 ):
   with pytest.raises(SystemExit):
-    get_reservation_project_and_name(
-        'invalid/reservation',
-        'cluster-project',
-    )
+    parse_reservation(reservation_path, 'cluster-project')
   assert 'Unable to parse reservation' in xpk_print.mock_calls[0].args[0]

xpk/core/commands.py CHANGED Viewed

@@ -19,13 +19,27 @@ import subprocess
 import sys
 import time
+from dataclasses import dataclass
 from ..utils.objects import chunks
 from ..utils.file import make_tmp_files, write_tmp_file
 from ..utils.console import xpk_print
 from ..utils.execution_context import is_dry_run
-def run_commands(commands, jobname, per_command_name, batch=10):
+@dataclass
+class FailedCommand:
+  return_code: int
+  name: str
+  command: str
+  logfile: str
+def run_commands(
+    commands: list[str],
+    jobname: str,
+    per_command_name: list[str],
+    batch: int = 10,
+) -> FailedCommand | None:
   """Run commands in groups of `batch`.
   Args:
@@ -35,8 +49,10 @@ def run_commands(commands, jobname, per_command_name, batch=10):
     batch: number of commands to run in parallel.
   Returns:
-    0 if successful and 1 otherwise.
+    None if all commands were successful, FailedCommand instance containing
+    details of a single failing command otherwise
   """
   temporary_files_batches = chunks(make_tmp_files(per_command_name), batch)
   commands_batched = chunks(commands, batch)
   per_command_name_batches = chunks(per_command_name, batch)
@@ -47,24 +63,27 @@ def run_commands(commands, jobname, per_command_name, batch=10):
   )
   if is_dry_run():
     xpk_print('Pretending all the jobs succeeded')
-    return 0
+    return None
-  max_return_code = 0
   for i, _ in enumerate(commands_batched):
     xpk_print(f'Dispatching batch {i}/{len(commands_batched)}')
-    batch_max_return_code, _ = run_command_batch(
+    maybe_failure = run_command_batch(
         commands_batched[i],
         jobname,
         per_command_name_batches[i],
         temporary_files_batches[i],
     )
-    max_return_code = max(max_return_code, batch_max_return_code)
-    if max_return_code > 0:
-      return max_return_code
-  return max_return_code
+    if maybe_failure is not None:
+      return maybe_failure
+  return None
-def run_command_batch(commands, jobname, per_command_name, output_logs):
+def run_command_batch(
+    commands: list[str],
+    jobname: str,
+    per_command_name: list[str],
+    output_logs: list[str],
+) -> FailedCommand | None:
   """Runs commands in parallel.
   Args:
@@ -74,7 +93,8 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
     output_logs: list of n log paths, each command will output to each log.
   Returns:
-    The max return code and a list of all the return codes.
+    None if all commands were successful, FailedCommand instance containing
+    details of a single failing command otherwise
   """
   files = [open(f, 'w', encoding='utf-8') for f in output_logs]
@@ -86,6 +106,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
         subprocess.Popen(command, stdout=file, stderr=file, shell=True)
     )
+  maybe_failure: FailedCommand | None = None
   while True:
     returncodes = [child.poll() for child in children]
     max_returncode = max([0] + [r for r in returncodes if r is not None])
@@ -118,6 +139,12 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
       )
       for child in children:
         child.terminate()
+      maybe_failure = FailedCommand(
+          return_code=returncodes[failing_index] or 0,
+          name=per_command_name[failing_index],
+          command=commands[failing_index],
+          logfile=output_logs[failing_index],
+      )
       break
     if completed == total:
@@ -128,7 +155,7 @@ def run_command_batch(commands, jobname, per_command_name, output_logs):
   for file in files:
     file.close()
-  return max_returncode, returncodes
+  return maybe_failure
 def run_command_with_updates_retry(

xpk/core/kueue_manager.py CHANGED Viewed

@@ -15,7 +15,6 @@ limitations under the License.
 """
 import math
-import textwrap
 from dataclasses import dataclass
 from typing import Optional, List, Dict, Any
 import json
@@ -48,10 +47,12 @@ WAIT_FOR_KUEUE_TIMEOUT = "10m"
 CLUSTER_QUEUE_NAME = "cluster-queue"
 LOCAL_QUEUE_NAME = "multislice-queue"
 SUB_SLICE_TOPOLOGY_NAME = "sub-slice-topology"
+SUPER_SLICE_TOPOLOGY_NAME = "super-slice-topology"
 KUEUE_CONFIG_JINJA_FILE = "kueue_config.yaml.j2"
 KUEUE_GKE_DEFAULT_TOPOLOGY_JINJA_FILE = "kueue_gke_default_topology.yaml.j2"
 KUEUE_CONTROLLER_MANAGER_JINJA_FILE = "kueue_controller_manager.yaml.j2"
 KUEUE_SUB_SLICING_TOPOLOGY_JINJA_FILE = "kueue_sub_slicing_topology.yaml.j2"
+KUEUE_SUPER_SLICING_TOPOLOGY_JINJA_FILE = "kueue_super_slicing_topology.yaml.j2"
 MEMORY_SIZE_PER_VM = 1.2
 MIN_MEMORY_LIMIT_SIZE = 4096
@@ -63,6 +64,7 @@ class KueueConfig:
   cpu_limit: int
   memory_limit: str
   configure_sub_slicing: bool
+  configure_super_slicing: bool
   is_pathways_cluster: bool = False
   autoprovisioning_enabled: bool = False
   flex: bool = False
@@ -268,7 +270,9 @@ class KueueManager:
     template = self.template_env.get_template(KUEUE_CONFIG_JINJA_FILE)
     topology_name_and_yaml = self.__get_topology_name_and_yaml(
-        kueue_config.system, kueue_config.configure_sub_slicing
+        kueue_config.system,
+        kueue_config.configure_sub_slicing,
+        kueue_config.configure_super_slicing,
     )
     topology_name = (
         topology_name_and_yaml.name if topology_name_and_yaml else None
@@ -324,7 +328,11 @@ class KueueManager:
       key, value = accelerator_label.split(":", 1)
       node_labels_dict[key] = value.strip()
-    if not autoprovisioning:
+    if system.supports_super_slicing:
+      node_labels_dict["cloud.google.com/gke-tpu-partition-4x4x4-state"] = (
+          "HEALTHY"
+      )
+    elif not autoprovisioning:
       machine_label = create_machine_label(system)
       if machine_label:
         key, value = machine_label.split(":", 1)
@@ -374,13 +382,11 @@ class KueueManager:
           }],
       })
-    if flex and is_queued_cluster(num_slices):
-      admission_checks = textwrap.dedent("""
-        admissionChecks:
-        - dws-prov
-      """)
-    else:
-      admission_checks = ""
+    admission_checks = []
+    if system.supports_super_slicing:
+      admission_checks.append("ss-kueue-operator")
+    if flex and is_queued_cluster(num_slices, system.accelerator_type):
+      admission_checks.append("dws-prov")
     return {
         "flavors": flavors,
@@ -393,7 +399,10 @@ class KueueManager:
     }
   def __get_topology_name_and_yaml(
-      self, system: SystemCharacteristics, configure_sub_slicing: bool
+      self,
+      system: SystemCharacteristics,
+      configure_sub_slicing: bool,
+      configure_super_slicing: bool,
   ) -> _NameAndYaml | None:
     if (
         system.accelerator_type == AcceleratorType["GPU"]
@@ -427,6 +436,15 @@ class KueueManager:
               "levels": levels,
           }),
       )
+    elif configure_super_slicing:
+      return _NameAndYaml(
+          name=SUPER_SLICE_TOPOLOGY_NAME,
+          yaml=self.template_env.get_template(
+              KUEUE_SUPER_SLICING_TOPOLOGY_JINJA_FILE
+          ).render({
+              "super_slice_topology_name": SUPER_SLICE_TOPOLOGY_NAME,
+          }),
+      )
     else:
       return None
@@ -552,6 +570,19 @@ def has_sub_slicing_enabled() -> tuple[int, bool | None]:
   return return_code, SUB_SLICE_TOPOLOGY_NAME in value
+def has_super_slicing_enabled() -> tuple[int, bool | None]:
+  return_code, value = run_command_for_value(
+      command="kubectl get topology",
+      task="Get defined topologies",
+      dry_run_return_val=SUPER_SLICE_TOPOLOGY_NAME,
+  )
+  if return_code != 0:
+    return return_code, None
+  return return_code, SUPER_SLICE_TOPOLOGY_NAME in value
 def _autocorrect_cpu_limit(cpu_limit: int, cpu_capacity: int) -> int:
   if cpu_limit > cpu_capacity:
     xpk_print(

xpk 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl

xpk 0.16.0__py3-none-any.whl → 0.17.0__py3-none-any.whl