PyPI - xpk - Versions diffs - 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl - Mend

xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)

integration/README.md +19 -0
xpk/blueprints/a3mega/config-map.yaml.tftpl +15 -0
xpk/blueprints/a3mega/storage_crd.yaml +52 -0
xpk/blueprints/a3ultra/config-map.yaml.tftpl +15 -0
xpk/blueprints/a3ultra/mlgru-disable.yaml +59 -0
xpk/blueprints/a3ultra/nccl-installer.yaml +95 -0
xpk/blueprints/a3ultra/storage_crd.yaml +52 -0
xpk/blueprints/a4/config-map.yaml.tftpl +15 -0
xpk/blueprints/a4/nccl-rdma-installer-a4.yaml +66 -0
xpk/blueprints/a4/storage_crd.yaml +52 -0
xpk/commands/cluster.py +33 -12
xpk/commands/cluster_gcluster_test.py +5 -1
xpk/commands/cluster_test.py +125 -0
xpk/commands/config.py +3 -3
xpk/commands/inspector.py +5 -3
xpk/commands/kind.py +2 -0
xpk/commands/managed_ml_diagnostics.py +249 -0
xpk/commands/managed_ml_diagnostics_test.py +146 -0
xpk/commands/workload.py +124 -139
xpk/commands/workload_test.py +160 -118
xpk/core/blueprint/blueprint_generator.py +3 -0
xpk/core/blueprint/testing/data/a3_mega.yaml +129 -0
xpk/core/blueprint/testing/data/a3_mega_spot.yaml +125 -0
xpk/core/blueprint/testing/data/a3_ultra.yaml +173 -0
xpk/core/blueprint/testing/data/a4.yaml +185 -0
xpk/core/capacity.py +2 -0
xpk/core/cluster.py +18 -47
xpk/core/cluster_test.py +76 -1
xpk/core/config.py +81 -7
xpk/core/config_test.py +67 -11
xpk/core/docker_container.py +3 -1
xpk/core/docker_image.py +10 -6
xpk/core/docker_resources.py +1 -10
xpk/core/kjob.py +17 -16
xpk/core/kueue_manager.py +13 -19
xpk/core/kueue_manager_test.py +27 -1
xpk/core/nap.py +13 -14
xpk/core/nodepool.py +17 -15
xpk/core/nodepool_test.py +25 -4
xpk/core/pathways.py +23 -0
xpk/core/pathways_test.py +57 -0
xpk/core/resources.py +84 -27
xpk/core/scheduling.py +128 -132
xpk/core/scheduling_test.py +215 -2
xpk/core/system_characteristics.py +179 -0
xpk/core/system_characteristics_test.py +49 -1
xpk/core/telemetry.py +4 -4
xpk/core/telemetry_test.py +9 -9
xpk/core/vertex.py +4 -3
xpk/core/workload_decorators/tcpx_decorator.py +5 -1
xpk/main.py +2 -0
xpk/parser/cluster.py +22 -88
xpk/parser/cluster_test.py +41 -0
xpk/parser/common.py +84 -0
xpk/parser/storage.py +10 -0
xpk/parser/storage_test.py +47 -0
xpk/parser/workload.py +14 -41
xpk/parser/workload_test.py +2 -48
xpk/templates/arm_gpu_workload_crate.yaml.j2 +46 -0
xpk/utils/feature_flags.py +3 -0
xpk/utils/validation.py +2 -2
xpk-0.16.0.dist-info/METADATA +127 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/RECORD +67 -48
xpk-0.15.0.dist-info/METADATA +0 -1666
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/WHEEL +0 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/entry_points.txt +0 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/licenses/LICENSE +0 -0
{xpk-0.15.0.dist-info → xpk-0.16.0.dist-info}/top_level.txt +0 -0

xpk/core/workload_decorators/tcpx_decorator.py CHANGED Viewed

@@ -39,12 +39,16 @@ def decorate_job(job_manifest: dict) -> dict:
   return job_manifest
-def decorate_jobset(jobset_manifest_str: str) -> str:
+def decorate_jobset(  # pylint: disable=dangerous-default-value
+    jobset_manifest_str: str,
+    sub_networks: list[str] = [],  # pylint: disable=unused-argument
+) -> str:
   """
   Decorates a JobSet manifest with the necessary components for tcpxo-daemon.
   Args:
     jobset_manifest_str: The JobSet manifest as a YAML string.
+    sub_networks: This parameter is accepted for interface consistency but is not used.
   Returns:
     The modified JobSet manifest as a YAML string.

xpk/main.py CHANGED Viewed

@@ -37,6 +37,7 @@ import sys
 from .parser.core import set_parser
 from .core.updates import print_xpk_hello
+from .core.config import set_config, FileSystemConfig
 from .core.telemetry import MetricsCollector, send_clearcut_payload, should_send_telemetry
 from .utils.console import xpk_print, exit_code_to_int
 from .utils.execution_context import set_context
@@ -69,6 +70,7 @@ def main() -> None:
     main_args = parser.parse_args()
     main_args.enable_ray_cluster = False
+    set_config(FileSystemConfig())
     set_context(
         dry_run_value='dry_run' in main_args and main_args.dry_run,
         quiet_value=(

xpk/parser/cluster.py CHANGED Viewed

@@ -26,11 +26,10 @@ from ..commands.cluster import (
     cluster_describe,
     cluster_list,
 )
-from ..core.config import xpk_config
-from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType
+from ..core.config import get_config
 from ..core.config import CFG_BUCKET_KEY
 from ..core.vertex import DEFAULT_VERTEX_TENSORBOARD_NAME
-from .common import add_shared_arguments, ParserOrArgumentGroup
+from .common import add_shared_arguments, ParserOrArgumentGroup, add_tpu_type_argument, add_tpu_and_device_type_arguments
 from .validators import name_type
 from ..utils.feature_flags import FeatureFlags
@@ -99,27 +98,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
           required=True
       )
   )
-  cluster_device_group.add_argument(
-      '--tpu-type',
-      type=str,
-      default=None,
-      help='The tpu type to use, v5litepod-16, etc.',
-      metavar='TPU_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(
-          [AcceleratorType.TPU]
-      ),
-  )
-  cluster_device_group.add_argument(
-      '--device-type',
-      type=str,
-      default=None,
-      help=(
-          'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
-          ' h100-80gb-8, n2-standard-32-4 etc.'
-      ),
-      metavar='DEVICE_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(),
-  )
+  add_tpu_and_device_type_arguments(cluster_device_group)
   ### Optional arguments specific to "cluster create"
   cluster_create_optional_arguments = cluster_create_parser.add_argument_group(
@@ -131,7 +110,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
   cluster_create_optional_arguments.add_argument(
       '--cluster-state-gcs-bucket',
       type=str,
-      default=xpk_config.get(CFG_BUCKET_KEY),
+      default=get_config().get(CFG_BUCKET_KEY),
       help='The name of the bucket to store cluster state.',
       required=False,
   )
@@ -150,6 +129,7 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
           ' enable cluster to accept Pathways workloads.'
       ),
   )
   if FeatureFlags.SUB_SLICING_ENABLED:
     add_cluster_create_sub_slicing_arguments(cluster_create_optional_arguments)
@@ -207,15 +187,8 @@ def set_cluster_create_pathways_parser(
   add_shared_cluster_create_required_arguments(
       cluster_create_pathways_required_arguments
   )
-  cluster_create_pathways_required_arguments.add_argument(
-      '--tpu-type',
-      type=str,
-      default=None,
-      help='The tpu type to use, v5litepod-16, etc.',
-      metavar='TPU_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(
-          [AcceleratorType.TPU]
-      ),
+  add_tpu_type_argument(
+      cluster_create_pathways_required_arguments, required=True
   )
   ### Optional arguments specific to "cluster create-pathways"
@@ -292,17 +265,8 @@ def set_cluster_create_ray_parser(cluster_create_ray_parser: ArgumentParser):
   add_shared_cluster_create_required_arguments(
       cluster_create_ray_required_arguments
   )
-  cluster_create_ray_required_arguments.add_argument(
-      '--tpu-type',
-      type=str,
-      default=None,
-      help='The tpu type to use, v5litepod-16, etc.',
-      required=True,
-      metavar='TPU_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(
-          [AcceleratorType.TPU]
-      ),
-  )
+  add_tpu_type_argument(cluster_create_ray_required_arguments, required=True)
   # TODO(bzmarke): Add --device-type to support GPU/CPU
   cluster_create_ray_required_arguments.add_argument(
       '--ray-version',
@@ -392,7 +356,7 @@ def set_cluster_delete_parser(cluster_delete_parser: ArgumentParser):
   cluster_delete_optional_arguments.add_argument(
       '--cluster-state-gcs-bucket',
       type=str,
-      default=xpk_config.get(CFG_BUCKET_KEY),
+      default=get_config().get(CFG_BUCKET_KEY),
       help='The name of the bucket to store cluster state.',
       required=False,
   )
@@ -421,27 +385,7 @@ def set_cluster_cacheimage_parser(cluster_cacheimage_parser: ArgumentParser):
   )
   ### Device Type Argument
-  cluster_cacheimage_group.add_argument(
-      '--tpu-type',
-      type=str,
-      default=None,
-      help='The tpu type to cache images on, v5litepod-16, etc.',
-      metavar='TPU_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(
-          [AcceleratorType.TPU]
-      ),
-  )
-  cluster_cacheimage_group.add_argument(
-      '--device-type',
-      type=str,
-      default=None,
-      help=(
-          'The device type to cache images on (can be tpu or gpu),'
-          ' v5litepod-16, h100-80gb-8, etc.'
-      ),
-      metavar='DEVICE_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(),
-  )
+  add_tpu_and_device_type_arguments(cluster_cacheimage_group)
   ### Required arguments
   cluster_cacheimage_required_arguments.add_argument(
@@ -526,27 +470,7 @@ def set_cluster_adapt_parser(cluster_adapt_parser: ArgumentParser):
           required=True
       )
   )
-  cluster_adapt_device_group.add_argument(
-      '--tpu-type',
-      type=str,
-      default=None,
-      help='The tpu type used on cluster, v5litepod-16, etc.',
-      metavar='TPU_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(
-          [AcceleratorType.TPU]
-      ),
-  )
-  cluster_adapt_device_group.add_argument(
-      '--device-type',
-      type=str,
-      default=None,
-      help=(
-          'The device type used on cluster (can be tpu or gpu or cpu), eg.'
-          ' h100-80gb-8, n2-standard-32-4 etc.'
-      ),
-      metavar='DEVICE_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(),
-  )
+  add_tpu_and_device_type_arguments(cluster_adapt_device_group)
   cluster_adapt_optional_arguments = cluster_adapt_parser.add_argument_group(
       'Optional Arguments',
@@ -691,6 +615,11 @@ def add_shared_cluster_create_optional_arguments(
           ' regional clusters, all zones must support the machine type.'
       ),
   )
+  parser_or_group.add_argument(
+      '--managed-mldiagnostics',
+      action='store_true',
+      help='Enables the installation of required ML Diagnostics components.',
+  )
   parser_or_group.add_argument(
       '--cluster-cpu-machine-type',
       type=str,
@@ -819,6 +748,11 @@ def add_driver_arguments(parser_or_group: ParserOrArgumentGroup):
       action='store_true',
       help='Enable Lustre CSI driver on the cluster.',
   )
+  parser_or_group.add_argument(
+      '--enable-legacy-lustre-port',
+      action='store_true',
+      help='Enable legacy port for Lustre CSI driver on the cluster.',
+  )
 def add_shared_cluster_create_tensorboard_arguments(

xpk/parser/cluster_test.py CHANGED Viewed

@@ -103,3 +103,44 @@ def test_cluster_create_ray_sub_slicing_is_hidden_but_set_to_false():
   assert args.sub_slicing is False
   assert "--sub-slicing" not in help_str
+def test_cluster_create_managed_mldiagnostics():
+  parser = argparse.ArgumentParser()
+  set_cluster_create_parser(parser)
+  args = parser.parse_args([
+      "--cluster",
+      "test-cluster",
+      "--tpu-type",
+      "v5p-8",
+      "--managed-mldiagnostics",
+  ])
+  assert args.managed_mldiagnostics is True
+def test_cluster_create_enable_lustre_legacy_port_is_false_by_default():
+  parser = argparse.ArgumentParser()
+  set_cluster_create_parser(parser)
+  args = parser.parse_args(
+      ["--cluster", "test-cluster", "--tpu-type", "tpu7x-2"]
+  )
+  assert args.enable_legacy_lustre_port is False
+def test_cluster_create_enable_lustre_legacy_port_can_be_set():
+  parser = argparse.ArgumentParser()
+  set_cluster_create_parser(parser)
+  args = parser.parse_args([
+      "--cluster",
+      "test-cluster",
+      "--tpu-type",
+      "tpu7x-2",
+      "--enable-legacy-lustre-port",
+  ])
+  assert args.enable_legacy_lustre_port is True

xpk/parser/common.py CHANGED Viewed

@@ -16,6 +16,10 @@ limitations under the License.
 import argparse
 from typing import Protocol, Any
+from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType
+import difflib
+from argcomplete import ChoicesCompleter
+from argparse import Action, ArgumentError
 class ParserOrArgumentGroup(Protocol):
@@ -24,6 +28,46 @@ class ParserOrArgumentGroup(Protocol):
     ...
+class ManyChoicesAction(Action):
+  """An action class to output better error message for arguments with large lists of choices."""
+  def __init__(self, *args, large_choice_list, **kwargs):
+    self.large_list_of_choices = large_choice_list
+    super().__init__(*args, **kwargs)
+  def __call__(self, parser, namespace, value, option_string=None):
+    if value not in self.large_list_of_choices:
+      close_matches = difflib.get_close_matches(
+          value, self.large_list_of_choices, n=5, cutoff=0
+      )
+      msg = (
+          f"invalid choice: '{value}' (closest matches:"
+          f" {', '.join(close_matches)})"
+      )
+      raise ArgumentError(self, msg)
+    setattr(namespace, self.dest, value)
+def add_many_choices_argument(
+    parserOrGroup: ParserOrArgumentGroup,
+    flag_name,
+    choices: list[str],
+    metavar: str,
+    help_msg: str,
+    required: bool = False,
+) -> None:
+  parserOrGroup.add_argument(
+      flag_name,
+      action=ManyChoicesAction,
+      large_choice_list=choices,
+      type=str,
+      metavar=metavar,
+      help=help_msg,
+      required=required,
+      default=None,
+  ).completer = ChoicesCompleter(choices)
 def add_shared_arguments(
     custom_parser_or_group: ParserOrArgumentGroup, required=False
 ) -> None:
@@ -285,3 +329,43 @@ def add_slurm_arguments(custom_parser_or_group: ParserOrArgumentGroup):
           ' `very-high`. Defaults to `medium`.'
       ),
   )
+def add_tpu_type_argument(
+    custom_parser_or_group: ParserOrArgumentGroup,
+    required: bool = False,
+) -> None:
+  add_many_choices_argument(
+      custom_parser_or_group,
+      '--tpu-type',
+      choices=get_system_characteristics_keys_by_accelerator_type(
+          [AcceleratorType.TPU]
+      ),
+      metavar='TPU_TYPE',
+      help_msg='The tpu type to use, v5litepod-16, etc.',
+      required=required,
+  )
+def add_device_type_argument(
+    custom_parser_or_group: ParserOrArgumentGroup,
+    required: bool = False,
+) -> None:
+  add_many_choices_argument(
+      custom_parser_or_group,
+      '--device-type',
+      choices=get_system_characteristics_keys_by_accelerator_type(),
+      metavar='DEVICE_TYPE',
+      help_msg=(
+          'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
+          ' h100-80gb-8, n2-standard-32-4 etc.'
+      ),
+      required=required,
+  )
+def add_tpu_and_device_type_arguments(
+    custom_parser_or_group: ParserOrArgumentGroup,
+) -> None:
+  add_tpu_type_argument(custom_parser_or_group)
+  add_device_type_argument(custom_parser_or_group)

xpk/parser/storage.py CHANGED Viewed

@@ -104,6 +104,16 @@ def add_storage_attach_parser(
       help='If true workloads can only read from storage',
   )
+  lustre_args = storage_attach_parser.add_argument_group(
+      'Lustre arguments',
+      'Arguments used when --type=lustre',
+  )
+  lustre_args.add_argument(
+      '--enable-legacy-lustre-port',
+      action='store_true',
+      help='Enable legacy port for Lustre CSI driver on the cluster.',
+  )
   gcsfuse_args = storage_attach_parser.add_argument_group(
       'FUSE arguments',
       'Arguments used when --type=gcsfuse',

xpk/parser/storage_test.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""
+Copyright 2025 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import argparse
+from xpk.parser.storage import set_storage_parser
+DEFAULT_ATTACH_ARGUMENTS = (
+    "attach test-storage --cluster test-cluster --zone test-zone"
+    " --project test-project --mount-point test-mount-point"
+    " --readonly false --auto-mount true"
+)
+DEFAULT_LUSTRE_ATTACH_ARGUMENTS = (
+    DEFAULT_ATTACH_ARGUMENTS + " --type lustre --manifest test-manifest"
+)
+def test_cluster_create_enable_lustre_legacy_port_is_false_by_default():
+  parser = argparse.ArgumentParser()
+  set_storage_parser(parser)
+  args = parser.parse_args(DEFAULT_LUSTRE_ATTACH_ARGUMENTS.split())
+  assert args.enable_legacy_lustre_port is False
+def test_cluster_create_enable_lustre_legacy_port_can_be_set():
+  parser = argparse.ArgumentParser()
+  set_storage_parser(parser)
+  args = parser.parse_args(
+      DEFAULT_LUSTRE_ATTACH_ARGUMENTS.split() + ["--enable-legacy-lustre-port"]
+  )
+  assert args.enable_legacy_lustre_port is True

xpk/parser/workload.py CHANGED Viewed

@@ -22,10 +22,8 @@ from ..commands.workload import (
     workload_list,
 )
 from ..core.docker_image import DEFAULT_DOCKER_IMAGE, DEFAULT_SCRIPT_DIR
-from .common import add_shared_arguments
+from .common import add_shared_arguments, add_tpu_type_argument, add_tpu_and_device_type_arguments
 from .validators import directory_path_type, name_type
-from ..utils.feature_flags import FeatureFlags
-from ..core.system_characteristics import get_system_characteristics_keys_by_accelerator_type, AcceleratorType, SUB_SLICING_TOPOLOGIES
 def set_workload_parsers(workload_parser: ArgumentParser):
@@ -119,27 +117,7 @@ def set_workload_create_parser(workload_create_parser: ArgumentParser):
           required=True
       )
   )
-  workload_device_group.add_argument(
-      '--tpu-type',
-      type=str,
-      default=None,
-      help='The tpu type to use, v5litepod-16, etc.',
-      metavar='TPU_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(
-          [AcceleratorType.TPU]
-      ),
-  )
-  workload_device_group.add_argument(
-      '--device-type',
-      type=str,
-      default=None,
-      help=(
-          'The device type to use (can be tpu or gpu or cpu), v5litepod-16,'
-          ' h100-80gb-8, n2-standard-32-4 etc.'
-      ),
-      metavar='DEVICE_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(),
-  )
+  add_tpu_and_device_type_arguments(workload_device_group)
   workload_create_parser_optional_arguments.add_argument(
       '--storage',
@@ -287,15 +265,8 @@ def set_workload_create_pathways_parser(
       )
   )
   ### "workload create-pathways" Required arguments, specific to Pathways
-  workload_create_pathways_parser_required_arguments.add_argument(
-      '--tpu-type',
-      type=str,
-      default=None,
-      help='The tpu type to use, v5litepod-16, etc.',
-      metavar='TPU_TYPE',
-      choices=get_system_characteristics_keys_by_accelerator_type(
-          [AcceleratorType.TPU]
-      ),
+  add_tpu_type_argument(
+      workload_create_pathways_parser_required_arguments, required=True
   )
   ### "workload create-pathways" Optional arguments, specific to Pathways
@@ -612,6 +583,16 @@ def add_shared_workload_create_optional_arguments(args_parsers):
             ' `jax-tpu`.'
         ),
     )
+    custom_parser.add_argument(
+        '--output-manifest-file',
+        type=str,
+        default=None,
+        help=(
+            'If you want to see the generated manifest, provide a file path'
+            ' here. This will write the manifest to the file. If used with'
+            ' --dry-run, it will skip the actual deployment and cluster checks.'
+        ),
+    )
     custom_parser.add_argument(
         '--num-slices',
         type=int,
@@ -670,14 +651,6 @@ def add_shared_workload_create_optional_arguments(args_parsers):
             ' the workload.'
         ),
     )
-    if FeatureFlags.SUB_SLICING_ENABLED:
-      custom_parser.add_argument(
-          '--sub-slicing-topology',
-          type=str,
-          help='Sub-slicing topology to use.',
-          required=False,
-          choices=SUB_SLICING_TOPOLOGIES,
-      )
 def add_shared_workload_create_env_arguments(args_parsers):

xpk/parser/workload_test.py CHANGED Viewed

@@ -16,35 +16,9 @@ limitations under the License.
 import argparse
 from xpk.parser.workload import set_workload_create_parser
-from ..utils.feature_flags import FeatureFlags
-import pytest
-@pytest.fixture(autouse=True)
-def with_sub_slicing_enabled():
-  FeatureFlags.SUB_SLICING_ENABLED = True
-def test_workload_create_sub_slicing_topology_is_hidden_with_flag_off():
-  FeatureFlags.SUB_SLICING_ENABLED = False
-  parser = argparse.ArgumentParser()
-  set_workload_create_parser(parser)
-  help_str = parser.format_help()
-  assert "--sub-slicing" not in help_str
-def test_workload_create_sub_slicing_topology_is_shown_with_flag_on():
-  parser = argparse.ArgumentParser()
-  set_workload_create_parser(parser)
-  help_str = parser.format_help()
-  assert "--sub-slicing" in help_str
-def test_workload_create_sub_slicing_topology_is_none_by_default():
+def test_workload_create_parses():
   parser = argparse.ArgumentParser()
   set_workload_create_parser(parser)
@@ -59,24 +33,4 @@ def test_workload_create_sub_slicing_topology_is_none_by_default():
       "tpu7x-2",
   ])
-  assert args.sub_slicing_topology is None
-def test_workload_create_sub_slicing_topology_can_be_set():
-  parser = argparse.ArgumentParser()
-  set_workload_create_parser(parser)
-  args = parser.parse_args([
-      "--cluster",
-      "test-cluster",
-      "--command",
-      "python3",
-      "--workload",
-      "test",
-      "--tpu-type",
-      "tpu7x-8",
-      "--sub-slicing-topology",
-      "2x4",
-  ])
-  assert args.sub_slicing_topology is "2x4"
+  assert args

xpk/templates/arm_gpu_workload_crate.yaml.j2 ADDED Viewed

@@ -0,0 +1,46 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: {{ workload }}
+  labels:
+    kueue.x-k8s.io/queue-name: multislice-queue  # Name of the LocalQueue
+    xpk.google.com/workload: {{ workload }}
+spec:
+  ttlSecondsAfterFinished: {{ ttl_seconds_after_finished }}
+  failurePolicy:
+    {{ failure_policy_rules }}
+    maxRestarts: {{ max_restarts }}
+  replicatedJobs:
+    - name: slice-job
+      replicas: 1
+      template:
+        spec:
+          parallelism: {{ num_nodes }}
+          completions: {{ num_nodes }}
+          backoffLimit: 0   # When any pod fails, the job is failed
+          {{ pod_failure_policy }}
+          template:
+            metadata:
+              labels:
+                xpk.google.com/workload: {{ workload }}
+              annotations:
+                {{ annotations }}
+            spec:
+              priorityClassName: {{ priority }}
+              restartPolicy: Never
+              nodeSelector:
+                {{ placement_policy_label }}
+              imagePullSecrets:
+              - name: {{ docker_image_pull_secret }}
+              dnsPolicy: ClusterFirstWithHostNet
+              terminationGracePeriodSeconds: {{ termination_grace_period_seconds }}
+              serviceAccountName: {{ service_account }}
+              tolerations:
+              - operator: "Exists"
+                key: nvidia.com/gpu
+              - key: "kubernetes.io/arch"
+                operator: "Equal"
+                value: "arm64"
+                effect: "NoSchedule"
+              containers:
+              {{ container }}

xpk/utils/feature_flags.py CHANGED Viewed

@@ -29,6 +29,9 @@ def _get_boolean_flag(flag: str, default: bool) -> bool:
 class _FeatureFlags:
   SUB_SLICING_ENABLED = _get_boolean_flag("SUB_SLICING_ENABLED", default=False)
   TELEMETRY_ENABLED = _get_boolean_flag("TELEMETRY_ENABLED", default=False)
+  SUPER_SLICING_ENABLED = _get_boolean_flag(
+      "SUPER_SLICING_ENABLED", default=False
+  )
 FeatureFlags = _FeatureFlags()

xpk/utils/validation.py CHANGED Viewed

@@ -72,8 +72,8 @@ class SystemDependency(Enum):
 def should_validate_dependencies(args):
-  skip_validation = 'skip_validation' in args and args.skip_validation
-  dry_run = 'dry_run' in args and args.dry_run
+  skip_validation = hasattr(args, 'skip_validation') and args.skip_validation
+  dry_run = hasattr(args, 'dry_run') and args.dry_run
   return not skip_validation and not dry_run

xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl

xpk 0.15.0__py3-none-any.whl → 0.16.0__py3-none-any.whl