PyPI - xpk - Versions diffs - 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl - Mend

xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86)

integration/__init__.py +15 -0
integration/docker_manager_test.py +102 -0
integration/gcluster_a3mega_test.py +204 -0
integration/gcluster_a3ultra_test.py +176 -0
integration/gcluster_a4_test.py +176 -0
integration/gcluster_test.py +107 -0
xpk/commands/batch.py +9 -2
xpk/commands/cluster.py +143 -117
xpk/commands/cluster_gcluster.py +81 -14
xpk/commands/cluster_gcluster_test.py +177 -0
xpk/commands/cluster_test.py +92 -0
xpk/commands/common.py +14 -26
xpk/commands/info.py +11 -9
xpk/commands/inspector.py +21 -10
xpk/commands/job.py +25 -9
xpk/commands/kind.py +39 -40
xpk/commands/kjob_common.py +4 -4
xpk/commands/run.py +9 -2
xpk/commands/shell.py +13 -10
xpk/commands/storage.py +21 -0
xpk/commands/version.py +0 -4
xpk/commands/workload.py +84 -29
xpk/commands/workload_test.py +81 -0
xpk/core/blueprint/blueprint_generator.py +4 -40
xpk/core/blueprint/blueprint_test.py +0 -6
xpk/core/blueprint/testing/__init__.py +15 -0
xpk/core/capacity.py +6 -5
xpk/core/cluster.py +91 -194
xpk/core/cluster_private.py +6 -11
xpk/core/commands.py +11 -18
xpk/core/config.py +1 -1
xpk/core/docker_image.py +3 -4
xpk/core/gcloud_context.py +26 -2
xpk/core/gcloud_context_test.py +96 -0
xpk/core/gcluster_manager.py +0 -3
xpk/core/jobset.py +4 -7
xpk/core/kjob.py +14 -27
xpk/core/kueue_manager.py +423 -0
xpk/core/kueue_manager_test.py +574 -0
xpk/core/monitoring.py +1 -1
xpk/core/nap.py +10 -15
xpk/core/network.py +17 -18
xpk/core/nodepool.py +66 -77
xpk/core/nodepool_test.py +198 -1
xpk/core/pathways.py +5 -5
xpk/core/ray.py +10 -14
xpk/core/resources.py +6 -11
xpk/core/scheduling.py +19 -1
xpk/core/scheduling_test.py +31 -0
xpk/core/system_characteristics.py +350 -232
xpk/core/system_characteristics_test.py +73 -0
xpk/core/vertex.py +1 -1
xpk/core/workload.py +7 -8
xpk/main.py +2 -4
xpk/parser/cluster.py +7 -0
xpk/parser/cluster_test.py +66 -0
xpk/parser/common.py +11 -0
xpk/parser/workload.py +62 -25
xpk/parser/workload_test.py +82 -0
xpk/templates/cluster_preheat.yaml.j2 +31 -0
xpk/templates/filestore-pv.yaml +17 -0
xpk/templates/filestore-pvc.yaml +11 -0
xpk/templates/filestore-sc.yaml +10 -0
xpk/templates/fuse-pv.yaml +17 -0
xpk/templates/fuse-pvc.yaml +13 -0
xpk/templates/kueue_config.yaml.j2 +95 -0
xpk/templates/kueue_gke_default_topology.yaml.j2 +10 -0
xpk/templates/kueue_sub_slicing_topology.yaml.j2 +14 -0
xpk/templates/mtc-cpc.yaml +15 -0
xpk/templates/volume_bundle.yaml +7 -0
xpk/utils/feature_flags.py +28 -0
xpk/utils/kueue.py +20 -0
xpk/utils/templates.py +15 -0
xpk/utils/topology.py +46 -0
xpk/utils/topology_test.py +63 -0
xpk/utils/validation.py +79 -55
xpk/utils/validation_test.py +37 -0
{xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/METADATA +6 -1
xpk-0.14.1.dist-info/RECORD +133 -0
xpk-0.14.1.dist-info/top_level.txt +2 -0
xpk/core/kueue.py +0 -561
xpk-0.13.0.dist-info/RECORD +0 -101
xpk-0.13.0.dist-info/top_level.txt +0 -1
{xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/WHEEL +0 -0
{xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/entry_points.txt +0 -0
{xpk-0.13.0.dist-info → xpk-0.14.1.dist-info}/licenses/LICENSE +0 -0

xpk/commands/workload.py CHANGED Viewed

@@ -34,7 +34,7 @@ from ..core.docker_container import (
 )
 from ..core.docker_resources import get_volumes, parse_env_config
 from ..core.gcloud_context import add_zone_and_project
-from ..core.kueue import LOCAL_QUEUE_NAME
+from ..core.kueue_manager import LOCAL_QUEUE_NAME
 from ..core.monitoring import get_gke_outlier_dashboard
 from ..core.nap import (
     get_autoprovisioning_node_selector_args,
@@ -52,10 +52,7 @@ from ..core.pathways import (
     get_user_workload_for_pathways,
     try_to_delete_pathwaysjob_first,
 )
-from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics
-from ..core.capacity import (
-    CapacityType,
-)
+from ..core.resources import get_cluster_capacity_type, get_cluster_system_characteristics, SystemCharacteristics
 from ..core.resources import CLUSTER_METADATA_CONFIGMAP, get_cluster_configmap
 from ..core.scheduling import (
     check_if_workload_can_schedule,
@@ -65,6 +62,7 @@ from ..core.scheduling import (
     create_tpu_topology,
     get_cpu_affinity,
     get_gpu_scheduler,
+    create_sub_slicing_annotations,
 )
 from ..core.storage import (
     GCE_PD_TYPE,
@@ -80,6 +78,7 @@ from ..core.storage import (
 from ..core.system_characteristics import (
     AcceleratorType,
     get_system_characteristics,
+    compute_vms_per_slice,
 )
 from ..core.vertex import create_vertex_experiment
 from ..core.workload import (
@@ -87,7 +86,7 @@ from ..core.workload import (
     get_jobsets_list_gcp_link,
     get_workload_list,
     wait_for_job_completion,
-    zone_to_region,
+    get_cluster_location,
 )
 from ..core.workload_decorators import (
     rdma_decorator,
@@ -98,8 +97,11 @@ from ..core.workload_decorators import (
 from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from ..utils.execution_context import is_dry_run
+from ..utils.validation import validate_dependencies_list, SystemDependency, should_validate_dependencies
 from . import cluster_gcluster
-from .common import is_TAS_possible
+from .common import is_TAS_possible, validate_sub_slicing_system
+from ..utils.topology import is_topology_contained
+from ..utils.feature_flags import FeatureFlags
 WORKLOAD_CREATE_YAML = """apiVersion: jobset.x-k8s.io/v1alpha2
 kind: JobSet
@@ -120,8 +122,8 @@ spec:
       replicas: {args.num_slices}
       template:
         spec:
-          parallelism: {system.vms_per_slice}    # Equal to the number of VMs per slice
-          completions: {system.vms_per_slice}    # Same as the above.
+          parallelism: {vms_per_slice}    # Equal to the number of VMs per slice (or sub-slice).
+          completions: {vms_per_slice}    # Same as the above.
           backoffLimit: 0   # When any pod fails, the job is failed
           {pod_failure_policy}
           template:
@@ -130,6 +132,7 @@ spec:
                 xpk.google.com/workload: {args.workload}
               annotations:
                 {storage_annotations}
+                {sub_slicing_annotations}
             spec:
               schedulerName: {args.scheduler}
               imagePullSecrets:
@@ -267,6 +270,8 @@ PW_WORKLOAD_CREATE_YAML = """
         maxSliceRestarts: {args.max_slice_restarts}
         terminationGracePeriodSeconds: {args.termination_grace_period_seconds}
         priorityClassName: {args.priority}
+        nodeSelector:
+          {autoprovisioning_args}
       pathwaysDir: {args.pathways_gcs_location} #This bucket needs to be created in advance.
       controller:
         # #Pod template for training, default mode.
@@ -277,6 +282,8 @@ PW_WORKLOAD_CREATE_YAML = """
       {user_workload}
 """
+SUB_SLICING_TOPOLOGIES = ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
 def workload_create_pathways(args) -> None:
   """Run jobset apply command for a file, specifically for Pathways.
@@ -307,6 +314,12 @@ def workload_create(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list([
+        SystemDependency.KUBECTL,
+        SystemDependency.GCLOUD,
+        SystemDependency.DOCKER,
+    ])
   k8s_api_client = None
   if not is_dry_run():
     k8s_api_client = setup_k8s_env(args)
@@ -321,20 +334,21 @@ def workload_create(args) -> None:
     )
     xpk_exit(1)
-  xpk_print('Starting workload create', flush=True)
   system, return_code = get_system_characteristics(args)
   if return_code > 0 or system is None:
     xpk_print('Fetching system characteristics failed!')
     xpk_exit(return_code)
+  if FeatureFlags.SUB_SLICING_ENABLED and args.sub_slicing_topology is not None:
+    _validate_sub_slicing_topology(system, args.sub_slicing_topology)
   if not check_if_workload_can_schedule(args, system):
     xpk_exit(1)
   xpk_print('Starting workload create', flush=True)
   metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
-  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
+  cluster_config_map = get_cluster_configmap(metadata_configmap_name)
   cluster_xpk_version = None
   if cluster_config_map is None:
     xpk_print(
@@ -482,16 +496,12 @@ def workload_create(args) -> None:
     capacity_type = get_cluster_capacity_type(args)
     annotations = (
-        ''
-        if not is_TAS_possible(
-            system_characteristics,
-            capacity_type,
-            flex=True if capacity_type == CapacityType.FLEX_START else False,
-        )
-        else (
+        (
             'kueue.x-k8s.io/podset-preferred-topology:'
             ' "cloud.google.com/gce-topology-host"'
         )
+        if is_TAS_possible(system_characteristics, capacity_type)
+        else ''
     )
     if (
@@ -507,7 +517,7 @@ def workload_create(args) -> None:
           annotations=annotations,
       )
-      sub_networks = get_cluster_subnetworks(args)
+      sub_networks = get_cluster_subnetworks()
       if args.device_type == a3high_device_type:
         yml_string = tcpx_decorator.decorate_jobset(yml_string)
       elif args.device_type == a3mega_device_type:
@@ -545,6 +555,7 @@ def workload_create(args) -> None:
         colocated_python_sidecar=append_custom_colocated_python_sidecar(args),
         user_workload=get_user_workload_for_pathways(args, system),
         local_queue_name=LOCAL_QUEUE_NAME,
+        autoprovisioning_args=autoprovisioning_args,
     )
   else:
     container, debugging_dashboard_id = get_user_workload_container(
@@ -552,12 +563,26 @@ def workload_create(args) -> None:
     )
     yml_string = WORKLOAD_CREATE_YAML.format(
         args=args,
-        system=system,
         container=container,
+        vms_per_slice=(
+            compute_vms_per_slice(args.sub_slicing_topology)
+            if system.accelerator_type == AcceleratorType['TPU']
+            and FeatureFlags.SUB_SLICING_ENABLED
+            and args.sub_slicing_topology is not None
+            else system.vms_per_slice
+        ),
         affinity=get_cpu_affinity(system.accelerator_type),
         accelerator_label=create_accelerator_label(
             system.accelerator_type, system
         ),
+        sub_slicing_annotations=(
+            ''
+            if not FeatureFlags.SUB_SLICING_ENABLED
+            or args.sub_slicing_topology is None
+            else ('\n' + (' ' * 16)).join(
+                create_sub_slicing_annotations(args.sub_slicing_topology)
+            )
+        ),
         machine_label=create_machine_label(system.accelerator_type, system),
         local_queue_name=LOCAL_QUEUE_NAME,
         autoprovisioning_args=autoprovisioning_args,
@@ -575,7 +600,7 @@ def workload_create(args) -> None:
     )
   tmp = write_tmp_file(yml_string)
   command = f'kubectl apply -f {str(tmp)}'
-  return_code = run_command_with_updates(command, 'Creating Workload', args)
+  return_code = run_command_with_updates(command, 'Creating Workload')
   if return_code != 0:
     xpk_print(f'Create Workload request returned ERROR {return_code}')
@@ -622,7 +647,9 @@ def workload_create(args) -> None:
           ' JAX_PLATFORMS=proxy; JAX_BACKEND_TARGET=grpc://127.0.0.1:29000;'
           " python -c 'import pathwaysutils; import jax; print(jax.devices())'"
       )
-      pathways_proxy_link = f'https://console.cloud.google.com/kubernetes/job/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
+      pathways_proxy_link = (
+          f'https://console.cloud.google.com/kubernetes/job/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}-proxy-0/details?project={args.project}'
+      )
       xpk_print(
           'Follow the proxy here:'
           # pylint: disable=line-too-long)
@@ -636,7 +663,7 @@ def workload_create(args) -> None:
     xpk_print(
         'Follow your workload here:'
         # pylint: disable=line-too-long
-        f' https://console.cloud.google.com/kubernetes/service/{zone_to_region(args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
+        f' https://console.cloud.google.com/kubernetes/service/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/default/{args.workload}/details?project={args.project}'
     )
     duration_of_logs = 'P1D'  # Past 1 Day
     xpk_print(
@@ -645,12 +672,35 @@ def workload_create(args) -> None:
         ' ([prefix]-slice-job-[slice_number]-[worker_number])'
         ' after clicking the url if you want other worker logs.'
         # pylint: disable=line-too-long
-        f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{zone_to_region(args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
+        f' https://console.cloud.google.com/logs/query;query=resource.type%3D%22k8s_container%22%0Aresource.labels.project_id%3D%22{args.project}%22%0Aresource.labels.location%3D%22{get_cluster_location(args.project, args.cluster, args.zone)}%22%0Aresource.labels.cluster_name%3D%22{args.cluster}%22%0Aresource.labels.namespace_name%3D%22default%22%0Aresource.labels.pod_name:%22{args.workload}-slice-job-0-0-%22%20severity%3E%3DDEFAULT;storageScope=project;duration={duration_of_logs}?e=13802955&mods=allow_workbench_image_override&project={args.project}'
     )
   xpk_exit(0)
+def _validate_sub_slicing_topology(
+    system_characteristics: SystemCharacteristics, sub_slicing_topology: str
+) -> None:
+  if sub_slicing_topology not in SUB_SLICING_TOPOLOGIES:
+    xpk_print(
+        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is'
+        f' invalid. It has to be one of: {", ".join(SUB_SLICING_TOPOLOGIES)}.'
+    )
+    xpk_exit(1)
+  if not is_topology_contained(
+      contained=sub_slicing_topology, container=system_characteristics.topology
+  ):
+    xpk_print(
+        f'Error: --sub-slicing-topology={sub_slicing_topology} shape is too'
+        ' large. The shape cannot be bigger than'
+        f' {system_characteristics.topology}.'
+    )
+    xpk_exit(1)
+  validate_sub_slicing_system(system_characteristics)
 def get_restart_exit_codes(args) -> list:
   exit_codes = [42]
   exit_codes.extend(range(127, 256, 1))
@@ -678,6 +728,10 @@ def workload_delete(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print('Starting Workload delete', flush=True)
   add_zone_and_project(args)
   get_cluster_credentials(args)
@@ -725,16 +779,13 @@ def workload_delete(args) -> None:
     # Not batching deletion for single workload
     if len(workloads) == 1:
-      return_code = run_command_with_updates(
-          commands[0], 'Delete Workload', args
-      )
+      return_code = run_command_with_updates(commands[0], 'Delete Workload')
     else:
       return_code = run_commands(
           commands,
           'Delete Workload',
           task_names,
           batch=100,
-          dry_run=args.dry_run,
       )
     if return_code != 0:
@@ -752,6 +803,10 @@ def workload_list(args) -> None:
   Returns:
     0 if successful and 1 otherwise.
   """
+  if should_validate_dependencies(args):
+    validate_dependencies_list(
+        [SystemDependency.KUBECTL, SystemDependency.GCLOUD]
+    )
   xpk_print('Starting workload list', flush=True)
   add_zone_and_project(args)
   get_cluster_credentials(args)

xpk/commands/workload_test.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""
+Copyright 2025 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import dataclasses
+from unittest.mock import MagicMock, patch
+import pytest
+from ..core.system_characteristics import SystemCharacteristics
+from .workload import _validate_sub_slicing_topology
+SYSTEM_CHARACTERISTICS = SystemCharacteristics(
+    topology='8x8',
+    vms_per_slice=1,
+    gke_accelerator='nvidia-l4',
+    gce_machine_type='g2-standard-12',
+    chips_per_vm=1,
+    accelerator_type=1,
+    device_type='l4-1',
+    supports_sub_slicing=True,
+    requires_workload_policy=False,
+)
+@pytest.fixture(autouse=True)
+def xpk_print(mocker):
+  return mocker.patch('xpk.commands.workload.xpk_print')
+def test_validate_sub_slicing_topology_exits_for_unsupported_topology(
+    xpk_print,
+):
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '2x1')
+  assert (
+      'shape is invalid. It has to be one of' in xpk_print.mock_calls[0].args[0]
+  )
+def test_validate_sub_slicing_topology_exits_for_too_large_topology(xpk_print):
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '16x16')
+  assert (
+      'shape is too large. The shape cannot be'
+      in xpk_print.mock_calls[0].args[0]
+  )
+def test_validate_sub_slicing_topology_does_nothing_for_supported_topology():
+  _validate_sub_slicing_topology(SYSTEM_CHARACTERISTICS, '4x4')
+@patch('xpk.commands.common.xpk_print')
+def test_validate_sub_slicing_topology_fails_for_unsupported_system(
+    common_xpk_print: MagicMock,
+):
+  unsupported_system = dataclasses.replace(
+      SYSTEM_CHARACTERISTICS, supports_sub_slicing=False
+  )
+  with pytest.raises(SystemExit):
+    _validate_sub_slicing_topology(unsupported_system, '4x4')
+  assert (
+      'l4-1 does not support Sub-slicing.'
+      in common_xpk_print.mock_calls[0].args[0]
+  )

xpk/core/blueprint/blueprint_generator.py CHANGED Viewed

@@ -32,7 +32,6 @@ from ..capacity import (
 )
 from ..system_characteristics import get_system_characteristics_by_device_type
 from .blueprint_definitions import Blueprint, DeploymentGroup, DeploymentModule
-from ..kueue import KUEUE_VERSION
 yaml_parser = yaml.YAML()
@@ -53,6 +52,7 @@ blueprint_dependencies_dir = {
 cluster_toolkit_url = "github.com/GoogleCloudPlatform/cluster-toolkit"
 cluster_toolkit_version = "v1.62.2"
+common_cluster_labels = {"gke_product_type": "xpk"}
 class BlueprintGeneratorOutput:
@@ -216,26 +216,11 @@ class BlueprintGenerator:
       a3_megagpu_pool_0.settings.update({"static_node_count": num_nodes})
     set_placement_policy = capacity_type != CapacityType.SPOT
-    num_chips = num_nodes * system.chips_per_vm
     workload = DeploymentModule(
         id="workload_component_install",
         source="modules/management/kubectl-apply",
         use=["gke_cluster"],
         settings={
-            "kueue": {
-                "install": True,
-                "version": KUEUE_VERSION,  # TAS feature-gates is enabled in CT
-                "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {
-                    "num_chips": num_chips,
-                    "reservation": (
-                        1 if capacity_type == CapacityType.RESERVATION else 0
-                    ),
-                    "flex_start": (
-                        1 if capacity_type == CapacityType.FLEX_START else 0
-                    ),
-                },
-            },
             "jobset": {"install": True, "version": "v0.7.2"},
             "apply_manifests": [{
                 "source": f'$(ghpc_stage("{blueprint_name}"))/storage_crd.yaml'
@@ -298,6 +283,7 @@ class BlueprintGenerator:
             "deployment_name": blueprint_name,
             "region": region,
             "zone": zone,
+            "labels": common_cluster_labels,
         },
     )
@@ -598,24 +584,12 @@ class BlueprintGenerator:
     else:
       gpu_pool.settings.update({"static_node_count": num_nodes})
-    num_chips = num_nodes * system.chips_per_vm
     workload_manager_install_id = "workload-manager-install"
     workload_manager_install = DeploymentModule(
         id=workload_manager_install_id,
         source="modules/management/kubectl-apply",
         use=[cluster_id],
         settings={
-            "kueue": {
-                "install": True,
-                "version": KUEUE_VERSION,  # TAS feature-gates is enabled in CT
-                "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {
-                    "num_chips": num_chips,
-                    "flex_start": (
-                        1 if capacity_type == CapacityType.FLEX_START else 0
-                    ),
-                },
-            },
             "jobset": {"install": True, "version": "v0.7.2"},
             "apply_manifests": [
                 {"source": nccl_installer_path},
@@ -676,6 +650,7 @@ class BlueprintGenerator:
             "deployment_name": blueprint_name,
             "region": region,
             "zone": zone,
+            "labels": common_cluster_labels,
         },
     )
@@ -884,24 +859,12 @@ class BlueprintGenerator:
     else:
       gpu_pool.settings.update({"static_node_count": num_nodes})
-    num_chips = num_nodes * system.chips_per_vm
     workload_manager_install_id = "workload-manager-install"
     workload_manager_install = DeploymentModule(
         id=workload_manager_install_id,
         source="modules/management/kubectl-apply",
         use=[cluster_id],
         settings={
-            "kueue": {
-                "install": True,
-                "version": KUEUE_VERSION,  # TAS feature-gates is enabled in CT
-                "config_path": f'$(ghpc_stage("{blueprint_name}"))/kueue-xpk-configuration.yaml.tftpl',
-                "config_template_vars": {
-                    "num_chips": num_chips,
-                    "flex_start": (
-                        1 if capacity_type == CapacityType.FLEX_START else 0
-                    ),
-                },
-            },
             "jobset": {"install": True, "version": "v0.7.2"},
             "apply_manifests": [
                 {"source": nccl_installer_path},
@@ -962,6 +925,7 @@ class BlueprintGenerator:
             "deployment_name": blueprint_name,
             "region": region,
             "zone": zone,
+            "labels": common_cluster_labels,
         },
     )

xpk/core/blueprint/blueprint_test.py CHANGED Viewed

@@ -32,7 +32,6 @@ a3_spot_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_mega_spot.yaml"
 a3_ultra_yaml_test_path = "src/xpk/core/blueprint/testing/data/a3_ultra.yaml"
 a4_yaml_test_path = "src/xpk/core/blueprint/testing/data/a4.yaml"
 config_map_filename = "config-map.yaml.tftpl"
-kueue_conf_filename = "kueue-xpk-configuration.yaml.tftpl"
 tmp_test_dir = "/tmp/xpk_test"
@@ -82,11 +81,6 @@ def test_generate_a3_mega_blueprint():
               tmp_test_dir, "prefix", blueprint_name, config_map_filename
           )
       )
-      assert os.path.exists(
-          os.path.join(
-              tmp_test_dir, "prefix", blueprint_name, kueue_conf_filename
-          )
-      )
   shutil.rmtree(tmp_test_dir)

xpk/core/blueprint/testing/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""
+Copyright 2024 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""

xpk/core/capacity.py CHANGED Viewed

@@ -17,6 +17,7 @@ limitations under the License.
 import enum
 from ..utils.console import xpk_print, xpk_exit
+from ..utils.kueue import is_queued_cluster
 from .commands import run_command_with_updates, run_command_for_value
 AUTOPROVISIONING_CONFIG_VALUE = 'AUTOPROVISION'
@@ -50,7 +51,7 @@ def print_reservations(args) -> int:
   """
   command = f'gcloud beta compute reservations list --project={args.project}'
   return_code = run_command_with_updates(
-      command, 'Get all reservations in the project', args
+      command, 'Get all reservations in the project'
   )
   if return_code != 0:
     xpk_print(f'Get all reservations returned ERROR {return_code}')
@@ -119,7 +120,7 @@ def get_reservation_maintenance_interval(
       f' --project={project} --zone={zone} --format="value(specificReservation.instanceProperties.maintenanceInterval)"'
   )
   return_code, output = run_command_for_value(
-      command, 'Get reservation maintenance interval', None
+      command, 'Get reservation maintenance interval'
   )
   if return_code != 0:
     xpk_print(f'Get reservation maintenance interval ERROR {return_code}')
@@ -143,7 +144,7 @@ def get_reservation_placement_policy(
       f' --project={project} --zone={zone} --format="value(resourcePolicies.policy)"'
   )
   return_code, output = run_command_for_value(
-      command, 'Get reservation placement policy', None
+      command, 'Get reservation placement policy'
   )
   if return_code != 0:
     xpk_print(f'Get reservation placement policy ERROR {return_code}')
@@ -164,7 +165,7 @@ def verify_reservation_exists(args) -> int:
       f'gcloud beta compute reservations describe {args.reservation}'
       f' --project={args.project} --zone={args.zone}'
   )
-  return_code = run_command_with_updates(command, 'Describe reservation', args)
+  return_code = run_command_with_updates(command, 'Describe reservation')
   if return_code != 0:
     xpk_print(f'Describe reservation returned ERROR {return_code}')
     xpk_print('Please confirm that your reservation name is correct.')
@@ -199,7 +200,7 @@ def get_capacity_arguments_from_capacity_type(
           ' --location-policy=ANY --reservation-affinity=none'
           f' --no-enable-autorepair --max-nodes={max_nodes}'
       )
-      if args.num_slices <= 1:
+      if is_queued_cluster(args.num_slices):
         capacity_args += ' --enable-queued-provisioning'
     case CapacityType.RESERVATION:
       capacity_args = (

xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

xpk 0.13.0__py3-none-any.whl → 0.14.1__py3-none-any.whl