xpk-1.0.0-py3-none-any.whl → xpk-1.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/commands/cluster.py +29 -30
- xpk/commands/cluster_gcluster.py +19 -14
- xpk/commands/cluster_test.py +1 -21
- xpk/commands/common.py +39 -6
- xpk/commands/common_test.py +170 -0
- xpk/commands/info.py +9 -5
- xpk/commands/inspector.py +33 -4
- xpk/commands/inspector_test.py +142 -0
- xpk/commands/workload.py +35 -17
- xpk/commands/workload_test.py +70 -3
- xpk/core/blueprint/blueprint_generator.py +19 -8
- xpk/core/blueprint/testing/data/a3_ultra.yaml +3 -1
- xpk/core/blueprint/testing/data/a4.yaml +3 -1
- xpk/core/capacity.py +37 -17
- xpk/core/capacity_test.py +66 -1
- xpk/core/cluster.py +10 -10
- xpk/core/cluster_private.py +3 -3
- xpk/core/cluster_test.py +29 -2
- xpk/core/docker_container.py +55 -30
- xpk/core/docker_manager.py +4 -4
- xpk/core/docker_resources.py +4 -1
- xpk/core/kueue_manager.py +6 -8
- xpk/core/kueue_manager_test.py +4 -5
- xpk/core/nap.py +14 -3
- xpk/core/nodepool.py +46 -13
- xpk/core/nodepool_test.py +143 -8
- xpk/core/pathways.py +4 -8
- xpk/core/remote_state/fuse_remote_state.py +1 -1
- xpk/core/scheduling.py +16 -13
- xpk/core/scheduling_test.py +15 -7
- xpk/core/system_characteristics.py +6 -0
- xpk/core/telemetry.py +11 -1
- xpk/core/telemetry_test.py +39 -0
- xpk/core/testing/commands_tester.py +26 -0
- xpk/core/testing/commands_tester_test.py +20 -1
- xpk/core/workload_decorators/rdma_decorator.py +9 -0
- xpk/parser/cluster.py +11 -1
- xpk/parser/cluster_test.py +59 -1
- xpk/parser/common.py +11 -0
- xpk/parser/storage.py +3 -3
- xpk/utils/console.py +1 -1
- xpk/utils/feature_flags.py +7 -3
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/METADATA +37 -21
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/RECORD +48 -55
- xpk-1.1.1.dist-info/top_level.txt +1 -0
- integration/README.md +0 -19
- integration/__init__.py +0 -15
- integration/docker_manager_test.py +0 -102
- integration/gcluster_a3mega_test.py +0 -215
- integration/gcluster_a3ultra_test.py +0 -187
- integration/gcluster_a4_test.py +0 -187
- integration/gcluster_test.py +0 -107
- xpk/utils/user_input.py +0 -48
- xpk/utils/user_input_test.py +0 -92
- xpk-1.0.0.dist-info/top_level.txt +0 -2
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/WHEEL +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/entry_points.txt +0 -0
- {xpk-1.0.0.dist-info → xpk-1.1.1.dist-info}/licenses/LICENSE +0 -0
xpk/core/cluster.py
CHANGED
@@ -158,7 +158,7 @@ def install_nri_on_cluster() -> int:
 
 
 def get_cluster_nodes_info() -> list[dict]:
-  """Get list of cluster's nodes
+  """Get list of cluster's nodes description in yaml format
 
   Returns:
     List of nodes info yaml objects.
@@ -393,11 +393,13 @@ def project_id_to_project_number(project_id: str) -> str:
 def setup_k8s_env(args) -> k8s_client.ApiClient:
   add_zone_and_project(args)
   get_cluster_credentials(args)
-
-
-
-
-
+  # Use provided project number if available, otherwise fetch via API
+  if getattr(args, 'project_number', None):
+    xpk_print(f'Using provided project number: {args.project_number}')
+  elif args.dry_run:
+    args.project_number = abs(hash(args.project) % (10**12))  # 12 digit hash
+  else:
+    args.project_number = project_id_to_project_number(args.project)
 
   config.load_kube_config()
   return k8s_client.ApiClient()
@@ -716,10 +718,8 @@ def get_cluster_credentials(args) -> int:
       location=location,
       dns_endpoint=True,
   )
-  if return_code != 0:
-    return return_code
 
-  if not _are_credentials_valid():
+  if return_code != 0 or not _are_credentials_valid():
     xpk_print('Detected error. Retrying without --dns-endpoint flag...')
     return_code = _get_credentials(
         project=args.project,
@@ -751,6 +751,6 @@ def _get_credentials(
 def _are_credentials_valid() -> bool:
   kubectl_command = 'kubectl get pods'
   kubectl_return_code = run_command_with_updates(
-      kubectl_command, 'Test kubectl credentials'
+      kubectl_command, 'Test kubectl credentials', verbose=False
   )
   return kubectl_return_code == 0
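
A note on the new dry-run branch in setup_k8s_env: it derives a stand-in project number locally instead of calling the cloud API. Below is a minimal sketch of just that expression, using a hypothetical project id; since CPython salts string hashing per process (PYTHONHASHSEED), the stand-in is stable within one run but not across runs, which is acceptable for a dry run.

# Sketch of the dry-run fallback above; 'example-project' is a hypothetical id.
project = 'example-project'

# In Python, x % (10**12) is already non-negative for any int x, so the abs()
# in the real code is purely defensive; the result has at most 12 digits.
fake_project_number = abs(hash(project) % (10**12))

print(fake_project_number)  # differs between interpreter runs (salted hashing)
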
xpk/core/cluster_private.py
CHANGED
@@ -61,7 +61,7 @@ def authorize_private_cluster_access_if_necessary(args) -> int:
   if new_authorized_networks_needed or not is_current_machine_in_network:
     return update_cluster_new_authorized_networks(args, authorized_networks)
 
-  xpk_print("Current machine's IP
+  xpk_print("Current machine's IP address is already authorized.")
   return 0
 
 
@@ -84,7 +84,7 @@ def add_current_machine_to_networks_if_needed(
       is_current_machine_in_any_network(authorized_networks)
   )
   if is_current_machine_in_network_return_code != 0:
-    xpk_print("Error on checking current machine's IP
+    xpk_print("Error on checking current machine's IP address.")
     return is_current_machine_in_network_return_code, False, authorized_networks
 
   if not is_current_machine_in_network:
@@ -148,7 +148,7 @@ def is_cluster_private(args) -> bool:
 
 
 def get_cluster_authorized_networks(args) -> list[str]:
-  """
+  """Retrieves the networks list that are authorized to have access to Control Plane.
   Args:
     args: user provided arguments for running the command.
 
xpk/core/cluster_test.py
CHANGED
@@ -41,11 +41,15 @@ def command_args(mocker: MockerFixture):
   return mocker.Mock(cluster="cluster", project="project", zone="zone")
 
 
-def
+def test_get_cluster_credentials_returns_1_when_retrieval_commands_fail(
     commands_tester: CommandsTester, command_args
 ):
   commands_tester.set_result_for_command(
-      (1, ""), "gcloud container clusters get-credentials"
+      (1, ""), "gcloud container clusters get-credentials", " --dns-endpoint"
+  )
+  commands_tester.set_result_for_command(
+      (1, ""),
+      "gcloud container clusters get-credentials",
   )
   assert get_cluster_credentials(command_args) == 1
 
@@ -95,6 +99,29 @@ def test_get_cluster_credentials_retries_without_dns_when_dns_retrieval_fails(
   assert len(non_dns_endpoint_commands) == 1
 
 
+def test_get_cluster_credentials_retries_without_dns_when_dns_retrieval_returns_error(
+    commands_tester: CommandsTester, command_args
+):
+  commands_tester.set_result_for_command(
+      (1, ""), "gcloud container clusters get-credentials", "--dns-endpoint"
+  )
+  commands_tester.set_result_for_command(
+      (0, ""),
+      "gcloud container clusters get-credentials",
+  )
+
+  assert get_cluster_credentials(command_args) == 0
+
+  non_dns_endpoint_commands = [
+      c
+      for c in commands_tester.get_matching_commands(
+          "gcloud container clusters get-credentials"
+      )
+      if "dns-endpoint" not in c
+  ]
+  assert len(non_dns_endpoint_commands) == 1
+
+
 def test_update_cluster_with_lustre_driver_if_necessary_with_default_port_runs_correct_checks(
     commands_tester: CommandsTester, command_args
 ):
xpk/core/docker_container.py
CHANGED
@@ -17,9 +17,7 @@ limitations under the License.
 from ..utils.console import xpk_exit, xpk_print
 from .docker_image import setup_docker_image
 from .docker_resources import (
-    add_container_ports,
     add_image_pull_policy_for_pw_or_gpu,
-    add_jax_coordinator_port,
     get_env_container,
     get_main_container_resources,
     get_volume_mounts,
@@ -32,12 +30,18 @@ from .system_characteristics import (
 )
 
 
-def get_main_and_sidecar_container(args, system, docker_image) -> str:
+def get_main_and_sidecar_container(
+    args,
+    system: SystemCharacteristics,
+    docker_image: str,
+    parallel_containers: int,
+) -> str:
   """Generate yaml for main and sidecar container.
   Args:
     args: user provided arguments for running the command.
     system: system characteristics
     docker_image: docker image
+    parallel_containers: number of containers to run per VM.
 
   Returns:
     str:
@@ -46,7 +50,9 @@ def get_main_and_sidecar_container(args, system, docker_image) -> str:
   resource_type = AcceleratorTypeToAcceleratorCharacteristics[
       system.accelerator_type
   ].resource_type
-  main_container = get_main_container(args, system, docker_image, resource_type)
+  main_container = get_main_container(
+      args, system, docker_image, resource_type, parallel_containers
+  )
   yaml = """- name: stacktrace-explorer
   image: busybox:1.28
   args: [/bin/sh, -c, "check_signal() (while [ ! -f /shared-volume/stacktrace_signal ]; do sleep 1; done; pid=$(pidof 'tail'); kill $pid;); check_signal & while [ ! -d /tmp/debugging ]; do sleep 60; done; while [ ! -e /tmp/debugging/* ]; do sleep 60; done; tail -n+1 -f /tmp/debugging/*; exit 0;"]
@@ -61,13 +67,20 @@ def get_main_and_sidecar_container(args, system, docker_image) -> str:
   return yaml.format(main_container=main_container)
 
 
-def get_main_container(args, system, docker_image, resource_type) -> str:
+def get_main_container(
+    args,
+    system: SystemCharacteristics,
+    docker_image: str,
+    resource_type,
+    parallel_containers: int,
+) -> str:
   """Generate yaml for main container including the xpk command.
   Args:
     args: user provided arguments for running the command.
     system: system characteristics
     docker_image: docker image
     resource_type: The label to describe the resource type for TPUs/GPUs/CPUs.
+    parallel_containers: number of containers to run per VM.
 
   Returns:
     str:
@@ -112,13 +125,12 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
         'touch /shared-volume/stacktrace_signal; '
     )
 
-
+  containers = []
+  container_yaml = """
+- name: {docker_name}
   image: {docker_image}
   {image_pull_policy}
   env: {env}
-  ports:
-  {container_ports}
-  {jax_coordinator_port}
   securityContext:
     privileged: true
   command:
@@ -145,37 +157,46 @@ def get_main_container(args, system, docker_image, resource_type) -> str:
       limits:
        {resources}
 """
+  docker_name = get_main_container_docker_image(args, system)
   volume_mounts = get_volume_mounts(args, system)
   if volume_mounts != '':
-
+    container_yaml += """
   volumeMounts:
   {volume_mounts}
 """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  env = get_env_container(args, system)
+  image_pull_policy = add_image_pull_policy_for_pw_or_gpu(args, system)
+  for i in range(parallel_containers):
+    docker_name_sufix = f'-{i + 1}' if parallel_containers > 1 else ''
+    containers.append(
+        container_yaml.format(
+            args=args,
+            system=system,
+            image_pull_policy=image_pull_policy,
+            env=env,
+            docker_name=f'{docker_name}{docker_name_sufix}',
+            docker_image=docker_image,
+            gsutil_test_command=gsutil_test_command,
+            command=command,
+            tpu_stacktrace_terminate_command=tpu_stacktrace_terminate_command,
+            gpu_workload_terminate_command=gpu_workload_terminate_command,
+            xpk_internal_commands=xpk_internal_commands,
+            resources=get_main_container_resources(args, system, resource_type),
+            volume_mounts=volume_mounts,
+        )
+    )
+  return ''.join(containers)
 
 
-def get_user_workload_container(args, system: SystemCharacteristics):
+def get_user_workload_container(
+    args, system: SystemCharacteristics, parallel_containers: int
+):
   """Deploy user workload container
 
   Args:
     args: user provided args.
     system: system characteristics.
+    parallel_containers: number of containers to run per VM.
 
   Returns:
     container: main container
@@ -202,11 +223,15 @@ def get_user_workload_container(args, system: SystemCharacteristics):
         'Sidecar container to display stack traces for TPU workloads will also'
         ' be deployed.'
     )
-    container = get_main_and_sidecar_container(args, system, docker_image)
+    container = get_main_and_sidecar_container(
+        args, system, docker_image, parallel_containers
+    )
     # Get GKE debugging dashboard only when sidecar container is deployed for TPU workloads
     debugging_dashboard_id = get_gke_debugging_dashboard(args)
   else:
-    container = get_main_container(args, system, docker_image, resource_type)
+    container = get_main_container(
+        args, system, docker_image, resource_type, parallel_containers
+    )
   return container, debugging_dashboard_id
 
 
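
A note on the container loop introduced above: the same template is stamped once per requested container, and names get a 1-based suffix only when more than one copy is rendered. Below is a stripped-down sketch of that naming scheme with a toy two-field template; the names and image are hypothetical stand-ins, not xpk's full container template.

def render_containers(docker_name: str, docker_image: str,
                      parallel_containers: int) -> str:
  # Toy template with two fields; the real one also carries env, command,
  # resources, and the other fields shown in the diff.
  container_yaml = '- name: {docker_name}\n  image: {docker_image}\n'
  containers = []
  for i in range(parallel_containers):
    # Single-container workloads keep the bare name; multi-container ones
    # become name-1, name-2, ...
    suffix = f'-{i + 1}' if parallel_containers > 1 else ''
    containers.append(container_yaml.format(
        docker_name=f'{docker_name}{suffix}', docker_image=docker_image))
  return ''.join(containers)

print(render_containers('xpk-main', 'busybox:1.28', 2))
# - name: xpk-main-1
#   image: busybox:1.28
# - name: xpk-main-2
#   image: busybox:1.28
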
xpk/core/docker_manager.py
CHANGED
@@ -44,7 +44,7 @@ class CommandRunner(ABC):
 
   @abstractmethod
   def initialize(self) -> None:
-    """initialize is a method that should implement all steps
+    """initialize is a method that should implement all steps necessary to run command.
 
     Returns:
       None
@@ -95,7 +95,7 @@ class DockerManager(CommandRunner):
   - gcloud_cfg_path (str) : path to directory containing gcloud configuration
   - working_dir (str) : path to directory in which gcluster deployment directory will be saved
   - client (DockerClient) : docker client
-  - nocache (bool) :
+  - nocache (bool) : whether to use docker cache when building image
   - img_name (str) : name of docker image to create
   - container_name (str) : name of the container that will be created from img_name
   - rm_container_after (bool) : if set to True, docker container in which command is executed will be removed after each execution.
@@ -294,12 +294,12 @@ class DockerManager(CommandRunner):
       xpk_print(f"error while building image {self.img_name}: {e.msg}")
       xpk_exit(dockerBuildErrorCode)
     except APIError as e:
-      xpk_print(f"
+      xpk_print(f"error while building image {self.img_name}: {e.explanation}")
       xpk_exit(dockerBuildErrorCode)
     except TypeError as e:
       xpk_print(f"TypeError while building image {self.img_name}: {e.args}")
       xpk_exit(dockerBuildErrorCode)
-    xpk_print("Docker image build
+    xpk_print("Docker image build successfully.")
     os.remove(self.dockerfile_path)
     tmp_dockerfile_dir = "/".join(self.dockerfile_path.split("/")[:-1])
     os.rmdir(tmp_dockerfile_dir)
xpk/core/docker_resources.py
CHANGED
@@ -53,7 +53,10 @@ def get_main_container_resources(
     offset_vCPUs = int(system.chips_per_vm) * 0.95
     return f'{resource_type}: {offset_vCPUs}'
 
-  return
+  return (
+      f'{resource_type}:'
+      f' {int(system.chips_per_vm / system.parallel_containers)}'
+  )
 
 
 def get_env_container(args, system: SystemCharacteristics) -> str:
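
The new return path splits a VM's chips evenly across the parallel containers; int() truncates, so counts that don't divide evenly leave a remainder unrequested. A worked sketch of the arithmetic follows; the resource name is illustrative.

def per_container_limit(resource_type: str, chips_per_vm: int,
                        parallel_containers: int) -> str:
  # Each container requests an equal integer share of the VM's chips.
  return f'{resource_type}: {int(chips_per_vm / parallel_containers)}'

print(per_container_limit('google.com/tpu', 8, 2))  # google.com/tpu: 4
print(per_container_limit('google.com/tpu', 8, 3))  # google.com/tpu: 2
                                                    # (2 chips unrequested)
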
xpk/core/kueue_manager.py
CHANGED
@@ -41,8 +41,8 @@ from ..utils.console import xpk_print, xpk_exit, ask_for_user_consent
 from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
 from packaging.version import Version
 
-KUEUE_VERSION = Version("v0.
-LATEST_BREAKING_VERSION = Version("v0.
+KUEUE_VERSION = Version("v0.15.2")
+LATEST_BREAKING_VERSION = Version("v0.15.0")
 WAIT_FOR_KUEUE_TIMEOUT = "10m"
 CLUSTER_QUEUE_NAME = "cluster-queue"
 LOCAL_QUEUE_NAME = "multislice-queue"
@@ -290,6 +290,7 @@ class KueueManager:
         cpu_limit=cpu_limit,
         memory_limit=memory_limit,
         topology_name=topology_name,
+        configure_super_slicing=kueue_config.configure_super_slicing,
     )
 
     config_yaml = template.render(context)
@@ -316,6 +317,7 @@ class KueueManager:
       cpu_limit: int,
       memory_limit: str,
       topology_name: str | None,
+      configure_super_slicing: bool,
   ) -> Dict[str, Any]:
     """Prepares the context for the Jinja2 template."""
     # Main accelerator flavor
@@ -328,11 +330,7 @@ class KueueManager:
         key, value = accelerator_label.split(":", 1)
         node_labels_dict[key] = value.strip()
 
-    if
-      node_labels_dict["cloud.google.com/gke-tpu-partition-4x4x4-state"] = (
-          "HEALTHY"
-      )
-    elif not autoprovisioning:
+    if not autoprovisioning and not configure_super_slicing:
       machine_label = create_machine_label(system)
       if machine_label:
         key, value = machine_label.split(":", 1)
@@ -383,7 +381,7 @@ class KueueManager:
     })
 
     admission_checks = []
-    if
+    if configure_super_slicing:
       admission_checks.append("ss-kueue-operator")
     if flex and is_queued_cluster(num_slices, system.accelerator_type):
       admission_checks.append("dws-prov")
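
The two bumped constants drive the upgrade flow exercised in the tests below: KUEUE_VERSION is what gets installed, and crossing LATEST_BREAKING_VERSION is what triggers the consent prompt pointing at the matching CHANGELOG. A minimal sketch of that gate follows; the helper name and the exact comparison rule are assumptions inferred from the tests, not xpk's actual code.

from packaging.version import Version

KUEUE_VERSION = Version('v0.15.2')
LATEST_BREAKING_VERSION = Version('v0.15.0')

def upgrade_crosses_breaking_release(installed: Version) -> bool:
  # Preparation is only needed when the installed release predates the
  # breaking boundary that the target release sits on or after.
  return installed < LATEST_BREAKING_VERSION <= KUEUE_VERSION

print(upgrade_crosses_breaking_release(Version('0.14.1')))  # True
print(upgrade_crosses_breaking_release(Version('0.15.0')))  # False
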
xpk/core/kueue_manager_test.py
CHANGED
@@ -113,7 +113,7 @@ def test_install_or_upgrade_when_outdated(
   result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
 
   assert result == 0
-  mock_commands.assert_command_run("kubectl apply", "v0.
+  mock_commands.assert_command_run("kubectl apply", "v0.15.2/manifests.yaml")
   mock_commands.assert_command_run("kubectl apply -f", "/tmp/")
 
 
@@ -126,7 +126,7 @@ def test_install_or_upgrade_when_not_installed(
   result = kueue_manager.install_or_upgrade(KUEUE_CONFIG)
 
   assert result == 0
-  mock_commands.assert_command_run("kubectl apply", "v0.
+  mock_commands.assert_command_run("kubectl apply", "v0.15.2/manifests.yaml")
   mock_commands.assert_command_run("kubectl apply -f", "/tmp/")
 
 
@@ -135,7 +135,7 @@ def test_upgrade_when_no_breaking_changes_between_versions_no_preparation_needed(
     kueue_manager: KueueManager,
     mock_ask_for_user_consent: MagicMock,
 ):
-  set_installed_kueue_version(mock_commands, Version("0.
+  set_installed_kueue_version(mock_commands, Version("0.15.0"))
 
   kueue_manager.install_or_upgrade(KUEUE_CONFIG)
 
@@ -162,7 +162,7 @@ def test_upgrade_with_breaking_changes_between_versions_runs_preparation(
   assert result == 0
   mock_ask_for_user_consent.assert_called_once()
   assert (
-      "CHANGELOG/CHANGELOG-0.
+      "CHANGELOG/CHANGELOG-0.15.md"
       in mock_ask_for_user_consent.mock_calls[0].args[0]
   )
   mock_commands.assert_command_run(
@@ -492,7 +492,6 @@ def test_configure_generates_correct_manifest_with_super_slicing(
   assert resource_flavor["spec"]["topologyName"] == "super-slice-topology"
   assert resource_flavor["spec"]["nodeLabels"] == {
       "cloud.google.com/gke-tpu-accelerator": "tpu7x",
-      "cloud.google.com/gke-tpu-partition-4x4x4-state": "HEALTHY",
   }
   topology = _first(doc for doc in manifest_docs if doc["kind"] == "Topology")
   assert topology["metadata"]["name"] == "super-slice-topology"
xpk/core/nap.py
CHANGED
@@ -24,7 +24,8 @@ from .capacity import (
     CapacityType,
     get_capacity_node_selectors_from_capacity_type,
     get_capacity_type,
-
+    get_reservations_list,
+    verify_reservations_exist,
 )
 from .commands import run_command_with_updates, run_commands
 from .gcloud_context import get_cluster_location
@@ -345,14 +346,24 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
   )
   if return_code != 0:
     return node_selector_args, return_code
-  return_code =
+  return_code = verify_reservations_exist(args)
   if return_code > 0:
     xpk_print('Unable to verify reservation name saved in config map.')
     return node_selector_args, return_code
 
   # Check if reservation id is valid. Shared function with cluster creation.
+  reservation_name = None
+  if capacity_type_str == CapacityType.RESERVATION.name:
+    reservations = get_reservations_list(args)
+    if len(reservations) > 1:
+      xpk_print('Error: NAP based clusters only support a single reservation.')
+      return node_selector_args, 1
+    reservation_name = reservations[0] if len(reservations) > 0 else None
+
   node_selector_args, return_code = (
-      get_capacity_node_selectors_from_capacity_type(
+      get_capacity_node_selectors_from_capacity_type(
+          capacity_type_str, reservation_name
+      )
   )
   if return_code != 0:
     xpk_print('Unable to get node selectors from capacity type.')
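
The NAP branch above enforces a stricter rule than plain node pool creation: at most one reservation. A tiny sketch of the selection logic follows; the reservation names are hypothetical, and the real code returns an error tuple instead of raising.

def pick_nap_reservation(reservations: list[str]) -> str | None:
  # More than one reservation is rejected; exactly one is used; none leaves
  # the node selector unset.
  if len(reservations) > 1:
    raise ValueError('NAP based clusters only support a single reservation.')
  return reservations[0] if reservations else None

print(pick_nap_reservation(['res-a']))  # res-a
print(pick_nap_reservation([]))         # None
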
xpk/core/nodepool.py
CHANGED
@@ -14,7 +14,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-from typing import List
+from typing import Iterator, List
+from itertools import cycle
 
 from ..utils.feature_flags import FeatureFlags
 from ..utils.console import ask_for_user_consent, xpk_print
@@ -25,6 +26,7 @@ from .capacity import (
     CapacityType,
     get_capacity_arguments_from_capacity_type,
     get_capacity_type,
+    get_reservations_list,
     print_reservations,
 )
 from .commands import run_command_for_value, run_commands, FailedCommand
@@ -85,12 +87,6 @@ def run_gke_node_pool_create_command(
     max_nodes = system.vms_per_slice
   else:
     max_nodes = 1000
-  capacity_args, return_code = get_capacity_arguments_from_capacity_type(
-      args, capacity_type, max_nodes, system.accelerator_type
-  )
-  if return_code > 0:
-    xpk_print('Parsing capacity arguments failed!')
-    return return_code
 
   desired_node_pool_count = (
       1 if system.accelerator_type == AcceleratorType.GPU else args.num_slices
@@ -274,9 +270,34 @@ def run_gke_node_pool_create_command(
 
   create_commands = []
   create_task_names = []
-
-
-
+  node_pools_to_create = [
+      np for np in desired_node_pool_names if np not in node_pools_to_remain
+  ]
+
+  reservations_iter: Iterator[str] | None = None
+  if capacity_type == CapacityType.RESERVATION:
+    reservations = get_reservations_list(args)
+    if (
+        _validate_reservation_count(reservations, len(node_pools_to_create))
+        != 0
+    ):
+      return 1
+    reservations_iter = (
+        cycle(reservations) if len(reservations) == 1 else iter(reservations)
+    )
+
+  for node_pool_name in node_pools_to_create:
+    capacity_args, return_code = get_capacity_arguments_from_capacity_type(
+        args,
+        capacity_type,
+        max_nodes,
+        system.accelerator_type,
+        reservation_name=next(reservations_iter) if reservations_iter else None,
+    )
+    if return_code > 0:
+      xpk_print('Parsing capacity arguments failed!')
+      return return_code
+
     command = (
         'gcloud beta container node-pools create'
         f' {node_pool_name}'
@@ -632,7 +653,7 @@ def ensure_resource_policy_exists(
 ) -> None:
   return_code, _ = run_command_for_value(
       (
-          'gcloud compute resource-policies describe'
+          'gcloud beta compute resource-policies describe'
          f' {resource_policy_name}'
          f' --project={project}'
          f' --region={zone_to_region(zone)}'
@@ -643,13 +664,12 @@ def ensure_resource_policy_exists(
   if return_code == 0:
     return
 
-  # TODO: b/465696970 - Verify the flag below before launching SUPER_SLICING:
   accelerator_topology_mode = (
       ' --accelerator-topology-mode=PROVISION_ONLY' if super_slicing else ''
   )
   return_code, _ = run_command_for_value(
       (
-          'gcloud compute resource-policies create workload-policy'
+          'gcloud beta compute resource-policies create workload-policy'
          f' {resource_policy_name} --project={project} --region={zone_to_region(zone)} --type=HIGH_THROUGHPUT'
          f' --accelerator-topology={topology}{accelerator_topology_mode}'
       ),
@@ -658,3 +678,16 @@ def ensure_resource_policy_exists(
 
   if return_code != 0:
     raise RuntimeError('Unable to create resource policy')
+
+
+def _validate_reservation_count(
+    reservations: List[str], num_node_pools_to_create: int
+) -> int:
+  """Validate that reservation count matches new nodepool count or is 1."""
+  if len(reservations) > 1 and len(reservations) != num_node_pools_to_create:
+    xpk_print(
+        f'Error: Number of reservations ({len(reservations)}) must match'
+        f' the number of NEW nodepools ({num_node_pools_to_create}) or be 1.'
+    )
+    return 1
+  return 0