xpk 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
- xpk/commands/batch.py +8 -8
- xpk/commands/cluster.py +19 -19
- xpk/commands/cluster_gcluster.py +2 -1
- xpk/commands/common.py +7 -3
- xpk/commands/info.py +12 -12
- xpk/commands/inspector.py +1 -1
- xpk/commands/job.py +42 -12
- xpk/commands/kjob_common.py +2 -1
- xpk/commands/storage.py +6 -3
- xpk/commands/workload.py +28 -15
- xpk/core/blueprint/blueprint_generator.py +7 -7
- xpk/core/blueprint/blueprint_test.py +218 -0
- xpk/core/capacity.py +3 -1
- xpk/core/cluster.py +14 -8
- xpk/core/cluster_private.py +8 -2
- xpk/core/commands.py +13 -10
- xpk/core/config.py +3 -4
- xpk/core/config_test.py +71 -0
- xpk/core/docker_image.py +14 -5
- xpk/core/docker_manager.py +1 -1
- xpk/core/docker_resources.py +10 -5
- xpk/core/filestore.py +7 -2
- xpk/core/gcloud_context.py +2 -2
- xpk/core/jobset.py +1 -1
- xpk/core/kjob.py +7 -3
- xpk/core/kueue.py +28 -8
- xpk/core/nap.py +5 -5
- xpk/core/network.py +1 -1
- xpk/core/nodepool.py +8 -3
- xpk/core/nodepool_test.py +82 -0
- xpk/core/pathways.py +6 -2
- xpk/core/ray.py +1 -1
- xpk/core/resources.py +18 -14
- xpk/core/scheduling.py +4 -0
- xpk/core/storage.py +14 -14
- xpk/core/system_characteristics.py +1 -1
- xpk/core/workload.py +11 -0
- xpk/core/workload_decorators/rdma_decorator.py +3 -2
- xpk/core/workload_decorators/storage_decorator.py +2 -1
- xpk/core/workload_decorators/tcpx_decorator.py +4 -2
- xpk/core/workload_decorators/tcpx_decorator_test.py +267 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +2 -1
- xpk/core/workload_test.py +28 -0
- xpk/main.py +12 -10
- xpk/parser/cluster.py +110 -49
- xpk/parser/common.py +45 -36
- xpk/parser/storage.py +12 -13
- xpk/parser/workload.py +57 -39
- xpk/utils/console.py +2 -1
- xpk/utils/execution_context.py +28 -0
- xpk/utils/file.py +25 -10
- xpk/utils/network.py +4 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/METADATA +4 -1
- xpk-0.13.0.dist-info/RECORD +101 -0
- xpk-0.11.0.dist-info/RECORD +0 -95
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/WHEEL +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/entry_points.txt +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/licenses/LICENSE +0 -0
- {xpk-0.11.0.dist-info → xpk-0.13.0.dist-info}/top_level.txt +0 -0
xpk/core/kjob.py
CHANGED
@@ -23,6 +23,7 @@ from kubernetes.client import ApiClient
 from kubernetes.client.rest import ApiException

 from ..utils import templates
+from ..utils.execution_context import is_dry_run
 from ..utils.console import xpk_exit, xpk_print
 from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .cluster import DEFAULT_NAMESPACE, XPK_SA, setup_k8s_env
@@ -277,7 +278,8 @@ def decorate_job_template_with_gpu(yml_string: str, gpu_type: str) -> str:
   job_spec = rdma_decorator.decorate_kjob_template(job_spec)
   job_template_dict = yaml.safe_load(yml_string)
   job_template_dict["template"] = job_spec
-
+  yaml_result: str = yaml.dump(job_template_dict, sort_keys=False)
+  return yaml_result


 def create_job_template_instance(
@@ -367,8 +369,10 @@ def create_pod_template_instance(args: Namespace, service_account: str) -> int:
 def prepare_kjob(args: Namespace) -> int:
   system = get_cluster_system_characteristics(args)

-
-
+  storages = []
+  if not is_dry_run():
+    k8s_api_client = setup_k8s_env(args)
+    storages = get_auto_mount_storages(k8s_api_client)

   service_account = ""
   if len(storages) > 0:
xpk/core/kueue.py
CHANGED
@@ -43,7 +43,7 @@ from .system_characteristics import (
 KUEUE_VERSION = 'v0.12.2'
 CLUSTER_QUEUE_NAME = 'cluster-queue'
 LOCAL_QUEUE_NAME = 'multislice-queue'
-WAIT_FOR_KUEUE_TIMEOUT = '
+WAIT_FOR_KUEUE_TIMEOUT = '10m'
 MEMORY_SIZE_PER_VM = 1.2
 MIN_MEMORY_LIMIT_SIZE = 4096

@@ -89,6 +89,10 @@ metadata:
   name: dws-config
 spec:
   provisioningClassName: queued-provisioning.gke.io
+  podSetUpdates:
+    nodeSelector:
+      - key: autoscaling.gke.io/provisioning-request
+        valueFromProvisioningClassDetail: ResizeRequestName
   managedResources:
   - {managed_resource}
 ---
@@ -320,7 +324,7 @@ def delete_multikueueclusters_definitions(args) -> int:
   return return_code


-def get_kueue_version(args) ->
+def get_kueue_version(args) -> tuple[int, str]:
   command = 'kubectl kueue version'
   task = 'Get kueue version on server'
   return_code, val = run_command_for_value(command, task, args)
@@ -432,6 +436,8 @@ def install_kueue_crs(
       cluster_hardware_name=cluster_hardware_name,
       resource_type=resource_type,
       total_chips=total_chips,
+      cpu_limit=args.cpu_limit,
+      memory_limit=args.memory_limit,
   )
   topology_label = ''
   if system.device_type in [
@@ -470,7 +476,7 @@ def install_kueue_crs(
     yml_string = topology_yaml + yml_string

   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'

   task = 'Applying Kueue Custom Resources'
   return_code = run_command_with_updates_retry(command, task, args)
@@ -480,7 +486,7 @@ def install_kueue_crs(


 def get_kueue_covered_resources_config(
-    cluster_hardware_name, resource_type, total_chips
+    cluster_hardware_name, resource_type, total_chips, cpu_limit, memory_limit
 ) -> str:
   """Gets Kueue covered resources configuration.

@@ -493,17 +499,31 @@ def get_kueue_covered_resources_config(
     A string of Kueue covered resources configuration.
   """
   config_format = """
-  - coveredResources:
+  - coveredResources: {resource_types}
     flavors:
     - name: {cluster_hardware_name}
       resources:
       - name: "{resource_type}"
-        nominalQuota: {total_chips}
-
+        nominalQuota: {total_chips}"""
+  resource_types = [resource_type]
+  if cpu_limit:
+    config_format = config_format + """
+      - name: "cpu"
+        nominalQuota: {cpu_limit}"""
+    resource_types.append('cpu')
+  if memory_limit:
+    config_format = config_format + """
+      - name: "memory"
+        nominalQuota: {memory_limit}"""
+    resource_types.append('memory')
+
   config_string = config_format.format(
       cluster_hardware_name=cluster_hardware_name,
+      resource_types=resource_types,
       resource_type=resource_type,
       total_chips=total_chips,
+      cpu_limit=cpu_limit,
+      memory_limit=memory_limit,
   )
   return config_string

@@ -532,7 +552,7 @@ def update_kueue_resources_if_necessary(args):
       memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION
   )
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'

   task = 'Updating Kueue Controller Manager resources'
   return_code = run_command_with_updates_retry(command, task, args)
xpk/core/nap.py
CHANGED
@@ -37,6 +37,7 @@ from .resources import (
 )
 from .scheduling import get_total_chips_requested_from_args
 from .system_characteristics import AcceleratorType, SystemCharacteristics
+from typing import cast

 AUTOPROVISIONING_CONFIG_FILE = """
 management:
@@ -249,7 +250,7 @@ def create_autoprovisioning_config(
       zones=f'- {args.zone}',
   )
   autoprovisioning_config = AutoprovisioningConfig(
-      config_filename=write_tmp_file(yml_string)
+      config_filename=write_tmp_file(yml_string),
       minimum_chips=minimum,
       maximum_chips=maximum,
   )
@@ -269,9 +270,6 @@ def is_autoprovisioning_enabled(
     bool is true if autoprovisioning is enabled, false otherwise.
     int of 0 if successful and 1 otherwise.
   """
-  # Currently autoprovisioning is not enabled for Pathways workloads. b/360898087
-  if args.use_pathways:
-    return False, 0

   resources_configmap_name = f'{args.cluster}-{CLUSTER_RESOURCES_CONFIGMAP}'
   cluster_config_map = get_cluster_configmap(args, resources_configmap_name)
@@ -339,11 +337,13 @@ def get_autoprovisioning_node_selector_args(args) -> tuple[str, int]:
     )
     return node_selector_args, 1

-  return_code,
+  return_code, optional_capacity_type_str = get_value_from_map(
      CAPACITY_TYPE_CONFIG_KEY, cluster_config_map
  )
  if return_code != 0:
    return node_selector_args, return_code
+  # return_code==0 implies capacity_type is defined
+  capacity_type_str = cast(str, optional_capacity_type_str)

  if capacity_type_str == CapacityType.RESERVATION.name:
    return_code, args.reservation = get_value_from_map(
xpk/core/network.py
CHANGED
@@ -221,7 +221,7 @@ def create_cluster_network_config(args) -> int:
   """
   yml_string = CLUSTER_NETWORK_YAML.format(cluster_name=args.cluster)
   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'

   return_code = run_command_with_updates(
       command, 'GKE Cluster Create Network Config', args
xpk/core/nodepool.py
CHANGED
@@ -265,7 +265,9 @@ def run_gke_node_pool_create_command(
   )
   configmap_yml = {}
   configmap_yml[resources_configmap_name] = resources_yml
-  return_code = create_or_update_cluster_configmap(
+  return_code = create_or_update_cluster_configmap(
+      configmap_yml, args.dry_run
+  )
   if return_code != 0:
     return 1

@@ -461,7 +463,7 @@ def get_nodepool_zone(args, nodepool_name) -> tuple[int, str | None]:
       f' --region={zone_to_region(args.zone)} --format="value(locations)"'
   )
   return_code, nodepool_zone = run_command_for_value(
-      command, 'Get Node Pool Zone', args
+      command, 'Get Node Pool Zone', args, dry_run_return_val=args.zone
   )
   if return_code != 0:
     xpk_print(f'Get Node Pool Zone returned ERROR {return_code}')
@@ -570,7 +572,10 @@ def upgrade_gke_nodepools_version(args, default_rapid_gke_version) -> int:
   for i, command in enumerate(commands):
     xpk_print(f'To complete {task_names[i]} we are executing {command}')
   max_return_code = run_commands(
-      commands,
+      commands,
+      'Update GKE node pools to default RAPID GKE version',
+      task_names,
+      dry_run=args.dry_run,
   )
   if max_return_code != 0:
     xpk_print(
xpk/core/nodepool_test.py
ADDED
@@ -0,0 +1,82 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from xpk.core.nodepool import get_desired_node_pool_names
+
+CLUSTER_NAME = "running-cucumber"
+
+
+def node_pool_name(number: int) -> str:
+  return f"{CLUSTER_NAME}-np-{number}"
+
+
+def test_compute_desired_node_pool_names_with_desired_larger_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_desired_smaller_than_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(1)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=1,
+  )
+
+  expected_result = [node_pool_name(0)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=3,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(1), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_consecutive_numbers_missing_and_desired_equal_to_existing():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[node_pool_name(0), node_pool_name(3)],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
+
+
+def test_compute_desired_node_pool_names_with_unknown_node_pools():
+  result = get_desired_node_pool_names(
+      existing_node_pool_names=[
+          "unknown-node-pool",
+          node_pool_name(0),
+          node_pool_name(3),
+      ],
+      cluster_name=CLUSTER_NAME,
+      desired_node_pool_count=2,
+  )
+
+  expected_result = [node_pool_name(0), node_pool_name(3)]
+  assert set(result) == set(expected_result)
xpk/core/pathways.py
CHANGED
@@ -19,6 +19,7 @@ from ..core.docker_container import get_user_workload_container
 from ..core.gcloud_context import zone_to_region
 from ..core.nodepool import get_all_nodepools_programmatic
 from ..utils.console import xpk_exit, xpk_print
+from ..utils.execution_context import is_dry_run
 from .system_characteristics import AcceleratorType, SystemCharacteristics


@@ -79,7 +80,10 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
   # Ensure the cluster and CPU nodepools were created with create-pathways
   all_node_pools = get_all_nodepools_programmatic(args)
   desired_pw_cpu_node_pools = {'cpu-np'}
-  if
+  if (
+      not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0]))
+      and not is_dry_run()
+  ):
     xpk_print(
         'Cluster needs to be created with `xpk create-pathways` to run'
         ' Pathways workloads.'
@@ -322,7 +326,7 @@ def try_to_delete_pathwaysjob_first(args, workloads) -> bool:
     return_code = run_command_with_updates(commands[0], 'Delete Workload', args)
   else:
     return_code = run_commands(
-        commands, 'Delete Workload', task_names, batch=100
+        commands, 'Delete Workload', task_names, batch=100, dry_run=args.dry_run
     )

   if return_code != 0:
xpk/core/ray.py
CHANGED
@@ -132,7 +132,7 @@ def install_ray_cluster(args, system) -> int:
   )

   tmp = write_tmp_file(yml_string)
-  command = f'kubectl apply -f {str(tmp
+  command = f'kubectl apply -f {str(tmp)}'
   task = 'Applying RayCluster'
   retry_attempts = 1
   return_code = run_command_with_updates_retry(
xpk/core/resources.py
CHANGED
@@ -66,7 +66,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
   )

   return_code, return_value = run_command_for_value(
-      command,
+      command,
+      'GKE Cluster Get ConfigMap',
+      args,
+      dry_run_return_val='map[]',
   )
   if return_code != 0:
     xpk_print(f'GKE Cluster Get ConfigMap request returned ERROR {return_code}')
@@ -81,8 +84,10 @@ def get_cluster_configmap(args, configmap_name) -> dict[str, str] | None:
   configs = return_value[4:-1].split(' ')

   for config in configs:
-
-
+    parts = config.strip().split(':')
+    if len(parts) != 2:
+      continue
+    config_map[parts[0]] = parts[1]
   return config_map


@@ -108,13 +113,7 @@ def create_cluster_configmaps(
   device_type = system.device_type
   if system.accelerator_type == AcceleratorType['GPU']:
     resources_data = f'{device_type}: "{int(args.num_nodes)}"'
-  elif
-      not args.enable_pathways
-      and args.enable_autoprovisioning
-      and autoprovisioning_config
-  ):
-    # Currently autoprovisioning is not supported with Pathways.
-    # Auto provisioning will have variable topologies for a gke accelerator type.
+  elif args.enable_autoprovisioning and autoprovisioning_config:
     resources_data = (
         f'{system.gke_accelerator}: {AUTOPROVISIONING_CONFIG_VALUE}'
     )
@@ -156,10 +155,12 @@ def create_cluster_configmaps(
       args=args, name=metadata_configmap_name, data=metadata
   )
   configmap_yml[metadata_configmap_name] = metadata_yml
-  return create_or_update_cluster_configmap(configmap_yml)
+  return create_or_update_cluster_configmap(configmap_yml, args.dry_run)


-def create_or_update_cluster_configmap(
+def create_or_update_cluster_configmap(
+    configmap_yml: dict, dry_run: bool
+) -> int:
   """
   Args:
     configmap_yml: dict containing ConfigMap name and yml string.
@@ -171,13 +172,16 @@ def create_or_update_cluster_configmap(configmap_yml: dict) -> int:
   task_names = []
   for configmap_name, yml_string in configmap_yml.items():
     tmp = write_tmp_file(yml_string)
-    command = f'kubectl apply -f {str(tmp
+    command = f'kubectl apply -f {str(tmp)}'
     commands.append(command)
     task_name = f'ConfigMap CreateOrUpdate-{configmap_name}'
     task_names.append(task_name)

   return_code = run_commands(
-      commands,
+      commands,
+      'GKE Cluster CreateOrUpdate ConfigMap(s)',
+      task_names,
+      dry_run=dry_run,
   )
   if return_code != 0:
     xpk_print(
xpk/core/scheduling.py
CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
 """

 from ..utils.console import xpk_print
+from ..utils.execution_context import is_dry_run
 from .capacity import AUTOPROVISIONING_CONFIG_MAXIMUM_KEY, AUTOPROVISIONING_CONFIG_VALUE
 from .resources import CLUSTER_RESOURCES_CONFIGMAP, get_cluster_configmap
 from .system_characteristics import (
@@ -45,6 +46,9 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
     )
     return True

+  if is_dry_run():
+    return True
+
   # Check for gke accelerator type:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
xpk/core/storage.py
CHANGED
@@ -17,7 +17,7 @@ limitations under the License.
 import os
 from argparse import Namespace
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, cast

 import ruamel.yaml
 from google.cloud import storage as gcp_storage
@@ -95,17 +95,17 @@ class Storage:
     Args:
       data: A dictionary containing the Storage resource definition.
     """
-    metadata
+    metadata = data.get("metadata", {})
     self.name = metadata.get("name")
     spec = data.get("spec", {})
-    self.type
-    self.auto_mount
-    self.mount_point
-    self.readonly
-    self.manifest
-    self.pvc
-    self.pv
-    self.bucket
+    self.type = spec.get("type")
+    self.auto_mount = spec.get("auto_mount")
+    self.mount_point = spec.get("mount_point")
+    self.readonly = spec.get("readonly")
+    self.manifest = spec.get("manifest")
+    self.pvc = spec.get("pvc")
+    self.pv = spec.get("pv")
+    self.bucket = self._get_bucket()

   def fields_as_list(self) -> list[str]:
     """
@@ -117,9 +117,9 @@ class Storage:
     return [
         self.name,
         self.type,
-        self.auto_mount,
+        str(self.auto_mount),
         self.mount_point,
-        self.readonly,
+        str(self.readonly),
         self.manifest,
     ]

@@ -133,7 +133,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.csi.volume_handle
+      return cast(str, pv.spec.csi.volume_handle)
     except ApiException as e:
       xpk_print(
           f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
@@ -150,7 +150,7 @@ class Storage:
     client = k8s_client.CoreV1Api()
     try:
       pv: V1PersistentVolume = client.read_persistent_volume(self.pv)
-      return pv.spec.mount_options
+      return cast(list[str], pv.spec.mount_options)
     except ApiException as e:
       xpk_print(
           f"Exception when calling CoreV1Api->read_persistent_volume: {e}"
xpk/core/workload.py
CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import re
 from ..utils.console import xpk_exit, xpk_print
 from .commands import run_command_for_value
 from .gcloud_context import zone_to_region
@@ -240,3 +241,13 @@ def wait_for_job_completion(args) -> int:
     xpk_print('Your workload did not complete successfully')
     return 125
   return 0
+
+
+GCP_NAME_FILTER_VALUE_REGEX = re.compile(r'[a-z0-9\-]+')
+"""Defines correct name prefix value (contains only letters, numbers and dashes) that can be used in GCP filter chips."""
+
+
+def get_jobsets_list_gcp_link(project: str) -> str:
+  """Returns a link to Cloud Console JobSets list"""
+
+  return f'https://console.cloud.google.com/kubernetes/aiml/deployments/jobs?project={project}'
xpk/core/workload_decorators/rdma_decorator.py
CHANGED
@@ -18,7 +18,7 @@ import yaml
 from ...utils.yaml import literal_string


-def decorate_kjob_template(job_manifest) ->
+def decorate_kjob_template(job_manifest: dict) -> dict:
   spec = (
       job_manifest.setdefault('spec', {})
       .setdefault('template', {})
@@ -64,7 +64,8 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
     add_tolerations(job_manifest)
     update_gpu_containers(job_manifest)

-
+  yaml_str: str = yaml.dump(manifest, sort_keys=False)
+  return yaml_str


 def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
xpk/core/workload_decorators/storage_decorator.py
CHANGED
@@ -36,7 +36,8 @@ def decorate_jobset(jobset_manifest_str, storages) -> str:
     job_manifest = job['template']
     add_annotations(job_manifest, storages)
     add_volumes(job_manifest, storage_volumes)
-
+  yaml_result: str = yaml.dump(manifest, sort_keys=False)
+  return yaml_result


 def add_annotations(job_manifest, storages):
xpk/core/workload_decorators/tcpx_decorator.py
CHANGED
@@ -55,7 +55,8 @@ def decorate_jobset(jobset_manifest_str: str) -> str:
   for job in manifest['spec']['replicatedJobs']:
     job_manifest = job['template']
     job_manifest = decorate_job(job_manifest)
-
+  yaml_str: str = yaml.dump(manifest, sort_keys=False)
+  return yaml_str


 def get_interfaces_annotation() -> dict:
@@ -131,6 +132,7 @@ def add_volumes(job_manifest: dict):
   })
   volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
   volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
+  volumes.append({'name': 'tcpx-socket', 'hostPath': {'path': '/run/tcpx'}})
   volumes.append(
       {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
   )
@@ -168,7 +170,7 @@ def add_tcpx_daemon_container(job_manifest):
   spec['initContainers'].append(tcpxo_daemon_container)


-def update_gpu_containers(job_manifest):
+def update_gpu_containers(job_manifest) -> None:
   for container in job_manifest['spec']['template']['spec']['containers']:
     if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
       env: list = container.setdefault('env', [])