PyPI - xpk - Versions diffs - 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

xpk/commands/batch.py +2 -3
xpk/commands/cluster.py +225 -73
xpk/commands/common.py +33 -1
xpk/commands/kjob_common.py +10 -1
xpk/commands/run.py +2 -3
xpk/commands/storage.py +14 -3
xpk/commands/workload.py +17 -15
xpk/core/blueprint/blueprint_generator.py +18 -18
xpk/core/cluster.py +119 -8
xpk/core/config.py +1 -1
xpk/core/filestore.py +2 -6
xpk/core/gcsfuse.py +22 -4
xpk/core/kjob.py +20 -13
xpk/core/kueue.py +30 -0
xpk/core/mtc.py +195 -0
xpk/core/network.py +23 -1
xpk/core/pathways.py +1 -1
xpk/core/resources.py +21 -0
xpk/core/workload.py +1 -1
xpk/core/workload_decorators/rdma_decorator.py +6 -10
xpk/core/workload_decorators/tcpx_decorator.py +179 -0
xpk/core/workload_decorators/tcpxo_decorator.py +15 -14
xpk/parser/cluster.py +573 -389
xpk/parser/storage.py +11 -2
xpk/utils/kubectl.py +4 -1
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/METADATA +134 -91
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/RECORD +31 -29
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/WHEEL +1 -1
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/entry_points.txt +0 -0
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/licenses/LICENSE +0 -0
{xpk-0.8.0.dist-info → xpk-0.9.0.dist-info}/top_level.txt +0 -0

xpk/core/network.py CHANGED Viewed

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
-from ..utils.console import xpk_print
+from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from .commands import run_command_for_value, run_command_with_updates
 from .gcloud_context import zone_to_region
@@ -235,6 +235,28 @@ def create_cluster_network_config(args) -> int:
   return 0
+def get_cluster_subnetworks(args) -> list[str]:
+  """Gets the names of the GKENetworkParamSet resources in the cluster.
+  Calls xpk_exit with the kubectl return code if the command fails.
+  Args:
+    args: user provided arguments for running the command.
+  Returns:
+    list[str]: the first column of every non-blank output row after the
+      first one.
+  """
+  command = 'kubectl get GKENetworkParamSet'
+  return_code, stdout = run_command_for_value(
+      command, 'Get Cluster Networks', args
+  )
+  if return_code != 0:
+    xpk_print('GKE Cluster Get NetworkParamSet failed')
+    xpk_exit(return_code)
+  # Blank lines are dropped first: line.split()[0] raises IndexError on them.
+  rows = [line.split() for line in stdout.splitlines() if line.strip()]
+  # The first row is skipped - presumably the kubectl column header.
+  return [row[0] for row in rows[1:]]
 def set_up_cluster_network_for_a3(args) -> int:
   """Set up GKE Cluster networks, subnets and firewall rules for A3.
   Note: there are 4 NICs for GPU-GPU bw and 1 NIC for host in an A3 node.

xpk/core/pathways.py CHANGED Viewed

@@ -211,7 +211,7 @@ def append_custom_pathways_worker(args) -> str:
   """
   yaml = """"""
   if args.server_image or args.custom_pathways_worker_args:
-    yaml = """- componentType: pathways_worker"""
+    yaml = """- componentType: worker"""
   indentation = (
       ' ' * 8
   )  # Currently 8, based on the YAML, may need to update in the future.

xpk/core/resources.py CHANGED Viewed

@@ -236,3 +236,24 @@ def get_cluster_system_characteristics(args) -> SystemCharacteristics | None:
       return system
   return None
+def get_cluster_capacity_type(args) -> CapacityType | None:
+  """Get the capacity type recorded in the cluster metadata configMap.
+  Args:
+    args: user provided arguments for running the command.
+  Returns:
+    The CapacityType member looked up by the upper-cased 'capacity_type'
+    entry of the configMap, or None if the configMap or the entry is missing.
+  """
+  metadata_configmap_name = f'{args.cluster}-{CLUSTER_METADATA_CONFIGMAP}'
+  cluster_config_map = get_cluster_configmap(args, metadata_configmap_name)
+  if cluster_config_map is None:
+    return None
+  capacity_value = cluster_config_map.get('capacity_type')
+  if capacity_value is None:
+    return None
+  # NOTE(review): a stored value that does not name a CapacityType member
+  # makes this lookup raise (presumably KeyError) - confirm callers expect it.
+  return CapacityType[capacity_value.upper()]

xpk/core/workload.py CHANGED Viewed

@@ -131,7 +131,7 @@ def get_workload_list(args) -> tuple[int, str]:
   )
   workload_list_filter_job_cmd = determine_workload_list_filter_by_job(args)
   command = (
-      f'kubectl get workloads -o=custom-columns="{s}" '
+      f'kubectl get workloads --ignore-not-found -o=custom-columns="{s}" '
       f'{workload_list_filter_status_cmd} {workload_list_filter_job_cmd}'
   )

xpk/core/workload_decorators/rdma_decorator.py CHANGED Viewed

@@ -68,16 +68,12 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
 def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
-  interfaces = [
-      '[',
-      '    {"interfaceName":"eth0","network":"default"},',
-      *[
-          f'    {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<8 else ""}'
-          for i in range(9)
-      ],
-      ']',
-  ]
-  return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
+  entries = ',\n'.join([
+      f'    {{"interfaceName":"eth{i}","network":"{network}"}}'
+      for i, network in enumerate(sub_networks)
+  ])
+  interfaces = f'[\n{entries}\n]'
+  return 'networking.gke.io/interfaces', literal_string(interfaces)
 def add_annotations(job_manifest: dict, sub_networks: list[str]):

xpk/core/workload_decorators/tcpx_decorator.py ADDED Viewed

@@ -0,0 +1,179 @@
+"""
+Copyright 2024 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import yaml
+from ...utils.yaml import literal_string
+# Component version
+tcpx = 'v2.0.11'
+def decorate_kjob_template(job_manifest: dict) -> dict:
+  """Decorates a kjob Job template with the tcpx volumes, toleration and daemon.
+  Unlike decorate_job, no Pod annotations are added here.
+  Args:
+    job_manifest: the Job manifest as a dict; it is modified in place.
+  Returns:
+    The same job_manifest object.
+  """
+  decoration_steps = (
+      add_volumes,
+      add_tolerations,
+      add_tcpxo_daemon_container,
+      update_gpu_containers,
+  )
+  for apply_step in decoration_steps:
+    apply_step(job_manifest)
+  return job_manifest
+def decorate_job(job_manifest: dict) -> dict:
+  """Decorates a Job manifest with annotations, volumes, toleration and daemon.
+  Args:
+    job_manifest: the Job manifest as a dict; it is modified in place.
+  Returns:
+    The same job_manifest object.
+  """
+  decoration_steps = (
+      add_annotations,
+      add_volumes,
+      add_tolerations,
+      add_tcpxo_daemon_container,
+      update_gpu_containers,
+  )
+  for apply_step in decoration_steps:
+    apply_step(job_manifest)
+  return job_manifest
+def decorate_jobset(jobset_manifest_str: str) -> str:
+  """
+  Decorates a JobSet manifest with the necessary components for tcpx-daemon.
+  Args:
+    jobset_manifest_str: The JobSet manifest as a YAML string.
+  Returns:
+    The modified JobSet manifest as a YAML string.
+  """
+  manifest = yaml.safe_load(jobset_manifest_str)
+  for job in manifest['spec']['replicatedJobs']:
+    # decorate_job mutates the template in place and returns the same dict,
+    # so its return value does not need to be kept.
+    decorate_job(job['template'])
+  return yaml.dump(manifest, sort_keys=False)
+def get_interfaces_annotation() -> dict:
+  """Builds the 'networking.gke.io/interfaces' Pod annotation.
+  Returns:
+    A single-entry dict whose value is a literal_string holding a JSON list
+    that maps eth0 to the 'default' network and eth1-eth4 to vpc1-vpc4.
+  """
+  networks = ('default', 'vpc1', 'vpc2', 'vpc3', 'vpc4')
+  entries = ',\n'.join(
+      f'    {{"interfaceName":"eth{index}","network":"{network}"}}'
+      for index, network in enumerate(networks)
+  )
+  return {'networking.gke.io/interfaces': literal_string(f'[\n{entries}\n]')}
+def get_tcpx_deamon_annotation() -> dict:
+  """Builds the 'devices.gke.io/container.tcpx-daemon' Pod annotation.
+  Returns:
+    A single-entry dict whose value is a literal_string listing, one
+    '- path:' item per line, /dev/nvidia0 through /dev/nvidia7 followed by
+    /dev/nvidiactl and /dev/nvidia-uvm.
+  """
+  device_names = [f'nvidia{index}' for index in range(8)]
+  device_names += ['nvidiactl', 'nvidia-uvm']
+  device_paths = ''.join(f'- path: /dev/{name}\n' for name in device_names)
+  return {'devices.gke.io/container.tcpx-daemon': literal_string(device_paths)}
+def add_annotations(job_manifest: dict):
+  """Adds or updates the tcpx annotations in the Pod template.
+  Missing 'spec', 'template', 'metadata' and 'annotations' levels are created.
+  Args:
+    job_manifest: the Job manifest as a dict; it is modified in place.
+  """
+  pod_template = job_manifest.setdefault('spec', {}).setdefault('template', {})
+  pod_metadata = pod_template.setdefault('metadata', {})
+  pod_annotations: dict = pod_metadata.setdefault('annotations', {})
+  pod_annotations.update({
+      **get_tcpx_deamon_annotation(),
+      'networking.gke.io/default-interface': 'eth0',
+      **get_interfaces_annotation(),
+  })
+def add_tolerations(job_manifest: dict):
+  """Appends the 'user-workload' NoSchedule toleration to the Pod spec.
+  Missing 'spec', 'template', 'spec' and 'tolerations' levels are created.
+  Args:
+    job_manifest: the Job manifest as a dict; it is modified in place.
+  """
+  pod_spec = (
+      job_manifest.setdefault('spec', {})
+      .setdefault('template', {})
+      .setdefault('spec', {})
+  )
+  user_workload_toleration = dict(
+      key='user-workload',
+      operator='Equal',
+      value='true',
+      effect='NoSchedule',
+  )
+  pod_spec.setdefault('tolerations', []).append(user_workload_toleration)
+def add_volumes(job_manifest: dict):
+  """Appends the 'libraries', 'sys' and 'proc-sys' hostPath volumes.
+  Missing 'spec', 'template', 'spec' and 'volumes' levels are created.
+  Args:
+    job_manifest: the Job manifest as a dict; it is modified in place.
+  """
+  pod_spec = (
+      job_manifest.setdefault('spec', {})
+      .setdefault('template', {})
+      .setdefault('spec', {})
+  )
+  host_paths = {
+      'libraries': '/home/kubernetes/bin/nvidia/lib64',
+      'sys': '/sys',
+      'proc-sys': '/proc/sys',
+  }
+  # NOTE(review): containers in this file also mount a 'tcpx-socket' volume
+  # that is not added here - confirm it is defined elsewhere.
+  pod_spec.setdefault('volumes', []).extend(
+      {'name': name, 'hostPath': {'path': path}}
+      for name, path in host_paths.items()
+  )
+def add_tcpxo_daemon_container(job_manifest):
+  """Appends the 'tcpx-daemon' container to the Pod spec's initContainers.
+  The 'initContainers' list is created when absent; the 'spec', 'template'
+  and Pod 'spec' levels must already exist in job_manifest.
+  Args:
+    job_manifest: the Job manifest as a dict; it is modified in place.
+  """
+  daemon_command = [
+      '/tcpgpudmarxd/build/app/tcpgpudmarxd',
+      '--gpu_nic_preset',
+      'a3vm',
+      '--gpu_shmem_type',
+      'fd',
+      '--uds_path',
+      '/run/tcpx',
+      '--setup_param',
+      '"--verbose 128 2 0 "',
+  ]
+  daemon_mounts = [
+      {'name': volume_name, 'mountPath': mount_path}
+      for volume_name, mount_path in (
+          ('libraries', '/usr/local/nvidia/lib64'),
+          ('tcpx-socket', '/run/tcpx'),
+          ('sys', '/hostsysfs'),
+          ('proc-sys', '/hostprocsysfs'),
+      )
+  ]
+  daemon_container = {
+      'name': 'tcpx-daemon',
+      'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{tcpx}',
+      'imagePullPolicy': 'Always',
+      # NOTE(review): restartPolicy 'Always' on an init container presumably
+      # makes it run as a sidecar - confirm against the Kubernetes docs.
+      'restartPolicy': 'Always',
+      'command': daemon_command,
+      'securityContext': {'capabilities': {'add': ['NET_ADMIN']}},
+      'volumeMounts': daemon_mounts,
+      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
+  }
+  pod_spec = job_manifest['spec']['template']['spec']
+  pod_spec.setdefault('initContainers', []).append(daemon_container)
+def update_gpu_containers(job_manifest):
+  """Adds the tcpx env var and volume mounts to every GPU container.
+  A container is treated as a GPU container when 'nvidia.com/gpu' appears in
+  its resources.limits; other containers are left untouched.
+  Args:
+    job_manifest: the Job manifest as a dict; it is modified in place. The
+      Pod spec must already have a 'containers' list.
+  """
+  library_dir = '/usr/local/nvidia/lib64'
+  for container in job_manifest['spec']['template']['spec']['containers']:
+    limits = container.get('resources', {}).get('limits', {})
+    if 'nvidia.com/gpu' not in limits:
+      continue
+    container.setdefault('env', []).append(
+        {'name': 'LD_LIBRARY_PATH', 'value': library_dir}
+    )
+    container.setdefault('volumeMounts', []).extend([
+        {'name': 'tcpx-socket', 'mountPath': '/tmp'},
+        {'name': 'libraries', 'mountPath': library_dir},
+    ])

xpk/core/workload_decorators/tcpxo_decorator.py CHANGED Viewed

@@ -77,16 +77,12 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
 def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
-  interfaces = [
-      '[',
-      '    {"interfaceName":"eth0","network":"default"},',
-      *[
-          f'    {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<7 else ""}'
-          for i in range(8)
-      ],
-      ']',
-  ]
-  return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
+  entries = ',\n'.join([
+      f'    {{"interfaceName":"eth{i}","network":"{network}"}}'
+      for i, network in enumerate(sub_networks)
+  ])
+  interfaces = f'[\n{entries}\n]'
+  return 'networking.gke.io/interfaces', literal_string(interfaces)
 def get_tcpxo_deamon_entry() -> tuple[str, str]:
@@ -107,7 +103,11 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
 def add_annotations(job_manifest: dict, sub_networks: list[str]):
   """Adds or updates annotations in the Pod template."""
-  annotations = job_manifest['spec']['template']['metadata']['annotations']
+  metadata = job_manifest['spec']['template']['metadata']
+  annotations = metadata.get('annotations')
+  if annotations is None:
+    annotations = {}
+    metadata['annotations'] = annotations
   tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
   interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
   annotations.update({
@@ -149,6 +149,7 @@ def add_tcpxo_daemon_container(job_manifest):
       'name': 'tcpxo-daemon',
       'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}',
       'imagePullPolicy': 'Always',
+      'restartPolicy': 'Always',
       'command': ['/bin/sh', '-c'],
       'args': [
           'set -ex\nchmod 755'
@@ -165,9 +166,9 @@ def add_tcpxo_daemon_container(job_manifest):
       ],
       'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
   }
-  job_manifest['spec']['template']['spec']['containers'].append(
-      tcpxo_daemon_container
-  )
+  spec = job_manifest['spec']['template']['spec']
+  spec.setdefault('initContainers', [])
+  spec['initContainers'].append(tcpxo_daemon_container)
 def update_gpu_containers(job_manifest):

xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

xpk 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl