xpk 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. xpk/commands/batch.py +5 -6
  2. xpk/commands/cluster.py +246 -73
  3. xpk/commands/cluster_gcluster.py +27 -0
  4. xpk/commands/common.py +40 -1
  5. xpk/commands/kjob_common.py +13 -1
  6. xpk/commands/run.py +4 -5
  7. xpk/commands/shell.py +2 -2
  8. xpk/commands/storage.py +24 -6
  9. xpk/commands/workload.py +66 -27
  10. xpk/core/blueprint/blueprint_generator.py +115 -47
  11. xpk/core/capacity.py +66 -6
  12. xpk/core/cluster.py +282 -13
  13. xpk/core/config.py +1 -65
  14. xpk/core/docker_manager.py +1 -1
  15. xpk/core/docker_resources.py +145 -72
  16. xpk/core/filestore.py +2 -6
  17. xpk/core/gcsfuse.py +22 -4
  18. xpk/core/jobset.py +143 -0
  19. xpk/core/kjob.py +21 -18
  20. xpk/core/kueue.py +194 -4
  21. xpk/core/mtc.py +195 -0
  22. xpk/core/network.py +23 -1
  23. xpk/core/nodepool.py +17 -4
  24. xpk/core/pathways.py +2 -3
  25. xpk/core/resources.py +21 -0
  26. xpk/core/storage.py +1 -95
  27. xpk/core/system_characteristics.py +1 -1
  28. xpk/core/workload.py +1 -45
  29. xpk/core/workload_decorators/rdma_decorator.py +8 -10
  30. xpk/core/workload_decorators/tcpx_decorator.py +185 -0
  31. xpk/core/workload_decorators/tcpxo_decorator.py +22 -14
  32. xpk/parser/cluster.py +589 -389
  33. xpk/parser/storage.py +12 -3
  34. xpk/parser/workload.py +21 -3
  35. xpk/utils/kubectl.py +4 -1
  36. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/METADATA +178 -96
  37. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/RECORD +41 -38
  38. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/WHEEL +1 -1
  39. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/entry_points.txt +0 -0
  40. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/licenses/LICENSE +0 -0
  41. {xpk-0.8.0.dist-info → xpk-0.10.0.dist-info}/top_level.txt +0 -0
xpk/core/storage.py CHANGED
@@ -46,6 +46,7 @@ STORAGE_CRD_NAME = f"{XPK_API_GROUP_NAME}.{STORAGE_CRD_PLURAL}"
  GCS_FUSE_TYPE = "gcsfuse"
  GCP_FILESTORE_TYPE = "gcpfilestore"
  PARALLELSTORE_TYPE = "parallelstore"
+ LUSTRE_TYPE = "lustre"
  GCE_PD_TYPE = "pd"
  MANIFESTS_PATH = os.path.abspath("xpkclusters/storage-manifests")
  GCS_FUSE_ANNOTATIONS = {
@@ -365,101 +366,6 @@ def get_storage_annotations(storages: list[Storage]) -> list[str]:
    return annotations
 
 
- def get_storage_volume_mounts_yaml(storages: list[Storage]) -> str:
-   """
-   Generates the YAML representation of the volumeMounts section for the given Storages.
-
-   This function creates the YAML snippet that defines how the storage volumes
-   should be mounted within a Pod's containers.
-
-   Args:
-     storages: A list of Storage objects.
-
-   Returns:
-     A string containing the YAML representation of the volumeMounts section.
-   """
-   yaml_str = ""
-   for storage in storages:
-     yaml_str += f"""- name: {storage.pv}
-   mountPath: {storage.mount_point}
-   readOnly: {storage.readonly}
- """
-   return yaml_str
-
-
- def get_storage_volumes_yaml(storages: list[Storage]) -> str:
-   """
-   Generates the YAML representation of the volumes section for the given Storages.
-
-   This function creates the YAML snippet that defines the volumes to be
-   mounted in a Pod, including the PersistentVolumeClaim associated with
-   each Storage.
-
-   Args:
-     storages: A list of Storage objects.
-
-   Returns:
-     A string containing the YAML representation of the volumes section.
-   """
-   yaml_str = ""
-   for storage in storages:
-     yaml_str += f"""- name: {storage.pv}
-   persistentVolumeClaim:
-     claimName: {storage.pvc}
-     readOnly: {storage.readonly}
- """
-   return yaml_str
-
-
- def get_storage_volume_mounts_for_gpu(
-     storages: list[Storage],
- ) -> list[dict]:
-   """
-   Generates the YAML representation of the volumeMounts section for the given Storages.
-
-   This function creates the list of storage specifications that define how the storage volumes
-   should be mounted within a Pod's containers.
-
-   Args:
-     storages: A list of Storage objects.
-
-   Returns:
-     A list containing the dictionary representation of the volumeMounts section.
-   """
-   return [
-       {
-           "name": storage.pv,
-           "mountPath": storage.mount_point,
-           "readOnly": storage.readonly,
-       }
-       for storage in storages
-   ]
-
-
- def get_storage_volumes_yaml_for_gpu(storages: list[Storage]) -> str:
-   """
-   Generates the YAML representation of the volumes section for the given Storages.
-
-   This function creates the YAML snippet that defines the volumes to be
-   mounted in a Pod, including the PersistentVolumeClaim associated with
-   each Storage.
-
-   Args:
-     storages: A list of Storage objects.
-
-   Returns:
-     A string containing the YAML representation of the volumes section.
-   """
-   yaml_str = ""
-   for storage in storages:
-     yaml_str += f"""- name: {storage.pv}
-   persistentVolumeClaim:
-     claimName: {storage.pvc}
-     readOnly: {storage.readonly}
- """
-   return yaml_str
-
-
  def get_storage_volumes_yaml_dict(storages: list[Storage]) -> list[dict]:
    vols = []
    for storage in storages:
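The removed helpers above rendered volumes/volumeMounts YAML through f-string concatenation, while the retained get_storage_volumes_yaml_dict builds plain dicts and leaves serialization to the YAML library. A minimal sketch of the dict-based style (FakeStorage is a hypothetical stand-in for xpk's Storage class):

import yaml
from dataclasses import dataclass


@dataclass
class FakeStorage:  # hypothetical stand-in for xpk's Storage
  pv: str
  pvc: str
  mount_point: str
  readonly: bool


def volumes_dict(storages: list[FakeStorage]) -> list[dict]:
  # Plain dicts compose and test easily; yaml.dump handles the quoting
  # and indentation the removed f-string templates managed by hand.
  return [
      {
          'name': s.pv,
          'persistentVolumeClaim': {
              'claimName': s.pvc,
              'readOnly': s.readonly,
          },
      }
      for s in storages
  ]


print(yaml.dump(volumes_dict([FakeStorage('pv-a', 'pvc-a', '/data', False)]),
                sort_keys=False))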
xpk/core/system_characteristics.py CHANGED
@@ -1156,7 +1156,7 @@ UserFacingNameToSystemCharacteristics = {
      2,
      'tpu-v5-lite-podslice',
      'ct5lp-hightpu-4t',
-     8,
+     4,
      AcceleratorType['TPU'],
      'v5litepod-8',
  ),
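For context on the corrected value: reading the entry's fields as xpk's SystemCharacteristics does (field names assumed here), the arithmetic now lines up, since a ct5lp-hightpu-4t VM carries 4 TPU chips:

# Assumed field meanings for the entry above: 2 is vms_per_slice and the
# corrected 4 is chips_per_vm for the ct5lp-hightpu-4t machine type.
vms_per_slice, chips_per_vm = 2, 4
assert vms_per_slice * chips_per_vm == 8  # total chips in a 'v5litepod-8' slice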
xpk/core/workload.py CHANGED
@@ -14,18 +14,9 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """
 
- import yaml
-
- from ..utils import templates
  from ..utils.console import xpk_exit, xpk_print
- from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE
  from .commands import run_command_for_value
  from .gcloud_context import zone_to_region
- from .storage import Storage, get_storage_volume_mounts_for_gpu
- from .system_characteristics import SystemCharacteristics
-
- RXDM_CONTAINER_A3HIGH_PATH = '/../templates/rxdm_container_a3high.yaml'
- RXDM_CONTAINER_A3MEGA_PATH = '/../templates/rxdm_container_a3mega.yaml'
 
 
  def workload_list_awk_command(filter_key) -> str:
@@ -131,7 +122,7 @@ def get_workload_list(args) -> tuple[int, str]:
    )
    workload_list_filter_job_cmd = determine_workload_list_filter_by_job(args)
    command = (
-       f'kubectl get workloads -o=custom-columns="{s}" '
+       f'kubectl get workloads --ignore-not-found -o=custom-columns="{s}" '
        f'{workload_list_filter_status_cmd} {workload_list_filter_job_cmd}'
    )
 
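The added --ignore-not-found flag keeps kubectl quiet when nothing matches: instead of printing "No resources found" noise (or failing outright for a missing named resource), it returns empty output with exit code 0, which suits downstream parsing. A hedged sketch of the effect; list_workloads is a hypothetical wrapper, not xpk's API:

import subprocess


def list_workloads(columns: str) -> list[str]:
  # With --ignore-not-found, an empty result yields empty stdout and
  # exit code 0, so check=True does not raise and parsing stays simple.
  cmd = (
      f'kubectl get workloads --ignore-not-found -o=custom-columns="{columns}"'
  )
  out = subprocess.run(
      cmd, shell=True, capture_output=True, text=True, check=True
  ).stdout
  return [line for line in out.splitlines() if line.strip()]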
@@ -249,38 +240,3 @@ def wait_for_job_completion(args) -> int:
      xpk_print('Your workload did not complete successfully')
      return 125
    return 0
-
-
- def add_gpu_rxdm_container(
-     jobset_manifest_str: str,
-     system: SystemCharacteristics,
-     all_storages: list[Storage],
- ) -> str:
-   """Add gpu rxdm container to jobset manifest based on user provided arguments.
-
-   Args:
-     jobset_manifest_str: the JobSet manifest as a YAML string.
-     system: system characteristics.
-     all_storages: list of all storages.
-
-   Returns:
-     str: the modified JobSet manifest as a YAML string.
-   """
-   if system.device_type == H100_DEVICE_TYPE:
-     gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3HIGH_PATH)
-   elif system.device_type == H100_MEGA_DEVICE_TYPE:
-     gpu_rxdm_container = templates.load(RXDM_CONTAINER_A3MEGA_PATH)
-   else:
-     return jobset_manifest_str
-
-   storage_volume_mounts = get_storage_volume_mounts_for_gpu(all_storages)
-   gpu_rxdm_container['volumeMounts'].extend(storage_volume_mounts)
-
-   manifest = yaml.safe_load(jobset_manifest_str)
-
-   for job in manifest['spec']['replicatedJobs']:
-     job['template']['spec']['template']['spec']['containers'].append(
-         gpu_rxdm_container
-     )
-
-   return yaml.dump(manifest, sort_keys=False)
xpk/core/workload_decorators/rdma_decorator.py CHANGED
@@ -68,22 +68,20 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
 
 
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
-   interfaces = [
-       '[',
-       ' {"interfaceName":"eth0","network":"default"},',
-       *[
-           f' {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<8 else ""}'
-           for i in range(9)
-       ],
-       ']',
-   ]
-   return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
+   entries = ',\n'.join([
+       f' {{"interfaceName":"eth{i}","network":"{network}"}}'
+       for i, network in enumerate(sub_networks)
+   ])
+   interfaces = f'[\n{entries}\n]'
+   return 'networking.gke.io/interfaces', literal_string(interfaces)
 
 
  def add_annotations(job_manifest: dict, sub_networks: list[str]):
    """Adds or updates annotations in the Pod template."""
    annotations = job_manifest['spec']['template']['metadata']['annotations']
    interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
+   if annotations is None:
+     annotations = {}
    annotations.update({
        'networking.gke.io/default-interface': 'eth0',
        interfaces_key: interfaces_value,
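The rewritten helper derives every interface, eth0 included, from sub_networks itself instead of hardcoding a default entry plus nine numbered ones, which implies callers now pass the default network as the first list element. A standalone re-creation (without xpk's literal_string YAML wrapper) showing the produced annotation value:

def interfaces_annotation(sub_networks: list[str]) -> str:
  # Mirrors the new get_interfaces_entry body: one JSON object per
  # network, with eth<i> taken from the element's position in the list.
  entries = ',\n'.join(
      f'  {{"interfaceName":"eth{i}","network":"{network}"}}'
      for i, network in enumerate(sub_networks)
  )
  return f'[\n{entries}\n]'


# eth0 now comes from the first element, so the caller supplies 'default':
print(interfaces_annotation(['default', 'vpc-1', 'vpc-2']))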
xpk/core/workload_decorators/tcpx_decorator.py ADDED
@@ -0,0 +1,185 @@
+ """
+ Copyright 2024 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ import yaml
+
+ from ...utils.yaml import literal_string
+
+ # Component version
+ tcpx = 'v2.0.11'
+
+
+ def decorate_kjob_template(job_manifest: dict) -> dict:
+   add_volumes(job_manifest)
+   add_tolerations(job_manifest)
+   add_tcpx_daemon_container(job_manifest)
+   update_gpu_containers(job_manifest)
+   return job_manifest
+
+
+ def decorate_job(job_manifest: dict) -> dict:
+   add_annotations(job_manifest)
+   add_volumes(job_manifest)
+   add_tolerations(job_manifest)
+   add_tcpx_daemon_container(job_manifest)
+   update_gpu_containers(job_manifest)
+   return job_manifest
+
+
+ def decorate_jobset(jobset_manifest_str: str) -> str:
+   """
+   Decorates a JobSet manifest with the necessary components for tcpx-daemon.
+
+   Args:
+     jobset_manifest_str: The JobSet manifest as a YAML string.
+
+   Returns:
+     The modified JobSet manifest as a YAML string.
+   """
+
+   manifest = yaml.safe_load(jobset_manifest_str)
+
+   for job in manifest['spec']['replicatedJobs']:
+     job_manifest = job['template']
+     job_manifest = decorate_job(job_manifest)
+   return yaml.dump(manifest, sort_keys=False)
+
+
+ def get_interfaces_annotation() -> dict:
+   interfaces = [
+       '[',
+       ' {"interfaceName":"eth0","network":"default"},',
+       ' {"interfaceName":"eth1","network":"vpc1"},',
+       ' {"interfaceName":"eth2","network":"vpc2"},',
+       ' {"interfaceName":"eth3","network":"vpc3"},',
+       ' {"interfaceName":"eth4","network":"vpc4"}',
+       ']',
+   ]
+   return {'networking.gke.io/interfaces': literal_string('\n'.join(interfaces))}
+
+
+ def get_tcpx_deamon_annotation() -> dict:
+   return {
+       'devices.gke.io/container.tcpx-daemon': literal_string(
+           '- path: /dev/nvidia0\n'
+           '- path: /dev/nvidia1\n'
+           '- path: /dev/nvidia2\n'
+           '- path: /dev/nvidia3\n'
+           '- path: /dev/nvidia4\n'
+           '- path: /dev/nvidia5\n'
+           '- path: /dev/nvidia6\n'
+           '- path: /dev/nvidia7\n'
+           '- path: /dev/nvidiactl\n'
+           '- path: /dev/nvidia-uvm\n'
+       )
+   }
+
+
+ def add_annotations(job_manifest: dict):
+   """Adds or updates annotations in the Pod template."""
+   annotations: dict = (
+       job_manifest.setdefault('spec', {})
+       .setdefault('template', {})
+       .setdefault('metadata', {})
+       .setdefault('annotations', {})
+   )
+   annotations.update(get_tcpx_deamon_annotation())
+   annotations.update({'networking.gke.io/default-interface': 'eth0'})
+   annotations.update(get_interfaces_annotation())
+
+
+ def add_tolerations(job_manifest: dict):
+   """Adds tolerations to the Pod spec."""
+   tolerations: list = (
+       job_manifest.setdefault('spec', {})
+       .setdefault('template', {})
+       .setdefault('spec', {})
+       .setdefault('tolerations', [])
+   )
+   tolerations.append({
+       'key': 'user-workload',
+       'operator': 'Equal',
+       'value': 'true',
+       'effect': 'NoSchedule',
+   })
+
+
+ def add_volumes(job_manifest: dict):
+   """Adds volumes to the Pod spec."""
+   volumes: list = (
+       job_manifest.setdefault('spec', {})
+       .setdefault('template', {})
+       .setdefault('spec', {})
+       .setdefault('volumes', [])
+   )
+   volumes.append({
+       'name': 'libraries',
+       'hostPath': {'path': '/home/kubernetes/bin/nvidia/lib64'},
+   })
+   volumes.append({'name': 'sys', 'hostPath': {'path': '/sys'}})
+   volumes.append({'name': 'proc-sys', 'hostPath': {'path': '/proc/sys'}})
+   volumes.append(
+       {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
+   )
+
+
+ def add_tcpx_daemon_container(job_manifest):
+   """Adds the tcpx-daemon container to the Pod spec."""
+   tcpxo_daemon_container = {
+       'name': 'tcpx-daemon',
+       'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{tcpx}',
+       'imagePullPolicy': 'Always',
+       'restartPolicy': 'Always',
+       'command': [
+           '/tcpgpudmarxd/build/app/tcpgpudmarxd',
+           '--gpu_nic_preset',
+           'a3vm',
+           '--gpu_shmem_type',
+           'fd',
+           '--uds_path',
+           '/run/tcpx',
+           '--setup_param',
+           '"--verbose 128 2 0 "',
+       ],
+       'securityContext': {'capabilities': {'add': ['NET_ADMIN']}},
+       'volumeMounts': [
+           {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'},
+           {'name': 'tcpx-socket', 'mountPath': '/run/tcpx'},
+           {'name': 'sys', 'mountPath': '/hostsysfs'},
+           {'name': 'proc-sys', 'mountPath': '/hostprocsysfs'},
+       ],
+       'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
+   }
+   spec = job_manifest['spec']['template']['spec']
+   spec.setdefault('initContainers', [])
+   spec['initContainers'].append(tcpxo_daemon_container)
+
+
+ def update_gpu_containers(job_manifest):
+   for container in job_manifest['spec']['template']['spec']['containers']:
+     if 'nvidia.com/gpu' in container.get('resources', {}).get('limits', {}):
+       env: list = container.setdefault('env', [])
+       env.append(
+           {'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}
+       )
+       volumeMounts: list = container.setdefault('volumeMounts', [])
+       volumeMounts.append({'name': 'tcpx-socket', 'mountPath': '/tmp'})
+       volumeMounts.append(
+           {'name': 'libraries', 'mountPath': '/usr/local/nvidia/lib64'}
+       )
+       container['volumeMounts'].append(
+           {'name': 'dshm', 'mountPath': '/dev/shm'}
+       )
xpk/core/workload_decorators/tcpxo_decorator.py CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
  """
 
  import yaml
+
  from ...utils.yaml import literal_string
 
  # Component version
@@ -77,16 +78,12 @@ def decorate_jobset(jobset_manifest_str: str, sub_networks: list[str]) -> str:
 
 
  def get_interfaces_entry(sub_networks: list[str]) -> tuple[str, str]:
-   interfaces = [
-       '[',
-       ' {"interfaceName":"eth0","network":"default"},',
-       *[
-           f' {{"interfaceName":"eth{i + 1}","network":"{sub_networks[i]}"}}{"," if i<7 else ""}'
-           for i in range(8)
-       ],
-       ']',
-   ]
-   return 'networking.gke.io/interfaces', literal_string('\n'.join(interfaces))
+   entries = ',\n'.join([
+       f' {{"interfaceName":"eth{i}","network":"{network}"}}'
+       for i, network in enumerate(sub_networks)
+   ])
+   interfaces = f'[\n{entries}\n]'
+   return 'networking.gke.io/interfaces', literal_string(interfaces)
 
 
  def get_tcpxo_deamon_entry() -> tuple[str, str]:
@@ -107,7 +104,11 @@ def get_tcpxo_deamon_entry() -> tuple[str, str]:
 
  def add_annotations(job_manifest: dict, sub_networks: list[str]):
    """Adds or updates annotations in the Pod template."""
-   annotations = job_manifest['spec']['template']['metadata']['annotations']
+   metadata = job_manifest['spec']['template']['metadata']
+   annotations = metadata.get('annotations')
+   if annotations is None:
+     annotations = {}
+   metadata['annotations'] = annotations
    tcpxo_deamon_key, tcpxo_deamon_paths = get_tcpxo_deamon_entry()
    interfaces_key, interfaces_value = get_interfaces_entry(sub_networks)
    annotations.update({
@@ -141,6 +142,9 @@ def add_volumes(job_manifest):
        'name': 'aperture-devices',
        'hostPath': {'path': '/dev/aperture_devices'},
    })
+   volumes.append(
+       {'name': 'dshm', 'emptyDir': {'medium': 'Memory', 'sizeLimit': '128Gi'}}
+   )
 
 
  def add_tcpxo_daemon_container(job_manifest):
@@ -149,6 +153,7 @@ def add_tcpxo_daemon_container(job_manifest):
      'name': 'tcpxo-daemon',
      'image': f'us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:{rxdm}',
      'imagePullPolicy': 'Always',
+     'restartPolicy': 'Always',
      'command': ['/bin/sh', '-c'],
      'args': [
          'set -ex\nchmod 755'
@@ -165,9 +170,9 @@ def add_tcpxo_daemon_container(job_manifest):
      ],
      'env': [{'name': 'LD_LIBRARY_PATH', 'value': '/usr/local/nvidia/lib64'}],
    }
-   job_manifest['spec']['template']['spec']['containers'].append(
-       tcpxo_daemon_container
-   )
+   spec = job_manifest['spec']['template']['spec']
+   spec.setdefault('initContainers', [])
+   spec['initContainers'].append(tcpxo_daemon_container)
 
 
  def update_gpu_containers(job_manifest):
@@ -188,3 +193,6 @@ def update_gpu_containers(job_manifest):
      container['volumeMounts'].append(
          {'name': 'libraries', 'mountPath': '/usr/local/nvidia'}
      )
+     container['volumeMounts'].append(
+         {'name': 'dshm', 'mountPath': '/dev/shm'}
+     )
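Both decorators now inject the network daemon as an initContainer with restartPolicy: Always, the Kubernetes native sidecar pattern (available by default since 1.29): the daemon starts before the workload containers, keeps running alongside them, and is shut down when they finish, so it no longer blocks Job completion the way a regular entry in containers did. The resulting pod-spec shape, sketched as a plain dict:

# Native-sidecar shape emitted by the decorators: an init container with
# restartPolicy Always runs concurrently with the main containers and
# exits with the pod, rather than keeping the Job pod alive afterwards.
pod_spec = {
    'initContainers': [{
        'name': 'tcpxo-daemon',
        'restartPolicy': 'Always',  # sidecar marker (Kubernetes >= 1.29)
    }],
    'containers': [{'name': 'train'}],
}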