xpk 0.6.0-py3-none-any.whl → 0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xpk/api/__init__.py +15 -0
- xpk/api/storage_crd.yaml +52 -0
- xpk/commands/batch.py +27 -5
- xpk/commands/cluster.py +104 -80
- xpk/commands/cluster_gcluster.py +94 -10
- xpk/commands/common.py +44 -0
- xpk/commands/config.py +29 -0
- xpk/commands/info.py +8 -10
- xpk/commands/inspector.py +5 -11
- xpk/commands/job.py +9 -7
- xpk/commands/kind.py +34 -4
- xpk/commands/kjob_common.py +44 -0
- xpk/commands/run.py +128 -0
- xpk/commands/shell.py +27 -7
- xpk/commands/storage.py +280 -0
- xpk/commands/version.py +6 -18
- xpk/commands/workload.py +381 -184
- xpk/core/blueprint/blueprint_definitions.py +1 -0
- xpk/core/blueprint/blueprint_generator.py +132 -76
- xpk/core/capacity.py +185 -0
- xpk/core/cluster.py +564 -0
- xpk/core/cluster_private.py +6 -3
- xpk/core/commands.py +18 -14
- xpk/core/config.py +179 -0
- xpk/core/docker_container.py +225 -0
- xpk/core/docker_image.py +210 -0
- xpk/core/docker_resources.py +350 -0
- xpk/core/filestore.py +251 -0
- xpk/core/gcloud_context.py +196 -0
- xpk/core/gcluster_manager.py +20 -2
- xpk/core/gcsfuse.py +50 -0
- xpk/core/kjob.py +257 -18
- xpk/core/kueue.py +12 -6
- xpk/core/monitoring.py +134 -0
- xpk/core/nap.py +32 -20
- xpk/core/network.py +377 -0
- xpk/core/nodepool.py +581 -0
- xpk/core/pathways.py +124 -45
- xpk/core/remote_state/__init__.py +15 -0
- xpk/core/remote_state/fuse_remote_state.py +99 -0
- xpk/core/remote_state/remote_state_client.py +38 -0
- xpk/core/resources.py +238 -0
- xpk/core/scheduling.py +253 -0
- xpk/core/storage.py +581 -0
- xpk/core/system_characteristics.py +38 -1
- xpk/core/vertex.py +105 -0
- xpk/core/workload.py +209 -1
- xpk/core/workload_decorators/rdma_decorator.py +25 -5
- xpk/core/workload_decorators/storage_decorator.py +52 -0
- xpk/core/workload_decorators/tcpxo_decorator.py +70 -37
- xpk/main.py +3 -1
- xpk/parser/batch.py +10 -151
- xpk/parser/cluster.py +49 -8
- xpk/parser/common.py +189 -1
- xpk/parser/config.py +49 -0
- xpk/parser/core.py +27 -1
- xpk/parser/info.py +2 -1
- xpk/parser/inspector.py +3 -3
- xpk/parser/job.py +25 -4
- xpk/parser/kind.py +3 -2
- xpk/parser/run.py +47 -0
- xpk/parser/shell.py +10 -1
- xpk/parser/storage.py +326 -0
- xpk/parser/validators.py +3 -3
- xpk/parser/workload.py +118 -76
- xpk/templates/__init__.py +15 -0
- xpk/templates/storage.yaml +13 -0
- xpk/utils/gcs_utils.py +125 -0
- xpk/utils/kubectl.py +57 -0
- xpk/utils/objects.py +8 -5
- xpk/utils/templates.py +28 -0
- xpk/utils/validation.py +80 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/METADATA +169 -15
- xpk-0.7.1.dist-info/RECORD +92 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/WHEEL +1 -1
- xpk/core/core.py +0 -2824
- xpk-0.6.0.dist-info/RECORD +0 -57
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/entry_points.txt +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info/licenses}/LICENSE +0 -0
- {xpk-0.6.0.dist-info → xpk-0.7.1.dist-info}/top_level.txt +0 -0
xpk/core/docker_resources.py
ADDED
@@ -0,0 +1,350 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
+from .cluster import setup_k8s_env
+from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, Storage, get_storages_to_mount
+from .system_characteristics import AcceleratorType, SystemCharacteristics
+
+
+def get_main_container_resources(
+    args, system: SystemCharacteristics, resource_type
+) -> str:
+  """Resources for the main container.
+  Args:
+    args: user provided args.
+    system: system characteristics.
+    resource_type: TPU / GPU / CPU
+
+  Returns:
+    str:
+      Workload resources port as a YAML string
+  """
+  # Resources requirements for Pathways workload containers are known.
+  resources_yaml = """cpu: "24"
+                memory: 100G"""
+  if args.use_pathways:
+    return resources_yaml
+
+  gpu_resources_yaml = """nvidia.com/gpu: {system.chips_per_vm}"""
+  if system.accelerator_type == AcceleratorType['GPU']:
+    return gpu_resources_yaml.format(system=system)
+
+  if system.accelerator_type == AcceleratorType['CPU']:
+    # CPUs don't have chips, but have a subresource called vCPUs.
+    # system.chips_per_vm is used as a proxy for vCPUs.
+    # Some vCPUs get used in hosting system pods of the workloads,
+    # hence an offset of 0.95 is introduced.
+    offset_vCPUs = int(system.chips_per_vm) * 0.95
+    return f'{resource_type}: {offset_vCPUs}'
+
+  return f'{resource_type}: {system.chips_per_vm}'
+
+
+def get_env_container(args, system: SystemCharacteristics) -> str:
+  """Environment configuration for the main container.
+  Args:
+    args: user provided args.
+    system: system characteristics.
+
+  Returns:
+    str:
+      YAML with the env config for the main container, as a YAML string.
+  """
+  pw_env_yaml = """
+                - name: XCLOUD_ENVIRONMENT
+                  value: GCP
+                - name: JAX_PLATFORMS
+                  value: proxy
+                - name: JAX_BACKEND_TARGET
+                  value: {proxy_address}
+                - name: JOBSET_NAME
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']"""
+  if args.use_pathways:
+    return pw_env_yaml.format(
+        args=args, proxy_address=args.pathways_proxy_address
+    )
+
+  gpu_env_yaml = """
+                - name: REPLICATED_JOB_NAME
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
+                - name: JOBSET_NAME
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
+                - name: JAX_COORDINATOR_ADDRESS
+                  value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
+                - name: NNODES
+                  value: "{args.num_nodes}"
+                - name: NODE_RANK
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+                - name: USE_GPUDIRECT
+                  value: {gpu_direct_name}
+                - name: GPUS_PER_NODE
+                  value: "{system.chips_per_vm}"
+                - name: JAX_COORDINATOR_PORT
+                  value: "6002"
+                - name: COMMAND
+                  value: "{args.command}"
+                {args.env}"""
+
+  if system.accelerator_type == AcceleratorType['GPU']:
+    gpu_direct_name = 'fastrak'
+    if args.device_type == H100_DEVICE_TYPE:
+      gpu_direct_name = 'tcpx'
+      gpu_env_yaml += """
+                - name: LD_LIBRARY_PATH
+                  value: /usr/local/nvidia/lib64
+      """
+    elif args.device_type == H100_MEGA_DEVICE_TYPE:
+      gpu_direct_name = 'tcpxo'
+    elif args.device_type == H200_DEVICE_TYPE:
+      gpu_direct_name = 'rdma'
+    return gpu_env_yaml.format(
+        args=args, system=system, gpu_direct_name=gpu_direct_name
+    )
+
+  if system.accelerator_type == AcceleratorType['CPU']:
+    return get_cpu_env(args.num_slices, args.env, system)
+
+  return args.env  # pytype: disable=bad-return-type
+
+
+def get_cpu_env(num_slices, env_vars, system) -> str:
+  """Generate environment variables for CPU nodepools
+  Args:
+    num_slices: Number of slices to be used in the workload.
+    env_vars: Environment variables, processed from user args.
+    system: system characteristics
+
+  Returns:
+    str: yaml containing env variables
+  """
+  yaml = """
+                - name: REPLICATED_JOB_NAME
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
+                - name: JOB_INDEX
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.annotations['jobset.sigs.k8s.io/job-index']
+                - name: JOB_COMPLETION_INDEX
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
+                - name: PROCESSES_IN_JOB
+                  value: "{processes_in_job}"
+                - name: JAX_PROCESS_COUNT
+                  value: "{process_count}"
+                {env_vars}
+                - name: JAX_COORDINATOR_ADDRESS
+                  value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
+  """
+  return yaml.format(
+      processes_in_job=system.vms_per_slice,
+      process_count=calculate_process_count(num_slices, system.vms_per_slice),
+      env_vars=env_vars,
+  )
+
+
+def get_volumes(args, system: SystemCharacteristics) -> str:
+  """Get volumes accessible to the containers in the pod.
+  Args:
+    args: user provided args.
+    system: system characteristics.
+
+  Returns:
+    str:
+      YAML for the volumes.
+  """
+  volumes = """- emptyDir:
+                  medium: Memory
+                name: dshm-2
+              """
+
+  if args.ramdisk_directory != '':
+    volumes += """
+              - name: cache
+                csi:
+                  driver: phase1-checkpoint.csi.storage.gke.io"""
+
+  if (
+      system.accelerator_type == AcceleratorType['TPU']
+      and args.deploy_stacktrace_sidecar
+  ):
+    volumes += """
+              - name: tpu-stack-trace
+              - name: shared-data
+              """
+
+  storages: list[Storage] = get_storages_to_mount(
+      setup_k8s_env(args), args.storage
+  )
+  for storage in storages:
+    if storage.type == GCS_FUSE_TYPE:
+      volumes += f"""- name: {storage.pv}
+                persistentVolumeClaim:
+                  claimName: {storage.pvc}
+                  readOnly: {storage.readonly}
+              """
+    if storage.type == GCP_FILESTORE_TYPE:
+      volumes += f"""- name: {storage.pv}
+                persistentVolumeClaim:
+                  claimName: {storage.pvc}
+                  readOnly: {storage.readonly}
+              """
+  return volumes
+
+
+def get_volume_mounts(args, system: SystemCharacteristics) -> str:
+  """Resources for the main container.
+  Args:
+    args: user provided args.
+
+  Returns:
+    str:
+      YAML for the volumes mounted within a Pathways container or GPU container as a YAML string.
+  """
+  volume_mount_yaml = """- mountPath: /dev/shm
+                  name: dshm-2
+                """
+
+  if args.ramdisk_directory != '':
+    volume_mount_yaml += f"""
+                - mountPath: /{args.ramdisk_directory}
+                  name: cache"""
+
+  if args.use_pathways:
+    volume_mount_yaml = """- mountPath: /tmp
+                  name: shared-tmp
+                """
+  elif (
+      system.accelerator_type == AcceleratorType['TPU']
+      and args.deploy_stacktrace_sidecar
+  ):
+    volume_mount_yaml += """- name: tpu-stack-trace
+                  mountPath: /tmp/debugging
+                - name: shared-data
+                  mountPath: /shared-volume
+                """
+  elif system.accelerator_type == AcceleratorType['GPU']:
+    if system.device_type == H100_DEVICE_TYPE:
+      volume_mount_yaml = """- name: nvidia-install-dir-host
+                  mountPath: /usr/local/nvidia/lib64
+                - name: tcpx-nccl-plugin-volume
+                  mountPath: /usr/local/tcpx
+                - name: tcpd-socket
+                  mountPath: /tmp
+                - name: shared-memory
+                  mountPath: /dev/shm
+                - name: workload-terminated-volume
+                  mountPath: /usr/share/workload"""
+    elif (
+        system.device_type == H100_MEGA_DEVICE_TYPE
+        or system.device_type == H200_DEVICE_TYPE
+    ):
+      volume_mount_yaml = ''
+
+  storages: list[Storage] = get_storages_to_mount(
+      setup_k8s_env(args), args.storage
+  )
+  for storage in storages:
+    if storage.type == GCS_FUSE_TYPE:
+      volume_mount_yaml += f"""- name: {storage.pv}
+                  mountPath: {storage.mount_point}
+                  readOnly: {storage.readonly}
+                """
+    if storage.type == GCP_FILESTORE_TYPE:
+      volume_mount_yaml += f"""- name: {storage.pv}
+                  mountPath: {storage.mount_point}
+                  readOnly: {storage.readonly}
+                """
+  return volume_mount_yaml
+
+
+def calculate_process_count(num_slices, vms_per_slice) -> str:
+  """Calculates the total number of processes in the workload.
+  Args:
+    num_slices: Number of slices to be used in the workload.
+    vms_per_slice: number of VMs in each slice.
+
+  Returns:
+    str: total number of processes.
+  """
+  num_processes = int(num_slices) * int(vms_per_slice)
+
+  return f'{num_processes}'
+
+
+def add_container_ports(args, system: SystemCharacteristics) -> str:
+  """Add slice builder and megascale container ports,
+  for non-pathways workloads.
+
+  Args:
+    args: user provided args.
+
+  Returns:
+    str:
+      Pathways server port as a YAML string
+  """
+  port_yaml = """- containerPort: 8471
+              - containerPort: 8080"""
+  if args.use_pathways:
+    return ''
+
+  gpu_port_yaml = """- containerPort: 6002"""
+  if system.accelerator_type == AcceleratorType['GPU']:
+    return gpu_port_yaml
+  return port_yaml
+
+
+def add_jax_coordinator_port(system) -> str:
+  """Add jax coordinator port only for CPUs
+
+  Args:
+    system: system characteristics.
+
+  Returns:
+    str:
+      jax coordinator port as a YAML string
+  """
+  if system.accelerator_type == AcceleratorType['CPU']:
+    return '- containerPort: 1234'
+  return ''
+
+
+def add_image_pull_policy_for_pw_or_gpu(args, system: SystemCharacteristics):
+  """Add image pull policy only for Pathways containers.
+  Args:
+    args: user provided args.
+    system: system characteristics
+
+  Returns:
+    str:
+      YAML stating that the image will be pulled fro GCR every time.
+  """
+  yaml = """imagePullPolicy: Always"""
+
+  if args.use_pathways or system.accelerator_type == AcceleratorType['GPU']:
+    return yaml.format(args=args)
+  return ''
xpk/core/filestore.py
ADDED
@@ -0,0 +1,251 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from enum import Enum
+
+from google.cloud import filestore_v1
+from google.cloud.exceptions import GoogleCloudError
+from google.cloud.filestore_v1.types import (
+    FileShareConfig,
+    Instance,
+    NetworkConfig,
+)
+
+from ..utils import templates
+from ..utils.console import xpk_exit, xpk_print
+from .cluster import zone_to_region
+
+FS_PV_PATH = "/../templates/filestore-pv.yaml"
+FS_PVC_PATH = "/../templates/filestore-pvc.yaml"
+FS_SC_PATH = "/../templates/filestore-sc.yaml"
+
+
+class Availability(Enum):
+  ZONAL = "Zonal"
+  REGIONAL = "Regional"
+
+
+TIERS = {
+    "BASIC_HDD": Availability.ZONAL,
+    "BASIC_SSD": Availability.ZONAL,
+    "ZONAL": Availability.ZONAL,
+    "REGIONAL": Availability.REGIONAL,
+    "ENTERPRISE": Availability.REGIONAL,
+}
+
+
+def get_storage_class_name(storage_name: str) -> str:
+  return f"{storage_name}-sc"
+
+
+def get_pv_name(storage_name: str) -> str:
+  return f"{storage_name}-pv"
+
+
+def get_pvc_name(storage_name: str) -> str:
+  return f"{storage_name}-pvc"
+
+
+class FilestoreClient:
+  """FilestoreClient is a class for interacting with GCP filestore instances."""
+
+  def __init__(
+      self,
+      zone: str,
+      name: str,
+      project: str,
+  ) -> None:
+    self.zone = zone
+    self.region = zone_to_region(zone)
+    self.name = name
+    self.project = project
+    self._client = filestore_v1.CloudFilestoreManagerClient()
+    self.instance: Instance | None = None
+
+  def get_instance(self) -> Instance | None:
+    """Get existing Filestore instance"""
+    parentZonal = self.get_parent(self.zone)
+    parentRegional = self.get_parent(self.region)
+    reqZonal = filestore_v1.ListInstancesRequest(parent=parentZonal)
+    reqRegional = filestore_v1.ListInstancesRequest(parent=parentRegional)
+    try:
+      instancesZonal = self._client.list_instances(reqZonal)
+      instancesRegional = self._client.list_instances(reqRegional)
+    except GoogleCloudError as e:
+      xpk_print(f"Exception while trying to list instances {e}")
+      xpk_exit(1)
+
+    fullname_zonal = self.get_instance_fullname(self.zone)
+    fullname_regional = self.get_instance_fullname(self.region)
+
+    for instance in instancesZonal:
+      if instance.name == fullname_zonal:
+        return instance  # pytype: disable=bad-return-type
+
+    for instance in instancesRegional:
+      if instance.name == fullname_regional:
+        return instance  # pytype: disable=bad-return-type
+
+  def check_instance_exists(self) -> bool:
+    """Check if Filestore instance exists"""
+    instance = self.get_instance()
+    return instance is not None
+
+  def load_instance(self) -> None:
+    if self.instance is None:
+      self.instance = self.get_instance()
+
+  def get_instance_location(self) -> str:
+    """Get Filestore instance's location"""
+    self.load_instance()
+    return str(self.instance.name.split("/")[3])
+
+  def create_instance(
+      self,
+      vol: str,
+      size: int,
+      tier: str,
+      connect_mode=None,
+      reserved_ip_range=None,
+      network: str = "default",
+      description: str = "XPK created filestore instance",
+      kms_key_name=None,
+      source_backup=None,
+      nfs_export_options=None,
+      modes=None,
+  ) -> None:
+    """Create new Filestore instance"""
+
+    location = (
+        self.zone
+        if TIERS[tier].value == Availability.ZONAL.value
+        else self.region
+    )
+
+    file_shares = [
+        FileShareConfig(
+            name=vol,
+            capacity_gb=size,
+            source_backup=source_backup,
+            nfs_export_options=nfs_export_options,
+        )
+    ]
+    networks = [
+        NetworkConfig(
+            network=network,
+            modes=modes,
+            reserved_ip_range=reserved_ip_range,
+            connect_mode=connect_mode,
+        )
+    ]
+    request = filestore_v1.CreateInstanceRequest(
+        parent=self.get_parent(location),
+        instance_id=self.name,
+        instance=Instance(
+            description=description,
+            tier=tier,
+            kms_key_name=kms_key_name,
+            file_shares=file_shares,
+            networks=networks,
+        ),
+    )
+    # Make the request
+    operation = self._client.create_instance(request=request)
+    xpk_print("Waiting for filestore creation to complete...")
+    self.instance = None
+    try:
+      self.instance = operation.result()
+    except GoogleCloudError as e:
+      xpk_print(f"Error while creating Filestore instance: {e}")
+      xpk_exit(1)
+    xpk_print(
+        f"Filestore instance {self.get_instance_fullname(location)} created"
+    )
+
+  def delete_filestore_instance(self):
+    # Initialize request
+    name = self.get_instance_fullname()
+    request = filestore_v1.DeleteInstanceRequest(name=name)
+
+    # Make the request
+    operation = self._client.delete_instance(request)
+    xpk_print("Waiting for filestore deletion to complete...")
+    try:
+      operation.result()
+    except GoogleCloudError as e:
+      xpk_print(f"Error while deleting Filestore instance: {e}")
+      xpk_exit(1)
+    xpk_print(f"Filestore instance {name} deleted")
+
+  def create_sc(self, name: str, network: str) -> dict:
+    """Create a yaml representing filestore StorageClass."""
+    data = templates.load(FS_SC_PATH)
+    data["metadata"]["name"] = get_storage_class_name(name)
+    data["parameters"]["tier"] = self.instance.tier.name
+    data["parameters"][
+        "network"
+    ] = f"projects/{self.project}/global/networks/{network}"
+    return data
+
+  def create_pv(self, name: str, vol: str, access_mode: str) -> dict:
+    """Create a yaml representing filestore PersistentVolume."""
+    data = templates.load(FS_PV_PATH)
+    data["metadata"]["name"] = get_pv_name(name)
+    data["spec"]["storageClassName"] = get_storage_class_name(name)
+    data["spec"]["capacity"]["storage"] = self.instance.file_shares[
+        0
+    ].capacity_gb
+    data["spec"]["accessModes"] = [access_mode]
+    volumeHandle = f"{self.get_instance_fullname()}/volumes/{vol}"
+    data["spec"]["csi"]["volumeHandle"] = volumeHandle
+    data["spec"]["csi"]["volumeAttributes"]["ip"] = self.instance.networks[
+        0
+    ].ip_addresses[0]
+    data["spec"]["csi"]["volumeAttributes"]["volume"] = vol
+    return data
+
+  def create_pvc(self, name: str, access_mode: str) -> dict:
+    """Create a yaml representing filestore PersistentVolumeClaim."""
+    data = templates.load(FS_PVC_PATH)
+    data["metadata"]["name"] = get_pvc_name(name)
+    data["spec"]["accessModes"] = [access_mode]
+    data["spec"]["storageClassName"] = get_storage_class_name(name)
+    data["spec"]["volumeName"] = get_pv_name(name)
+    data["spec"]["resources"]["requests"]["storage"] = (
+        self.instance.file_shares[0].capacity_gb
+    )
+    return data
+
+  def manifest(
+      self, name: str, vol: str, access_mode: str, network: str
+  ) -> list[dict]:
+    self.load_instance()
+    pv = self.create_pv(name, vol, access_mode)
+    pvc = self.create_pvc(name, access_mode)
+    sc = self.create_sc(name, network)
+    return [pv, pvc, sc]
+
+  def get_parent(self, location: str | None = None) -> str:
+    """Get the Filestore's parent's name"""
+    if location is None:
+      location = self.get_instance_location()
+    return f"projects/{self.project}/locations/{location}"
+
+  def get_instance_fullname(self, location: str | None = None) -> str:
+    """Get the Filestore's full name"""
+    if location is None:
+      location = self.get_instance_location()
+    return f"projects/{self.project}/locations/{location}/instances/{self.name}"
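
FilestoreClient wraps the google-cloud-filestore v1 API and renders StorageClass/PV/PVC manifests from the bundled templates. A hedged sketch of the intended flow, under the assumption that it is driven the way xpk's storage command would drive it; the project, zone, instance, and share names are placeholders, and the tier must be one of the TIERS keys above.

# Hedged usage sketch; placeholder values throughout, and the apply step is
# left out (xpk applies these manifests through its own kubectl/k8s helpers).
from xpk.core.filestore import FilestoreClient

client = FilestoreClient(
    zone='us-central1-a', name='my-share', project='my-project'
)

if not client.check_instance_exists():
  # BASIC_HDD maps to Availability.ZONAL in TIERS, so the instance is
  # created in the zone rather than the region.
  client.create_instance(vol='default', size=1024, tier='BASIC_HDD')

# PV, PVC, and StorageClass dicts, ready to be serialized to YAML and
# applied to the cluster.
pv, pvc, sc = client.manifest(
    name='my-share', vol='default', access_mode='ReadWriteMany',
    network='default'
)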