PyPI - xpk - Versions diffs - 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl - Mend

xpk-0.9.0-py3-none-any.whl → xpk-0.10.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

xpk/commands/batch.py +3 -3
xpk/commands/cluster.py +22 -1
xpk/commands/cluster_gcluster.py +27 -0
xpk/commands/common.py +12 -5
xpk/commands/kjob_common.py +4 -1
xpk/commands/run.py +2 -2
xpk/commands/shell.py +2 -2
xpk/commands/storage.py +10 -3
xpk/commands/workload.py +64 -27
xpk/core/blueprint/blueprint_generator.py +108 -40
xpk/core/capacity.py +66 -6
xpk/core/cluster.py +165 -7
xpk/core/config.py +1 -65
xpk/core/docker_manager.py +1 -1
xpk/core/docker_resources.py +145 -72
xpk/core/jobset.py +143 -0
xpk/core/kjob.py +2 -6
xpk/core/kueue.py +154 -5
xpk/core/nodepool.py +17 -4
xpk/core/pathways.py +1 -2
xpk/core/storage.py +1 -95
xpk/core/system_characteristics.py +1 -1
xpk/core/workload.py +0 -44
xpk/core/workload_decorators/rdma_decorator.py +2 -0
xpk/core/workload_decorators/tcpx_decorator.py +10 -4
xpk/core/workload_decorators/tcpxo_decorator.py +7 -0
xpk/parser/cluster.py +23 -7
xpk/parser/storage.py +2 -2
xpk/parser/workload.py +21 -3
{xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/METADATA +46 -7
{xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/RECORD +35 -34
{xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/WHEEL +0 -0
{xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/entry_points.txt +0 -0
{xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/licenses/LICENSE +0 -0
{xpk-0.9.0.dist-info → xpk-0.10.1.dist-info}/top_level.txt +0 -0

xpk/core/config.py CHANGED Viewed

@@ -15,16 +15,14 @@ limitations under the License.
 """
 import os
-import re
 import ruamel.yaml
 from ..utils import file
 from ..utils.console import xpk_print
-from .system_characteristics import AcceleratorType, SystemCharacteristics
 # This is the version for XPK PyPI package
-__version__ = 'v0.9.0'
+__version__ = 'v0.10.1'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')
@@ -117,65 +115,3 @@ class XpkConfig:
       return None
     val: dict[str, str] = config_yaml[CONFIGS_KEY]
     return val
-def parse_env_config(args, tensorboard_config, system: SystemCharacteristics):
-  """Parses the environment configurations to the jobset config.
-  Args:
-    args: user provided arguments for running the command.
-    tensorboard_config: configuration of Vertex Tensorboard.
-    system: system characteristics.
-  """
-  env = {}
-  env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
-  if args.env_file:
-    print('Setting container environment from', args.env_file)
-    with open(file=args.env_file, mode='r', encoding='utf-8') as f:
-      for match in env_pat.finditer(f.read()):
-        variable = match.group(1)
-        if match.group(2) is not None:
-          env[variable] = match.group(2)
-        else:
-          assert variable in os.environ, (
-              f'Variable {variable} is not set in the current '
-              'environment, a value must be specified.'
-          )
-          env[variable] = os.environ[variable]
-  if args.env:
-    for var in args.env:
-      match = env_pat.match(var)
-      assert match and match.group(2) is not None, (
-          'Invalid environment variable, format must be '
-          f'`--env VARIABLE=value`: {var}'
-      )
-      variable = match.group(1)
-      env[variable] = match.group(2)
-  if not args.use_pathways:
-    if args.debug_dump_gcs:
-      if 'XLA_FLAGS' in env:
-        raise ValueError(
-            'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
-            'and environment file. Please choose one way to define '
-            'XLA_FLAGS.'
-        )
-      env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
-    if tensorboard_config:
-      env['UPLOAD_DATA_TO_TENSORBOARD'] = True
-      for key, value in tensorboard_config.items():
-        env[key.upper()] = value
-  if system.accelerator_type == AcceleratorType['GPU']:
-    # For GPUs, it has two more spaces ahead of name and value respectively
-    env_format = '''
-                  - name: {key}
-                    value: "{value}"'''
-  else:
-    env_format = '''
-                - name: {key}
-                  value: "{value}"'''
-  args.env = ''.join(env_format.format(key=k, value=v) for k, v in env.items())

xpk/core/docker_manager.py CHANGED Viewed

@@ -30,7 +30,7 @@ import time
 DockerRunCommandExitCode = 135
 dockerBuildErrorCode = 134
 ctk_dockerfile_path = "Dockerfile"
-ctk_build_ref = "v1.48.0"
+ctk_build_ref = "v1.57.1"
 ctk_docker_image = "xpk-ctk"
 ctk_container_name = "xpk-ctk-container"
 gcloud_cfg_mount_path = "/root/.config/gcloud"

xpk/core/docker_resources.py CHANGED Viewed

@@ -14,9 +14,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
-from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE, B200_DEVICE_TYPE
+import os
+import re
+from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .cluster import setup_k8s_env
-from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, Storage, get_storages_to_mount
+from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, PARALLELSTORE_TYPE, GCE_PD_TYPE, LUSTRE_TYPE, Storage, get_storages_to_mount
 from .system_characteristics import AcceleratorType, SystemCharacteristics
@@ -64,6 +66,25 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
     str:
       YAML with the env config for the main container, as a YAML string.
   """
+  if system.accelerator_type == AcceleratorType['GPU']:
+    return get_gpu_env(args, system)
+  if system.accelerator_type == AcceleratorType['CPU']:
+    return get_cpu_env(args, system)
+  return format_env_dict(args.env, system)  # pytype: disable=bad-return-type
+def get_gpu_env(args, system) -> str:
+  """Generate environment variables for GPU nodepools
+  Args:
+    num_slices: Number of slices to be used in the workload.
+    env_vars: Environment variables, processed from user args.
+    system: system characteristics
+  Returns:
+    str: yaml containing env variables
+  """
   gpu_env_yaml = """
                   - name: REPLICATED_JOB_NAME
                     valueFrom:
@@ -73,8 +94,6 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
                     valueFrom:
                       fieldRef:
                         fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
-                  - name: JAX_COORDINATOR_ADDRESS
-                    value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
                   - name: NNODES
                     value: "{args.num_nodes}"
                   - name: NODE_RANK
@@ -84,36 +103,37 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
                   - name: USE_GPUDIRECT
                     value: {gpu_direct_name}
                   - name: GPUS_PER_NODE
-                    value: "{system.chips_per_vm}"
-                  - name: JAX_COORDINATOR_PORT
-                    value: "6002"
+                    value: "{chips_per_vm}"
                   - name: COMMAND
                     value: "{args.command}"
-                  {args.env}"""
-  if system.accelerator_type == AcceleratorType['GPU']:
-    gpu_direct_name = 'fastrak'
-    if args.device_type == H100_DEVICE_TYPE:
-      gpu_direct_name = 'tcpx'
-      gpu_env_yaml += """
-                  - name: LD_LIBRARY_PATH
-                    value: /usr/local/nvidia/lib64
-"""
-    elif args.device_type == H100_MEGA_DEVICE_TYPE:
-      gpu_direct_name = 'tcpxo'
-    elif args.device_type == H200_DEVICE_TYPE:
-      gpu_direct_name = 'rdma'
-    return gpu_env_yaml.format(
-        args=args, system=system, gpu_direct_name=gpu_direct_name
-    )
-  if system.accelerator_type == AcceleratorType['CPU']:
-    return get_cpu_env(args.num_slices, args.env, system)
-  return args.env  # pytype: disable=bad-return-type
+                  {custom_envs}"""
+  gpu_direct_name = 'fastrak'
+  if args.device_type == H100_DEVICE_TYPE:
+    gpu_direct_name = 'tcpx'
+  elif args.device_type == H100_MEGA_DEVICE_TYPE:
+    gpu_direct_name = 'tcpxo'
+  elif args.device_type == H200_DEVICE_TYPE:
+    gpu_direct_name = 'rdma'
+  gpu_env_dic = {
+      'JAX_COORDINATOR_PORT': '6002',
+      'JAX_COORDINATOR_ADDRESS': (
+          '$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)'
+      ),
+  }
+  args.env = gpu_env_dic | args.env
+  return gpu_env_yaml.format(
+      args=args,
+      chips_per_vm=system.chips_per_vm,
+      gpu_direct_name=gpu_direct_name,
+      custom_envs=format_env_dict(args.env, system),
+  )
-def get_cpu_env(num_slices, env_vars, system) -> str:
+def get_cpu_env(args, system) -> str:
   """Generate environment variables for CPU nodepools
   Args:
     num_slices: Number of slices to be used in the workload.
@@ -136,19 +156,87 @@ def get_cpu_env(num_slices, env_vars, system) -> str:
                   valueFrom:
                     fieldRef:
                       fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
-                - name: PROCESSES_IN_JOB
-                  value: "{processes_in_job}"
-                - name: JAX_PROCESS_COUNT
-                  value: "{process_count}"
-                {env_vars}
-                - name: JAX_COORDINATOR_ADDRESS
-                  value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
+                {custom_envs}
   """
-  return yaml.format(
-      processes_in_job=system.vms_per_slice,
-      process_count=calculate_process_count(num_slices, system.vms_per_slice),
-      env_vars=env_vars,
-  )
+  cpu_env_dic = {
+      'PROCESSES_IN_JOB': str(system.vms_per_slice),
+      'JAX_PROCESS_COUNT': str(
+          calculate_process_count(args.num_slices, system.vms_per_slice)
+      ),
+      'JAX_COORDINATOR_ADDRESS': (
+          '$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)'
+      ),
+  }
+  args.env = cpu_env_dic | args.env
+  return yaml.format(custom_envs=format_env_dict(args.env, system))
+def format_env_dict(env, system: SystemCharacteristics) -> str:
+  if system.accelerator_type == AcceleratorType['GPU']:
+    # For GPUs, it has two more spaces ahead of name and value respectively
+    env_format = '''
+                  - name: {key}
+                    value: "{value}"'''
+  else:
+    env_format = '''
+                - name: {key}
+                  value: "{value}"'''
+  return ''.join(env_format.format(key=k, value=v) for k, v in env.items())
+def parse_env_config(args, tensorboard_config):
+  """Parses the environment configurations to the a dictionary.
+  Args:
+    args: user provided arguments for running the command.
+    tensorboard_config: configuration of Vertex Tensorboard.
+    system: system characteristics.
+  """
+  env = {}
+  env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
+  if args.env_file:
+    print('Setting container environment from', args.env_file)
+    with open(file=args.env_file, mode='r', encoding='utf-8') as f:
+      for match in env_pat.finditer(f.read()):
+        variable = match.group(1)
+        if match.group(2) is not None:
+          env[variable] = match.group(2)
+        else:
+          assert variable in os.environ, (
+              f'Variable {variable} is not set in the current '
+              'environment, a value must be specified.'
+          )
+          env[variable] = os.environ[variable]
+  if args.env:
+    for var in args.env:
+      match = env_pat.match(var)
+      assert match and match.group(2) is not None, (
+          'Invalid environment variable, format must be '
+          f'`--env VARIABLE=value`: {var}'
+      )
+      variable = match.group(1)
+      env[variable] = match.group(2)
+  if not args.use_pathways:
+    if args.debug_dump_gcs:
+      if 'XLA_FLAGS' in env:
+        raise ValueError(
+            'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
+            'and environment file. Please choose one way to define '
+            'XLA_FLAGS.'
+        )
+      env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
+    if tensorboard_config:
+      env['UPLOAD_DATA_TO_TENSORBOARD'] = True
+      for key, value in tensorboard_config.items():
+        env[key.upper()] = value
+  args.env = env
 def get_volumes(args, system: SystemCharacteristics) -> str:
@@ -188,13 +276,13 @@ def get_volumes(args, system: SystemCharacteristics) -> str:
       setup_k8s_env(args), args.storage
   )
   for storage in storages:
-    if storage.type == GCS_FUSE_TYPE:
-      volumes += f"""- name: {storage.pv}
-                persistentVolumeClaim:
-                  claimName: {storage.pvc}
-                  readOnly: {storage.readonly}
-              """
-    if storage.type == GCP_FILESTORE_TYPE:
+    if storage.type in {
+        GCS_FUSE_TYPE,
+        GCP_FILESTORE_TYPE,
+        PARALLELSTORE_TYPE,
+        GCE_PD_TYPE,
+        LUSTRE_TYPE,
+    }:
       volumes += f"""- name: {storage.pv}
                 persistentVolumeClaim:
                   claimName: {storage.pvc}
@@ -235,34 +323,19 @@ def get_volume_mounts(args, system: SystemCharacteristics) -> str:
                   mountPath: /shared-volume
                 """
   elif system.accelerator_type == AcceleratorType['GPU']:
-    if system.device_type == H100_DEVICE_TYPE:
-      volume_mount_yaml = """- name: nvidia-install-dir-host
-                  mountPath: /usr/local/nvidia/lib64
-                - name: tcpx-nccl-plugin-volume
-                  mountPath: /usr/local/tcpx
-                - name: tcpd-socket
-                  mountPath: /tmp
-                - name: shared-memory
-                  mountPath: /dev/shm
-                - name: workload-terminated-volume
-                  mountPath: /usr/share/workload"""
-    elif (
-        system.device_type == H100_MEGA_DEVICE_TYPE
-        or system.device_type == H200_DEVICE_TYPE
-        or system.device_type == B200_DEVICE_TYPE
-    ):
-      volume_mount_yaml = ''
+    volume_mount_yaml = ''
   storages: list[Storage] = get_storages_to_mount(
       setup_k8s_env(args), args.storage
   )
   for storage in storages:
-    if storage.type == GCS_FUSE_TYPE:
-      volume_mount_yaml += f"""- name: {storage.pv}
-                  mountPath: {storage.mount_point}
-                  readOnly: {storage.readonly}
-                """
-    if storage.type == GCP_FILESTORE_TYPE:
+    if storage.type in {
+        GCS_FUSE_TYPE,
+        GCP_FILESTORE_TYPE,
+        PARALLELSTORE_TYPE,
+        GCE_PD_TYPE,
+        LUSTRE_TYPE,
+    }:
       volume_mount_yaml += f"""- name: {storage.pv}
                   mountPath: {storage.mount_point}
                   readOnly: {storage.readonly}

xpk/core/jobset.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""
+Copyright 2024 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import math
+from ..utils.console import xpk_exit, xpk_print
+from ..utils.file import write_tmp_file
+from ..core.kueue import (
+    MEMORY_SIZE_PER_VM,
+    MIN_MEMORY_LIMIT_SIZE,
+)
+from .commands import (
+    run_command_for_value,
+    run_command_with_updates_retry,
+)
+jobset_controller_manager_yml = """
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app.kubernetes.io/component: manager
+    app.kubernetes.io/created-by: jobset
+    app.kubernetes.io/instance: controller-manager
+    app.kubernetes.io/managed-by: kustomize
+    app.kubernetes.io/name: deployment
+    app.kubernetes.io/part-of: jobset
+    control-plane: controller-manager
+  name: jobset-controller-manager
+  namespace: jobset-system
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      control-plane: controller-manager
+  template:
+    metadata:
+      annotations:
+        kubectl.kubernetes.io/default-container: manager
+      labels:
+        control-plane: controller-manager
+    spec:
+      containers:
+      - args:
+        - --config=/controller_manager_config.yaml
+        - --zap-log-level=2
+        command:
+        - /manager
+        image: registry.k8s.io/jobset/jobset:v0.8.0
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8081
+          initialDelaySeconds: 15
+          periodSeconds: 20
+        name: manager
+        ports:
+        - containerPort: 9443
+          name: webhook-server
+          protocol: TCP
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: 8081
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        resources:
+          limits:
+            memory: {memory_limit_size}
+          requests:
+            cpu: 500m
+            memory: 128Mi
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+        volumeMounts:
+        - mountPath: /controller_manager_config.yaml
+          name: manager-config
+          subPath: controller_manager_config.yaml
+        - mountPath: /tmp/k8s-webhook-server/serving-certs
+          name: cert
+          readOnly: true
+      securityContext:
+        runAsNonRoot: true
+      serviceAccountName: jobset-controller-manager
+      terminationGracePeriodSeconds: 10
+      volumes:
+      - configMap:
+          name: jobset-manager-config
+        name: manager-config
+      - name: cert
+        secret:
+          defaultMode: 420
+          secretName: jobset-webhook-server-cert
+"""
+def update_jobset_resources_if_necessary(args):
+  """Update the jobset manifest to increase the resources for the jobset controller manager.
+  Args:
+    args: user provided arguments for running the command.
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  # Get total number of nodes
+  cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
+  return_code, out = run_command_for_value(
+      cmd_total_node_num, 'Count total nodes', args
+  )
+  if return_code != 0:
+    xpk_exit(1)
+  # 1.2MiB per VM or 4GiB (whichever is greater).
+  new_memory_limit = (
+      f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi'
+  )
+  yml_string = jobset_controller_manager_yml.format(
+      memory_limit_size=new_memory_limit,
+  )
+  tmp = write_tmp_file(yml_string)
+  command = f'kubectl apply -f {str(tmp.file.name)}'
+  task = 'Updating jobset Controller Manager resources'
+  return_code = run_command_with_updates_retry(command, task, args)
+  if return_code != 0:
+    xpk_print(f'{task} returned ERROR {return_code}')
+  return return_code

xpk/core/kjob.py CHANGED Viewed

@@ -40,11 +40,8 @@ from .config import (
     XpkConfig,
 )
 from .network import get_cluster_subnetworks
-from .resources import (
-    AcceleratorType,
-    SystemCharacteristics,
-    get_cluster_system_characteristics,
-)
+from .system_characteristics import AcceleratorType, SystemCharacteristics
+from .resources import get_cluster_system_characteristics
 from .storage import (
     GCS_FUSE_ANNOTATIONS,
     PARALLELSTORE_ANNOTATIONS,
@@ -380,7 +377,6 @@ def prepare_kjob(args: Namespace) -> int:
   job_err_code = create_job_template_instance(args, system, service_account)
   if job_err_code > 0:
     return job_err_code
   pod_err_code = create_pod_template_instance(args, service_account)
   if pod_err_code > 0:
     return pod_err_code

xpk 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

xpk-0.9.0-py3-none-any.whl → xpk-0.10.1-py3-none-any.whl