xpk 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xpk/commands/cluster.py CHANGED
@@ -78,6 +78,8 @@ from ..utils.console import get_user_input, xpk_exit, xpk_print
 from ..utils.file import write_tmp_file
 from . import cluster_gcluster
 from .common import set_cluster_command
+import shutil
+import os


 def cluster_adapt(args) -> None:
@@ -247,6 +249,10 @@ def cluster_create(args) -> None:

   get_cluster_credentials(args)

+  update_coredns_command_code = update_coredns_if_necessary(args)
+  if update_coredns_command_code != 0:
+    xpk_exit(update_coredns_command_code)
+
   k8s_client = setup_k8s_env(args)

   install_storage_crd(k8s_client)
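Note: the new CoreDNS guard follows the same return-code convention as the other cluster_create steps: the helper returns 0 on success, and any non-zero code is forwarded straight to xpk_exit. A minimal sketch of that pattern (only update_coredns_if_necessary and xpk_exit come from this diff; the surrounding function body is elided):

```python
# Sketch of the return-code convention used in cluster_create (illustrative).
return_code = update_coredns_if_necessary(args)
if return_code != 0:
  xpk_exit(return_code)  # abort cluster creation if CoreDNS setup failed
```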
@@ -702,6 +708,262 @@ def cluster_create_ray_cluster(args) -> None:
   cluster_create(args)


+def install_jq(args):
+  """Installs 'jq' utility."""
+  if shutil.which('jq'):
+    xpk_print("Task: 'Install jq' skipped, jq already installed.")
+    return
+  command_jq_install = 'sudo apt install jq -y'
+  xpk_print("Task: 'Install jq' in progress.")
+  return_code = run_command_with_updates(command_jq_install, 'Install jq', args)
+  if return_code != 0:
+    xpk_print(f'Install jq error {return_code}')
+    xpk_exit(return_code)
+
+
+def clone_coredns_deployment_repo(args, coredns_repo_full_path: str):
+  """Clones the CoreDNS deployment repository if it doesn't exist."""
+  if os.path.exists(coredns_repo_full_path):
+    xpk_print(
+        f"Directory '{coredns_repo_full_path}' already exists, skip git clone."
+    )
+    return
+  command_git_clone = (
+      'git clone https://github.com/coredns/deployment.git'
+      f' {coredns_repo_full_path}'
+  )
+  xpk_print(
+      "Task: 'Clone deployment' in progress, Target"
+      f' directory:{coredns_repo_full_path}.'
+  )
+  return_code = run_command_with_updates(
+      command_git_clone, 'Clone deployment', args
+  )
+  if return_code != 0:
+    xpk_print(f'Clone deployment error {return_code}')
+    xpk_exit(return_code)
+
+
+def deploy_coredns_manifests(args, coredns_k8s_path: str):
+  """Deploys CoreDNS manifests to the cluster."""
+  if not os.path.isdir(coredns_k8s_path):
+    xpk_print(
+        f"Error:CoreDNS Kubernetes path '{coredns_k8s_path}' does not exist."
+        ' Has git clone been successful?'
+    )
+    xpk_exit(1)
+  original_cwd = os.getcwd()
+  try:
+    os.chdir(coredns_k8s_path)
+    xpk_print(f'Current working directory changed to: {os.getcwd()}')
+
+    command_deploy_coredns = './deploy.sh | kubectl apply -f -'
+    xpk_print(
+        f"Task: 'Deploy CoreDNS' in progress, Located at '{coredns_k8s_path}'"
+    )
+    return_code = run_command_with_updates(
+        command_deploy_coredns, 'Deploy CoreDNS', args
+    )
+    if return_code != 0:
+      xpk_print(f'Deploy CoreDNS error {return_code}')
+
+  finally:
+    xpk_print(f'Restoring working directory to: {original_cwd}')
+    os.chdir(original_cwd)
+  if return_code != 0:
+    xpk_exit(return_code)
+
+
+def scale_down_deployment(
+    args, deployment_name: str, namespace: str = 'kube-system'
+):
+  """Scales down a specified Kubernetes deployment to 0 replicas."""
+  command = (
+      f'kubectl scale deployment {deployment_name} --replicas=0'
+      f' --namespace={namespace}'
+  )
+  xpk_print(f"Task: 'Scaling down {deployment_name}' in progress")
+  return_code = run_command_with_updates(
+      command, f'Scale down {deployment_name}', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale down {deployment_name} error {return_code}')
+    xpk_exit(return_code)
+  xpk_print(f'\n{deployment_name} has been scaled down.')
+
+
+def scale_up_coredns(args, replicas: int = 15, namespace: str = 'kube-system'):
+  """Scales up the CoreDNS deployment to a specified number of replicas."""
+  command_coredns_scale = (
+      f'kubectl scale deployment coredns --replicas={replicas} -n {namespace}'
+  )
+  xpk_print(f"Task: 'Scale CoreDNS' in progress (to {replicas} replicas)")
+  return_code = run_command_with_updates(
+      command_coredns_scale, 'Scale CoreDNS', args
+  )
+  if return_code != 0:
+    xpk_print(f'Scale CoreDNS error {return_code}')
+    xpk_exit(return_code)
+
+
+def check_deployment_exists(args, deployment_name: str, namespace: str) -> bool:
+  """Check for the existence of a specific Deployment in a given namespace."""
+  command = (
+      f'kubectl get deployment {deployment_name} -n'
+      f' {namespace} --ignore-not-found'
+  )
+  result = run_command_with_updates(
+      command, 'Waiting for kubeDNS to be checked.', args
+  )
+  return result
+
+
+def verify_coredns_readiness(
+    args, timeout: int = 120, namespace: str = 'kube-system'
+):
+  """Verifies CoreDNS readiness using kubectl wait commands."""
+  xpk_print('Now verifying CoreDNS readiness...')
+  kube_dns_exists = check_deployment_exists(args, 'kube-dns', namespace)
+  if kube_dns_exists:
+    # Wait for kube-dns to be fully scaled down
+    command_kube_dns_wait_scaled_down = (
+        'kubectl wait deployment/kube-dns'
+        " --for=jsonpath='{.status.replicas}'=0"
+        f' --namespace={namespace} --timeout={timeout}s'
+    )
+    xpk_print('Verifying if kube-dns has scaled down...')
+    return_code_kube_dns = run_command_with_updates(
+        command_kube_dns_wait_scaled_down, 'Wait for kube-dns scale down', args
+    )
+    if return_code_kube_dns != 0:
+      xpk_print('kube-dns did not scale down successfully within the timeout.')
+      xpk_exit(1)  # Exit if kube-dns cannot scale down
+    else:
+      xpk_print('kube-dns has successfully scaled down.')
+  else:
+    xpk_print('kube-dns deployment not found.')
+  # Wait for CoreDNS to be fully scaled up and available
+  command_coredns_wait_available = (
+      'kubectl wait deployment/coredns --for=condition=Available=true'
+      f' --namespace={namespace} --timeout={timeout}s'
+  )
+  xpk_print('Verifying if CoreDNS is available...')
+  return_code_coredns = run_command_with_updates(
+      command_coredns_wait_available, 'Wait for coredns available', args
+  )
+  if return_code_coredns != 0:
+    xpk_print(
+        'CoreDNS verification failed, it might not have fully started within'
+        ' the timeout.'
+    )
+    xpk_exit(1)  # Exit if coredns cannot become available

+  xpk_print('CoreDNS has successfully started and passed verification.')
+
+
+def cleanup_coredns_repo(coredns_repo_full_path: str):
+  """Deletes the cloned CoreDNS deployment directory."""
+  xpk_print(
+      "Task: 'Deleting CoreDNS deployment directory' in progress:"
+      f' {coredns_repo_full_path}'
+  )
+  try:
+    shutil.rmtree(coredns_repo_full_path)
+    xpk_print(f'Successfully deleted directory: {coredns_repo_full_path}')
+  except OSError as e:
+    xpk_print(f'Error deleting directory {coredns_repo_full_path}: {e}')
+
+
+def update_coredns(args):
+  """Updates and deploys CoreDNS within a cluster.
+
+  Args:
+    args: user provided arguments for running the command.
+
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  coredns_repo_dir = os.path.expanduser('/tmp/')
+  coredns_repo_dir_name = 'deployment'
+  coredns_repo_full_path = os.path.join(coredns_repo_dir, coredns_repo_dir_name)
+  coredns_k8s_path = os.path.join(coredns_repo_full_path, 'kubernetes')
+  # 1. Install jq
+  install_jq(args)
+
+  # 2. Clone CoreDNS deployment repository
+  clone_coredns_deployment_repo(args, coredns_repo_full_path)
+
+  # 3. Deploy CoreDNS to the cluster
+  deploy_coredns_manifests(args, coredns_k8s_path)
+
+  # 4. Scale down kube-dns-autoscaler
+  scale_down_deployment(args, 'kube-dns-autoscaler')
+
+  # 5. Scale down kube-dns
+  scale_down_deployment(args, 'kube-dns')
+
+  # 6. Scale up coredns and verify readiness
+  scale_up_coredns(args, replicas=15)
+  verify_coredns_readiness(args, timeout=120)
+
+  xpk_print('The CoreDNS setup process has been completed.')
+
+  # 7. Cleanup
+  cleanup_coredns_repo(coredns_repo_full_path)
+
+  return 0
+
+
+def coredns_deployment_exists(args, namespace: str = 'kube-system') -> bool:
+  """Checks if the CoreDNS deployment exists in the given namespace.
+
+  Args:
+    namespace: The Kubernetes namespace to check for the CoreDNS deployment.
+
+  Returns:
+    True if the 'coredns' deployment exists, False otherwise.
+  """
+  command = f'kubectl get deployment coredns -n {namespace}'
+  xpk_print(
+      "Task: 'Checking CoreDNS deployment existence' in progress for"
+      f' namespace: {namespace}'
+  )
+  return_code = run_command_with_updates(
+      command, f'Check CoreDNS deployment in {namespace}', args
+  )
+  if return_code == 0:
+    verify_coredns_readiness(args)
+    xpk_print(f"CoreDNS deployment 'coredns' found in namespace '{namespace}'.")
+    return True
+  else:
+    xpk_print(
+        f"CoreDNS deployment 'coredns' NOT found in namespace '{namespace}' or"
+        ' an error occurred.'
+    )
+    return False
+
+
+def update_coredns_if_necessary(args) -> int:
+  """Updates and deploys CoreDNS within the cluster if it's not already present.
+
+  This function checks for the existence of the CoreDNS deployment.
+  If it's not found, it proceeds to deploy and configure CoreDNS.
+
+  Args:
+    args: User-provided arguments for running the command.
+
+  Returns:
+    0 if successful (CoreDNS was already present or successfully deployed),
+    and 1 otherwise.
+  """
+  if coredns_deployment_exists(args, namespace='kube-system'):
+    xpk_print('Skipping CoreDNS deployment since it already exists.')
+    return 0
+  else:
+    xpk_print('CoreDNS deployment not found. Proceeding with CoreDNS setup.')
+    return update_coredns(args)
+
+
 def create_cluster_if_necessary(
     args, gke_control_plane_version: str, system: SystemCharacteristics
 ) -> int:
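Note on the CoreDNS helpers added above: taken together, they migrate a cluster from kube-dns to CoreDNS using the upstream coredns/deployment scripts. The shell steps that update_coredns drives, in order, are roughly the following (collected from the command strings in this hunk; the path, replica count, and timeout are the defaults hard-coded above):

```python
# Rough sequence of shell commands driven by update_coredns (a sketch, not
# part of the package; xpk runs each through run_command_with_updates).
COREDNS_MIGRATION_STEPS = [
    'sudo apt install jq -y',  # deploy.sh depends on jq
    'git clone https://github.com/coredns/deployment.git /tmp/deployment',
    './deploy.sh | kubectl apply -f -',  # run from /tmp/deployment/kubernetes
    'kubectl scale deployment kube-dns-autoscaler --replicas=0'
    ' --namespace=kube-system',
    'kubectl scale deployment kube-dns --replicas=0 --namespace=kube-system',
    'kubectl scale deployment coredns --replicas=15 -n kube-system',
    'kubectl wait deployment/coredns --for=condition=Available=true'
    ' --namespace=kube-system --timeout=120s',
]
```

update_coredns_if_necessary short-circuits this whole sequence when a coredns deployment is already present in kube-system.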
@@ -842,6 +1104,7 @@ def run_gke_cluster_create_command(
       f' {args.custom_cluster_arguments}'
       f' {rapid_release_cmd}'
       ' --enable-dns-access'
+      ' --autoscaling-profile=optimize-utilization'
   )

   enable_ip_alias = False
xpk/core/capacity.py CHANGED
@@ -232,9 +232,9 @@ def get_capacity_node_selectors_from_capacity_type(
     case CapacityType.ON_DEMAND.name:
       node_selector = ''
     case CapacityType.FLEX_START.name:
-      node_selector = 'cloud.google.com/gke-queued="true"'
+      node_selector = 'cloud.google.com/gke-queued: "true"'
     case CapacityType.SPOT.name:
-      node_selector = 'cloud.google.com/gke-spot="true"'
+      node_selector = 'cloud.google.com/gke-spot: "true"'
     case CapacityType.RESERVATION.name:
       node_selector = f'cloud.google.com/reservation-name: {args.reservation}'
     case _:
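Note: the FLEX_START and SPOT selectors change from key="true" strings to YAML-style key: "true" mappings, matching the format already used for the reservation case. A sketch of how such a string lands once substituted into a manifest template (the surrounding template is illustrative, not from the package):

```python
# Illustrative only: the selector string is a YAML fragment that is
# substituted under a nodeSelector key in a manifest template.
node_selector = 'cloud.google.com/gke-queued: "true"'
manifest_fragment = f"""
  nodeSelector:
    {node_selector}
"""
# Renders as:
#   nodeSelector:
#     cloud.google.com/gke-queued: "true"
```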
xpk/core/config.py CHANGED
@@ -22,7 +22,7 @@ from ..utils import file
 from ..utils.console import xpk_print

 # This is the version for XPK PyPI package
-__version__ = 'v0.10.1'
+__version__ = 'v0.11.0'
 XPK_CURRENT_VERSION = __version__
 XPK_CONFIG_FILE = os.path.expanduser('~/.config/xpk/config.yaml')

xpk/core/jobset.py CHANGED
@@ -81,7 +81,7 @@ spec:
             limits:
               memory: {memory_limit_size}
             requests:
-              cpu: 500m
+              cpu: 1000m
               memory: 128Mi
           securityContext:
             allowPrivilegeEscalation: false
xpk/core/kueue.py CHANGED
@@ -244,14 +244,16 @@ spec:
           periodSeconds: 10
           resources:
             limits:
-              cpu: 500m
+              cpu: 1000m
               memory: {memory_limit_size}
             requests:
-              cpu: 500m
+              cpu: 1000m
               memory: 512Mi
           securityContext:
             allowPrivilegeEscalation: false
           volumeMounts:
+          - mountPath: /visibility
+            name: visibility
           - mountPath: /tmp/k8s-webhook-server/serving-certs
             name: cert
             readOnly: true
@@ -263,6 +265,8 @@ spec:
         serviceAccountName: kueue-controller-manager
         terminationGracePeriodSeconds: 10
         volumes:
+        - name: visibility
+          emptyDir: {{}}
         - name: cert
           secret:
             defaultMode: 420
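Note: besides doubling the controller's CPU request and limit, the kueue-controller-manager pod gains an emptyDir-backed volume mounted at /visibility. For readers more used to the Kubernetes Python client than raw manifests, the same mount could be expressed as below (an illustration of the objects involved, not code from xpk, which patches the manifest as YAML text):

```python
# Illustrative equivalent of the added volume/volumeMount using the official
# kubernetes Python client (pip install kubernetes); xpk itself edits YAML.
from kubernetes import client

visibility_volume = client.V1Volume(
    name='visibility',
    empty_dir=client.V1EmptyDirVolumeSource(),  # ephemeral, pod-lifetime storage
)
visibility_mount = client.V1VolumeMount(
    name='visibility',
    mount_path='/visibility',
)
```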
xpk/core/nap.py CHANGED
@@ -42,6 +42,8 @@ AUTOPROVISIONING_CONFIG_FILE = """
 management:
   autoRepair: true
   autoUpgrade: true
+scopes:
+  - "https://www.googleapis.com/auth/devstorage.read_write"
 autoprovisioningLocations:
 {zones}
 {resource_limits}
@@ -106,6 +108,18 @@ def enable_autoprovisioning_on_cluster(
     xpk_print(f'{task} request returned ERROR {return_code}')
     return autoprovisioning_config, return_code

+  command = (
+      'gcloud container clusters update'
+      f' {args.cluster} --project={args.project}'
+      f' --region={zone_to_region(args.zone)}'
+      ' --autoscaling-profile=optimize-utilization'
+  )
+  task = 'Update cluster with autoscaling-profile'
+  return_code = run_command_with_updates(command, task, args)
+  if return_code != 0:
+    xpk_print(f'{task} request returned ERROR {return_code}')
+    return autoprovisioning_config, return_code
+
   # Update created accelerator node pools to support autoprovisioning.
   existing_node_pool_names, return_code = get_all_nodepools_programmatic(args)
   if return_code != 0:
@@ -171,11 +185,11 @@ def create_autoprovisioning_config(
   # is not controlled by NAP.
   cpu_limits = """
     minimum: 1
-    maximum: 10000
+    maximum: 1000000
   """
   memory_limits = """
     minimum: 1
-    maximum: 10000
+    maximum: 10000000
   """

   # By default, the maximum chips is set to be the current number of resources used
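Note: enable_autoprovisioning_on_cluster now also switches the cluster's autoscaling profile to optimize-utilization (the same flag run_gke_cluster_create_command passes at creation time), the NAP scopes gain devstorage.read_write, and the CPU/memory ceilings are raised. A hedged way to confirm the profile took effect, written in the same command-string style as the module (the describe command is standard gcloud, but this check is not part of the package):

```python
# Sketch: verify the autoscaling profile after the update (not in xpk).
verify_cmd = (
    'gcloud container clusters describe'
    f' {args.cluster} --project={args.project}'
    f' --region={zone_to_region(args.zone)}'
    " --format='value(autoscaling.autoscalingProfile)'"
)
# Expected output once the update has applied: OPTIMIZE_UTILIZATION
```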
xpk/core/nodepool.py CHANGED
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+from typing import List
 from ..utils.console import get_user_input, xpk_print
 from .capacity import (
     AUTOPROVISIONING_CONFIG_VALUE,
@@ -32,6 +33,8 @@ from .resources import (
     create_or_update_cluster_configmap,
 )
 from .system_characteristics import AcceleratorType
+from functools import reduce
+from operator import mul

 CLOUD_PLATFORM_AUTH_SCOPE_URL = (
     '"https://www.googleapis.com/auth/cloud-platform"'
@@ -88,20 +91,26 @@ def run_gke_node_pool_create_command(
     xpk_print('Parsing capacity arguments failed!')
     return return_code

-  if system.accelerator_type == AcceleratorType['GPU']:
-    xpk_print(
-        f'Creating 1 node pool with {args.num_nodes} nodes of'
-        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
-    )
-    desired_node_pool_names = [f'{args.cluster}-np-0']
-  else:
-    xpk_print(
-        f'Creating {args.num_slices} node pool or pools of'
-        f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
-    )
-    desired_node_pool_names = [
-        f'{args.cluster}-np-{slice_num}' for slice_num in range(args.num_slices)
-    ]
+  desired_node_pool_count = (
+      1
+      if system.accelerator_type == AcceleratorType['GPU']
+      else args.num_slices
+  )
+  message = (
+      (
+          f'Creating 1 node pool with {args.num_nodes} nodes of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+      if system.accelerator_type == AcceleratorType['GPU']
+      else (
+          f'Creating {args.num_slices} node pool or pools of'
+          f' {system.device_type}\nUnderlyingly, we assume that means: {system}'
+      )
+  )
+  xpk_print(message)
+  desired_node_pool_names = get_desired_node_pool_names(
+      existing_node_pool_names, args.cluster, desired_node_pool_count
+  )

   node_pools_to_remain = []
   delete_commands = []
@@ -275,20 +284,24 @@ def run_gke_node_pool_create_command(
         f' --host-maintenance-interval={args.host_maintenance_interval}'
         f' {capacity_args}'
         ' --enable-gvnic'
-        f' {args.custom_nodepool_arguments}'
     )
     if system.accelerator_type == AcceleratorType['TPU']:
       command += f' --node-version={gke_node_pool_version}'
+      topology_product = reduce(
+          mul, (int(x) for x in system.topology.split('x')), 1
+      )
       if capacity_type == CapacityType.FLEX_START:
         command += ' --num-nodes=0'
-      else:
+      elif topology_product > 1:
         command += f' --num-nodes={system.vms_per_slice}'
-        command += ' --placement-type=COMPACT --max-pods-per-node 15'
       command += (
          f' --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL}'
       )
-      command += f' --tpu-topology={system.topology}'
-      command += f' {args.custom_tpu_nodepool_arguments}'
+
+      if topology_product > 1:
+        command += ' --placement-type=COMPACT --max-pods-per-node 15'
+        command += f' --tpu-topology={system.topology}'
+        command += f' {args.custom_tpu_nodepool_arguments}'
     elif system.accelerator_type == AcceleratorType['GPU']:
       subnet_prefix = f'{args.cluster}-{zone_to_region(args.zone)}'
       if capacity_type == CapacityType.FLEX_START:
@@ -319,6 +332,8 @@ def run_gke_node_pool_create_command(
     if args.enable_workload_identity or args.enable_gcsfuse_csi_driver:
       command += ' --workload-metadata=GKE_METADATA'

+    command += f' {args.custom_nodepool_arguments}'
+
     task = f'NodepoolCreate-{node_pool_name}'
     create_commands.append(command)
     create_task_names.append(task)
@@ -594,3 +609,21 @@ def get_nodepool_workload_metadata_mode(
     return 1, None

   return 0, nodepool_WI_mode.strip()
+
+
+def get_desired_node_pool_names(
+    existing_node_pool_names: List[str],
+    cluster_name: str,
+    desired_node_pool_count: int,
+) -> List[str]:
+  cluster_node_pools = [
+      np
+      for np in existing_node_pool_names
+      if np.startswith(f'{cluster_name}-np-')
+  ]
+  result = set(cluster_node_pools[:desired_node_pool_count])
+  i = 0
+  while len(result) < desired_node_pool_count:
+    result.add(f'{cluster_name}-np-{i}')
+    i += 1
+  return list(result)
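Note: two behavioural points in this file are easy to miss. First, get_desired_node_pool_names reuses existing <cluster>-np-* pool names up to the requested count and fills any remaining slots with the lowest unused indices; because it goes through a set, the returned order is unspecified. Second, the product of the TPU topology dimensions now gates --placement-type=COMPACT, --max-pods-per-node, --tpu-topology, the fixed --num-nodes, and the custom TPU node-pool arguments: they are added only when that product is greater than 1 (i.e. skipped for single-chip topologies such as 1x1). A small illustration (example names and topologies invented):

```python
from functools import reduce
from operator import mul

# get_desired_node_pool_names keeps matching existing pools and tops up with
# the lowest free indices; sorted() here only makes the output deterministic.
existing = ['demo-np-0', 'demo-np-3', 'unrelated-pool']
print(sorted(get_desired_node_pool_names(existing, 'demo', 3)))
# -> ['demo-np-0', 'demo-np-1', 'demo-np-3']

# The topology product decides whether the TPU-specific placement flags apply.
assert reduce(mul, (int(x) for x in '1x1'.split('x')), 1) == 1    # flags skipped
assert reduce(mul, (int(x) for x in '2x2x1'.split('x')), 1) == 4  # flags added
```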
xpk/core/scheduling.py CHANGED
@@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool:
   missing_gke_accelerator_type = False
   if not cluster_config_map.get(system.gke_accelerator):
     xpk_print(
-        f'Gke Accelerator Type Check: {args.workload} is requesting'
+        f'GKE Accelerator Type Check: {args.workload} is requesting'
         f' {system.gke_accelerator} but cluster only contains'
         f' {cluster_config_map.keys()}. '
     )