PyPI - xpk - Versions diffs - 0.0.1__py3-none-any.whl - Mend

xpk 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (92) hide show

xpk/__init__.py +15 -0
xpk/api/__init__.py +15 -0
xpk/api/storage_crd.yaml +52 -0
xpk/commands/__init__.py +15 -0
xpk/commands/batch.py +131 -0
xpk/commands/cluster.py +808 -0
xpk/commands/cluster_gcluster.py +269 -0
xpk/commands/common.py +44 -0
xpk/commands/config.py +29 -0
xpk/commands/info.py +243 -0
xpk/commands/inspector.py +357 -0
xpk/commands/job.py +199 -0
xpk/commands/kind.py +283 -0
xpk/commands/kjob_common.py +44 -0
xpk/commands/run.py +128 -0
xpk/commands/shell.py +140 -0
xpk/commands/storage.py +267 -0
xpk/commands/version.py +27 -0
xpk/commands/workload.py +889 -0
xpk/core/__init__.py +15 -0
xpk/core/blueprint/__init__.py +15 -0
xpk/core/blueprint/blueprint_definitions.py +62 -0
xpk/core/blueprint/blueprint_generator.py +708 -0
xpk/core/capacity.py +185 -0
xpk/core/cluster.py +564 -0
xpk/core/cluster_private.py +200 -0
xpk/core/commands.py +356 -0
xpk/core/config.py +179 -0
xpk/core/docker_container.py +225 -0
xpk/core/docker_image.py +210 -0
xpk/core/docker_manager.py +308 -0
xpk/core/docker_resources.py +350 -0
xpk/core/filestore.py +251 -0
xpk/core/gcloud_context.py +196 -0
xpk/core/gcluster_manager.py +176 -0
xpk/core/gcsfuse.py +50 -0
xpk/core/kjob.py +444 -0
xpk/core/kueue.py +358 -0
xpk/core/monitoring.py +134 -0
xpk/core/nap.py +361 -0
xpk/core/network.py +377 -0
xpk/core/nodepool.py +581 -0
xpk/core/pathways.py +377 -0
xpk/core/ray.py +222 -0
xpk/core/remote_state/__init__.py +15 -0
xpk/core/remote_state/fuse_remote_state.py +99 -0
xpk/core/remote_state/remote_state_client.py +38 -0
xpk/core/resources.py +238 -0
xpk/core/scheduling.py +253 -0
xpk/core/storage.py +581 -0
xpk/core/system_characteristics.py +1432 -0
xpk/core/vertex.py +105 -0
xpk/core/workload.py +341 -0
xpk/core/workload_decorators/__init__.py +15 -0
xpk/core/workload_decorators/rdma_decorator.py +129 -0
xpk/core/workload_decorators/storage_decorator.py +52 -0
xpk/core/workload_decorators/tcpxo_decorator.py +190 -0
xpk/main.py +75 -0
xpk/parser/__init__.py +15 -0
xpk/parser/batch.py +43 -0
xpk/parser/cluster.py +662 -0
xpk/parser/common.py +259 -0
xpk/parser/config.py +49 -0
xpk/parser/core.py +135 -0
xpk/parser/info.py +64 -0
xpk/parser/inspector.py +65 -0
xpk/parser/job.py +147 -0
xpk/parser/kind.py +95 -0
xpk/parser/run.py +47 -0
xpk/parser/shell.py +59 -0
xpk/parser/storage.py +316 -0
xpk/parser/validators.py +39 -0
xpk/parser/version.py +23 -0
xpk/parser/workload.py +726 -0
xpk/templates/__init__.py +15 -0
xpk/templates/storage.yaml +13 -0
xpk/utils/__init__.py +15 -0
xpk/utils/console.py +55 -0
xpk/utils/file.py +82 -0
xpk/utils/gcs_utils.py +125 -0
xpk/utils/kubectl.py +57 -0
xpk/utils/network.py +168 -0
xpk/utils/objects.py +88 -0
xpk/utils/templates.py +28 -0
xpk/utils/validation.py +80 -0
xpk/utils/yaml.py +30 -0
xpk-0.0.1.dist-info/LICENSE +202 -0
xpk-0.0.1.dist-info/METADATA +1498 -0
xpk-0.0.1.dist-info/RECORD +92 -0
xpk-0.0.1.dist-info/WHEEL +5 -0
xpk-0.0.1.dist-info/entry_points.txt +2 -0
xpk-0.0.1.dist-info/top_level.txt +1 -0

xpk/core/cluster_private.py ADDED Viewed

@@ -0,0 +1,200 @@
+"""
+Copyright 2024 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+from ..utils.console import xpk_exit, xpk_print
+from ..utils.network import (
+    add_current_machine_to_networks,
+    is_current_machine_in_any_network,
+)
+from ..utils.objects import is_text_true
+from .commands import run_command_for_value, run_command_with_updates
+from .gcloud_context import zone_to_region
+def authorize_private_cluster_access_if_necessary(args) -> int:
+  """Updates a GKE cluster to add authorize networks to access a private cluster's control plane, if not added already.
+  Args:
+    args: user provided arguments for running the command.
+  Returns:
+    0 if successful and error code otherwise.
+  """
+  if not is_cluster_private(args):
+    if not args.private and args.authorized_networks is None:
+      xpk_print('Cluster is public and no need to authorize networks.')
+      return 0
+    else:
+      xpk_print(
+          'Cannot convert an existing public cluster to private. The arguments'
+          ' --private and --authorized-networks are not acceptable for public'
+          ' clusters.'
+      )
+      return 1
+  new_authorized_networks_needed, authorized_networks = (
+      check_if_new_authorized_networks_needed(args)
+  )
+  (
+      add_current_machine_to_networks_return_code,
+      is_current_machine_in_network,
+      authorized_networks,
+  ) = add_current_machine_to_networks_if_needed(authorized_networks)
+  if add_current_machine_to_networks_return_code != 0:
+    return add_current_machine_to_networks_return_code
+  if new_authorized_networks_needed or not is_current_machine_in_network:
+    return update_cluster_new_authorized_networks(args, authorized_networks)
+  xpk_print("Current machine's IP adrress is already authorized.")
+  return 0
+def update_cluster_new_authorized_networks(args, authorized_networks) -> int:
+  cluster_authorized_networks_update_code = update_cluster_authorized_networks(
+      args, authorized_networks
+  )
+  if cluster_authorized_networks_update_code != 0:
+    xpk_print('Updating cluster authorized networks failed!')
+    return cluster_authorized_networks_update_code
+  xpk_print("Cluster's master authorized networks updated successfully.")
+  return 0
+def add_current_machine_to_networks_if_needed(
+    authorized_networks,
+) -> tuple[int, bool, list]:
+  is_current_machine_in_network_return_code, is_current_machine_in_network = (
+      is_current_machine_in_any_network(authorized_networks)
+  )
+  if is_current_machine_in_network_return_code != 0:
+    xpk_print("Error on checking current machine's IP adrress.")
+    return is_current_machine_in_network_return_code, False, authorized_networks
+  if not is_current_machine_in_network:
+    add_current_machine_to_networks_return_code, authorized_networks = (
+        add_current_machine_to_networks(authorized_networks)
+    )
+    if add_current_machine_to_networks_return_code != 0:
+      xpk_print(
+          "Adding current machine's IP address to the authorized networks"
+          ' failed!'
+      )
+      return add_current_machine_to_networks_return_code, authorized_networks
+  return 0, is_current_machine_in_network, authorized_networks
+def check_if_new_authorized_networks_needed(args) -> tuple[bool, list]:
+  new_authorized_networks_needed = args.authorized_networks is not None
+  authorized_networks = (
+      args.authorized_networks
+      if new_authorized_networks_needed
+      else get_cluster_authorized_networks(args)
+  )
+  return new_authorized_networks_needed, authorized_networks
+def is_cluster_private(args) -> bool:
+  """Checks if cluster is private.
+  Args:
+    args: user provided arguments for running the command.
+  Returns:
+    True if cluster is private and False otherwise.
+  """
+  command = (
+      f'gcloud container clusters describe {args.cluster}'
+      f' --project={args.project} --region={zone_to_region(args.zone)}'
+      ' --format="value(privateClusterConfig.enablePrivateNodes)"'
+  )
+  return_code, private_nodes_enabled = run_command_for_value(
+      command,
+      'Check if Private Nodes is enabled in cluster.',
+      args,
+  )
+  if return_code != 0:
+    xpk_print('Checking if Private Nodes is enabled failed!')
+    xpk_exit(return_code)
+  if is_text_true(private_nodes_enabled):
+    xpk_print('Private Nodes is enabled on the cluster.')
+    return True
+  xpk_print('Private Nodes is not enabled on the cluster.')
+  return False
+def get_cluster_authorized_networks(args) -> list[str]:
+  """Retreives the networks list that are authorized to have access to Control Plane.
+  Args:
+    args: user provided arguments for running the command.
+  Returns:
+    List of networks CIDRs as strings
+  """
+  command = (
+      f'gcloud container clusters describe {args.cluster}'
+      f' --project={args.project} --region={zone_to_region(args.zone)}'
+      ' --format="value(masterAuthorizedNetworksConfig.cidrBlocks[].cidrBlock)"'
+  )
+  return_code, authorized_networks = run_command_for_value(
+      command,
+      'Fetching the list of authorized network from cluster describe.',
+      args,
+  )
+  if return_code != 0:
+    xpk_print('Fetching authorized networks failed!')
+    xpk_exit(return_code)
+  return (
+      authorized_networks.strip().split(';')
+      if authorized_networks.strip() != ''
+      else []
+  )
+def update_cluster_authorized_networks(args, authorized_networks) -> int:
+  """Run the GKE cluster update command for existing cluster and update master authorized networks list.
+  Args:
+    args: user provided arguments for running the command.
+    authorized_networks: list of networks CIDRs to authorize.
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  command = (
+      'gcloud container clusters update'
+      f' {args.cluster} --project={args.project}'
+      f' --region={zone_to_region(args.zone)}'
+      ' --enable-master-authorized-networks'
+      f' --master-authorized-networks={",".join(authorized_networks)}'
+      ' --quiet'
+  )
+  return_code = run_command_with_updates(
+      command, 'GKE Cluster Update master authorized networks', args
+  )
+  if return_code != 0:
+    xpk_print(f'GKE Cluster Update request returned ERROR {return_code}')
+    return 1
+  return 0

xpk/core/commands.py ADDED Viewed

@@ -0,0 +1,356 @@
+"""
+Copyright 2024 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import datetime
+import subprocess
+import sys
+import time
+from argparse import Namespace
+from ..utils.objects import chunks
+from ..utils.file import make_tmp_files, write_tmp_file
+from ..utils.console import xpk_print
+def run_commands(commands, jobname, per_command_name, batch=10, dry_run=False):
+  """Run commands in groups of `batch`.
+  Args:
+    commands: list of command.
+    jobname: the name of the job.
+    per_command_name: list of command names.
+    batch: number of commands to run in parallel.
+    dry_run: enables dry_run if set to true.
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  temporary_files_batches = chunks(make_tmp_files(per_command_name), batch)
+  commands_batched = chunks(commands, batch)
+  per_command_name_batches = chunks(per_command_name, batch)
+  xpk_print(
+      f'Breaking up a total of {len(commands)} commands into'
+      f' {len(commands_batched)} batches'
+  )
+  if dry_run:
+    xpk_print('Pretending all the jobs succeeded')
+    return 0
+  max_return_code = 0
+  for i, _ in enumerate(commands_batched):
+    xpk_print(f'Dispatching batch {i}/{len(commands_batched)}')
+    batch_max_return_code, _ = run_command_batch(
+        commands_batched[i],
+        jobname,
+        per_command_name_batches[i],
+        temporary_files_batches[i],
+    )
+    max_return_code = max(max_return_code, batch_max_return_code)
+    if max_return_code > 0:
+      return max_return_code
+  return max_return_code
+def run_command_batch(commands, jobname, per_command_name, output_logs):
+  """Runs commands in parallel.
+  Args:
+    commands: list of n commands, each command is a a list of strings
+    jobname: Useful debugging name for the group of commands
+    per_command_name: specific name per task
+    output_logs: list of n log paths, each command will output to each log.
+  Returns:
+    The max return code and a list of all the return codes.
+  """
+  children = []
+  start_time = datetime.datetime.now()
+  for i, command in enumerate(commands):
+    children.append(
+        # subprocess managed by list pylint: disable=consider-using-with
+        subprocess.Popen(
+            command, stdout=output_logs[i], stderr=output_logs[i], shell=True
+        )
+    )
+  while True:
+    returncodes = [child.poll() for child in children]
+    max_returncode = max([0] + [r for r in returncodes if r is not None])
+    completed = len([r for r in returncodes if r is not None])
+    total = len(returncodes)
+    seconds_elapsed = (datetime.datetime.now() - start_time).total_seconds()
+    if completed < total:
+      slow_worker_index = returncodes.index(None)
+      slow_worker_text = per_command_name[slow_worker_index]
+      slow_str = (
+          f', task {slow_worker_text} still working, logfile'
+          f' {output_logs[slow_worker_index].name}'
+      )
+    else:
+      slow_str = ''
+    xpk_print(
+        f'[t={seconds_elapsed:.2f}, {jobname}] Completed'
+        f' {completed}/{total}{slow_str}'
+    )
+    if max_returncode > 0:
+      failing_index = [
+          i for i, x in enumerate(returncodes) if x is not None and x > 0
+      ][0]
+      xpk_print(
+          f'Terminating all {jobname} processes since at least one failed.'
+      )
+      xpk_print(
+          f'Failure is {per_command_name[failing_index]}'
+          f' and logfile {output_logs[failing_index].name}'
+      )
+      for child in children:
+        child.terminate()
+      break
+    if completed == total:
+      break
+    time.sleep(1)
+  return max_returncode, returncodes
+def run_command_with_updates_retry(
+    command, task, args, verbose=True, num_retry_attempts=5, wait_seconds=10
+) -> int:
+  """Generic run commands function with updates and retry logic.
+  Args:
+    command: command to execute
+    task: user-facing name of the task
+    args: user provided arguments for running the command.
+    verbose: shows stdout and stderr if set to true. Set to True by default.
+    num_retry_attempts: number of attempts to retry the command.
+        This has a default value in the function arguments.
+    wait_seconds: Seconds to wait between attempts.
+        Has a default value in the function arguments.
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  i = 0
+  return_code = -1
+  while return_code != 0 and i < num_retry_attempts:
+    # Do not sleep before first try.
+    if i != 0:
+      xpk_print(f'Wait {wait_seconds} seconds before retrying.')
+      time.sleep(wait_seconds)
+    i += 1
+    xpk_print(f'Try {i}: {task}')
+    return_code = run_command_with_updates(command, task, args, verbose=verbose)
+  return return_code
+def run_command_with_updates(command, task, global_args, verbose=True) -> int:
+  """Generic run commands function with updates.
+  Args:
+    command: command to execute
+    task: user-facing name of the task
+    global_args: user provided arguments for running the command.
+    verbose: shows stdout and stderr if set to true. Set to True by default.
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  if global_args.dry_run:
+    xpk_print(
+        f'Task: `{task}` is implemented by the following command'
+        ' not running since it is a dry run.'
+        f' \n{command}'
+    )
+    return 0
+  if verbose:
+    xpk_print(
+        f'Task: `{task}` is implemented by `{command}`, streaming output live.'
+    )
+    with subprocess.Popen(
+        command,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+        shell=True,
+    ) as child:
+      i = 0
+      while True:
+        return_code = child.poll()
+        if return_code is None:
+          xpk_print(f'Waiting for `{task}`, for {i} seconds...', end='\r')
+          time.sleep(1)
+          i += 1
+        else:
+          xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
+          return return_code
+  else:
+    xpk_print(
+        f'Task: `{task}` is implemented by `{command}`, hiding output unless'
+        ' there is an error.'
+    )
+    try:
+      subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+      xpk_print(
+          f'Task: `{task}` terminated with ERROR `{e.returncode}`, printing'
+          ' logs'
+      )
+      xpk_print('*' * 80)
+      xpk_print(e.output)
+      xpk_print('*' * 80)
+      return e.returncode
+    xpk_print(f'Task: `{task}` succeeded.')
+    return 0
+def run_command_for_value(
+    command,
+    task,
+    global_args,
+    dry_run_return_val='0',
+    print_timer=False,
+    hide_error=False,
+    quiet=False,
+) -> tuple[int, str]:
+  """Runs the command and returns the error code and stdout.
+  Prints errors and associated user-facing information
+  Args:
+    command: user provided command to run.
+    task: user provided task name for running the command.
+    global_args: user provided arguments for running the command.
+    dry_run_return_val: return value of this command for dry run.
+    print_timer: print out the time the command is running.
+    hide_error: hide the error from the command output upon success.
+  Returns:
+    tuple[int, str]
+    int: return_code, default is 0
+    str: return_val, default is '0'
+  """
+  if global_args is not None and global_args.dry_run:
+    xpk_print(
+        f'Task: `{task}` is implemented by the following command'
+        ' not running since it is a dry run.'
+        f' \n{command}'
+    )
+    return 0, dry_run_return_val
+  if print_timer:
+    if not quiet:
+      xpk_print(f'Task: `{task}` is implemented by `{command}`')
+    with subprocess.Popen(
+        command,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        shell=True,
+    ) as child:
+      i = 0
+      while True:
+        return_code = child.poll()
+        if return_code is None:
+          if not quiet:
+            xpk_print(f'Waiting for `{task}`, for {i} seconds...', end='\r')
+          time.sleep(1)
+          i += 1
+        else:
+          if not quiet:
+            xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
+          out, err = child.communicate()
+          out, err = str(out, 'UTF-8'), str(err, 'UTF-8')
+          return return_code, f'{out}\n{err}'
+  else:
+    if not quiet:
+      xpk_print(
+          f'Task: `{task}` is implemented by `{command}`, hiding output unless'
+          ' there is an error.'
+      )
+    try:
+      output = subprocess.check_output(
+          command,
+          shell=True,
+          stderr=subprocess.STDOUT if not hide_error else None,
+      )
+    except subprocess.CalledProcessError as e:
+      if not quiet:
+        xpk_print(f'Task {task} failed with {e.returncode}')
+        xpk_print('*' * 80)
+        xpk_print(e.output)
+        xpk_print('*' * 80)
+      return e.returncode, str(e.output, 'UTF-8')
+    return 0, str(output, 'UTF-8')
+def run_command_with_full_controls(
+    command: str,
+    task: str,
+    global_args: Namespace,
+    instructions: str | None = None,
+) -> int:
+  """Run command in current shell with system out, in and error handles. Wait
+  until it exits.
+  Args:
+    command: command to execute
+    task: user-facing name of the task
+    global_args: user provided arguments for running the command.
+    verbose: shows stdout and stderr if set to true. Set to True by default.
+  Returns:
+    0 if successful and 1 otherwise.
+  """
+  if global_args.dry_run:
+    xpk_print(
+        f'Task: `{task}` is implemented by the following command'
+        ' not running since it is a dry run.'
+        f' \n{command}'
+    )
+    return 0
+  xpk_print(
+      f'Task: `{task}` is implemented by `{command}`. '
+      'Streaming output and input live.'
+  )
+  if instructions is not None:
+    xpk_print(instructions)
+  try:
+    with subprocess.Popen(
+        command,
+        stdout=sys.stdout,
+        stderr=sys.stderr,
+        stdin=sys.stdin,
+        shell=True,
+    ) as child:
+      return_code = child.wait()
+      xpk_print(f'Task: `{task}` terminated with code `{return_code}`')
+  except KeyboardInterrupt:
+    return_code = 0
+  return return_code
+def run_kubectl_apply(yml_string: str, task: str, args: Namespace) -> int:
+  tmp = write_tmp_file(yml_string)
+  command = f'kubectl apply -f {str(tmp.file.name)}'
+  err_code = run_command_with_updates(command, task, args)
+  return err_code