skypilot-nightly 1.0.0.dev20250318__py3-none-any.whl → 1.0.0.dev20250320__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/optimizer.py CHANGED
@@ -1225,7 +1225,8 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
         # Explicitly check again to update the enabled cloud list.
         sky_check.check(quiet=True,
                         clouds=list(clouds_need_recheck -
-                                    global_disabled_clouds))
+                                    global_disabled_clouds),
+                        capability=sky_check.CloudCapability.COMPUTE)
         enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
             raise_if_no_cloud_access=True)
         disabled_clouds = (clouds_need_recheck -
@@ -1328,13 +1329,17 @@ def _fill_in_launchable_resources(
                     f'{colorama.Style.RESET_ALL}')
             else:
                 if resources.cpus is not None:
-                    logger.info('Try specifying a different CPU count, '
+                    logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                                '- Try specifying a different CPU count, '
                                 'or add "+" to the end of the CPU count '
-                                'to allow for larger instances.')
+                                'to allow for larger instances.'
+                                f'{colorama.Style.RESET_ALL}')
                 if resources.memory is not None:
-                    logger.info('Try specifying a different memory size, '
+                    logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                                '- Try specifying a different memory size, '
                                 'or add "+" to the end of the memory size '
-                                'to allow for larger instances.')
+                                'to allow for larger instances.'
+                                f'{colorama.Style.RESET_ALL}')
             for cloud, hint in hints.items():
                 logger.info(f'{repr(cloud)}: {hint}')
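
The hints above are now rendered dimmed via colorama's LIGHTBLACK_EX foreground color. A minimal, self-contained sketch of the effect (the message text mirrors the hunk; colorama.init() is only needed on Windows):

    import colorama

    colorama.init()
    print(f'{colorama.Fore.LIGHTBLACK_EX}'
          '- Try specifying a different CPU count, '
          'or add "+" to the end of the CPU count '
          'to allow for larger instances.'
          f'{colorama.Style.RESET_ALL}')
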
@@ -297,8 +297,8 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
 def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     """Setup a gcp service account with IAM roles.
 
-    Creates a gcp service acconut and binds IAM roles which allow it to control
-    control storage/compute services. Specifically, the head node needs to have
+    Creates a gcp service account and binds IAM roles which allow it to control
+    storage/compute services. Specifically, the head node needs to have
     an IAM role that allows it to create further gce instances and store items
     in google cloud storage.
 
@@ -311,7 +311,7 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     )
     service_account = _get_service_account(email, project_id, iam)
 
-    permissions = gcp_utils.get_minimal_permissions()
+    permissions = gcp_utils.get_minimal_compute_permissions()
     roles = constants.DEFAULT_SERVICE_ACCOUNT_ROLES
     if config.provider_config.get(constants.HAS_TPU_PROVIDER_FIELD, False):
         roles = (constants.DEFAULT_SERVICE_ACCOUNT_ROLES +
@@ -141,6 +141,11 @@ FIREWALL_RULES_TEMPLATE = [
     },
 ]
 
+GCP_MINIMAL_PERMISSIONS = [
+    'serviceusage.services.enable',
+    'serviceusage.services.list',
+]
+
 # A list of permissions required to run SkyPilot on GCP.
 # Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long
 VM_MINIMAL_PERMISSIONS = [
@@ -170,13 +175,22 @@ VM_MINIMAL_PERMISSIONS = [
     # Check: sky.provision.gcp.config::_is_permission_satisfied
     # 'iam.serviceAccounts.actAs',
     'iam.serviceAccounts.get',
-    'serviceusage.services.enable',
-    'serviceusage.services.list',
     'serviceusage.services.use',
     'resourcemanager.projects.get',
     'resourcemanager.projects.getIamPolicy',
 ]
 
+STORAGE_MINIMAL_PERMISSIONS = [
+    'storage.buckets.create',
+    'storage.buckets.get',
+    'storage.buckets.delete',
+    'storage.objects.create',
+    'storage.objects.update',
+    'storage.objects.delete',
+    'storage.objects.get',
+    'storage.objects.list',
+]
+
 # Permissions implied by GCP built-in roles. We hardcode these here, as we
 # cannot get the permissions of built-in role from the GCP Python API.
 # The lists are not exhaustive, but should cover the permissions listed in
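
With the shared serviceusage permissions factored out into GCP_MINIMAL_PERMISSIONS, compute and storage permission sets can be assembled per capability. A hypothetical sketch of how helpers like gcp_utils.get_minimal_compute_permissions() (referenced in the _configure_iam_role hunk above) might compose the lists; the real implementations may differ:

    def get_minimal_compute_permissions() -> list:
        # Shared project/service-usage permissions plus VM-specific ones.
        return GCP_MINIMAL_PERMISSIONS + VM_MINIMAL_PERMISSIONS

    def get_minimal_storage_permissions() -> list:
        # Shared project/service-usage permissions plus GCS bucket/object ones.
        return GCP_MINIMAL_PERMISSIONS + STORAGE_MINIMAL_PERMISSIONS
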
@@ -586,8 +586,11 @@ def open_ports(
     }
     handlers: List[Type[instance_utils.GCPInstance]] = [
         instance_utils.GCPComputeInstance,
-        instance_utils.GCPTPUVMInstance,
     ]
+    use_tpu_vms = provider_config.get('_has_tpus', False)
+    if use_tpu_vms:
+        handlers.append(instance_utils.GCPTPUVMInstance)
+
     handler_to_instances = _filter_instances(handlers, project_id, zone,
                                              label_filters, lambda _: None)
     operations = collections.defaultdict(list)
@@ -97,6 +97,7 @@ GKE_TPU_ACCELERATOR_TO_GENERATION = {
     # Multi-host compatible v5e TPU configurations allowed.
     'tpu-v5-lite-podslice': 'v5e',
     'tpu-v5p-slice': 'v5p',
+    'tpu-v6e-slice': 'v6e',
 }
 
 POD_STATUSES = {
@@ -359,7 +360,8 @@ class GKELabelFormatter(GPULabelFormatter):
     # label to use in an autoscaling environment. For list of topologies, see:
     # tpu v5e: https://cloud.google.com/tpu/docs/tpus-in-gke
     # tpu v5p: https://cloud.google.com/tpu/docs/v5p
-    # TODO(romilb): Add support for TPU v4 and v6.
+    # tpu v6e: https://cloud.google.com/tpu/docs/v6e
+    # TODO(romilb): Add support for TPU v4.
     GKE_TPU_TOPOLOGIES = {
         'tpu-v5-lite-podslice': {
             1: '1x1',
@@ -374,6 +376,11 @@ class GKELabelFormatter(GPULabelFormatter):
         'tpu-v5p-slice': {
             4: '2x2x1'
         },
+        'tpu-v6e-slice': {
+            1: '1x1',
+            4: '2x2',
+            8: '2x4'
+        }
     }
 
     @classmethod
@@ -602,6 +609,7 @@ class GKEAutoscaler(Autoscaler):
     _pip_install_gcp_hint_last_sent = 0.0
 
     @classmethod
+    @annotations.lru_cache(scope='request', maxsize=10)
     def can_create_new_instance_of_type(cls, context: str,
                                         instance_type: str) -> bool:
         """Looks at each node pool in the cluster and checks if
@@ -655,18 +663,25 @@ class GKEAutoscaler(Autoscaler):
 
         # Check if any node pool with autoscaling enabled can
         # fit the instance type.
-        for node_pool in cluster['nodePools']:
-            logger.debug(f'checking if node pool {node_pool["name"]} '
+        node_pools = cluster.get('nodePools', [])
+        for node_pool in node_pools:
+            name = node_pool.get('name', '')
+            logger.debug(f'checking if node pool {name} '
                          'has autoscaling enabled.')
-            if (node_pool['autoscaling'] is not None and
-                    'enabled' in node_pool['autoscaling'] and
-                    node_pool['autoscaling']['enabled']):
-                logger.debug(
-                    f'node pool {node_pool["name"]} has autoscaling enabled. '
-                    'Checking if it can create a node '
-                    f'satisfying {instance_type}')
-                if cls._check_instance_fits_gke_autoscaler_node_pool(
-                        instance_type, node_pool):
+            autoscaling_enabled = (node_pool.get('autoscaling',
+                                                 {}).get('enabled', False))
+            if autoscaling_enabled:
+                logger.debug(f'node pool {name} has autoscaling enabled. '
+                             'Checking if it can create a node '
+                             f'satisfying {instance_type}')
+                try:
+                    if cls._check_instance_fits_gke_autoscaler_node_pool(
+                            instance_type, node_pool):
+                        return True
+                except KeyError:
+                    logger.debug('encountered KeyError while checking if '
+                                 f'node pool {name} can create a node '
+                                 f'satisfying {instance_type}.')
                     return True
         return False
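
The rewrite replaces direct indexing into the GKE API response with chained dict.get() calls plus a KeyError guard, so missing fields degrade gracefully instead of crashing the check. The core pattern:

    node_pool = {'name': 'pool-a'}  # no 'autoscaling' key in the response

    # Chained .get() calls fall back to False instead of raising:
    enabled = node_pool.get('autoscaling', {}).get('enabled', False)
    assert enabled is False

    # The old form would raise instead:
    # node_pool['autoscaling']['enabled']  ->  KeyError: 'autoscaling'
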
@@ -768,9 +783,9 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
-            node_accelerator_type = GKELabelFormatter. \
-                get_accelerator_from_label_value(
-                    accelerator['acceleratorType'])
+            node_accelerator_type = (
+                GKELabelFormatter.get_accelerator_from_label_value(
+                    accelerator['acceleratorType']))
             node_accelerator_count = accelerator['acceleratorCount']
             if node_accelerator_type == requested_gpu_type and int(
                     node_accelerator_count) >= requested_gpu_count:
@@ -784,6 +799,7 @@ class GKEAutoscaler(Autoscaler):
         """Check if the node pool has enough TPU capacity
         to fit the instance type.
         """
+
         if 'goog-gke-tpu-node-pool-type' not in node_pool_resource_labels:
             # This node does not have TPUs.
             return False
@@ -803,25 +819,22 @@ class GKEAutoscaler(Autoscaler):
     @classmethod
     def _tpu_chip_count_from_instance_type(cls, machine_type: str) -> int:
         """Infer the number of TPU chips from the instance type."""
-        machine_type_parts = machine_type.split('-')
         # according to
         # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#machine_type
         # GKE TPU machine types have the format of
-        # ct<version>-hightpu-<node-chip-count>t
+        # ct<version>-<type>-<node-chip-count>t
         logger.debug(
             f'inferring TPU chip count from machine type: {machine_type}')
-        if (len(machine_type_parts) != 3 or
-                not machine_type_parts[0].startswith('ct') or
-                machine_type_parts[1] != 'hightpu' or
-                not machine_type_parts[2].endswith('t') or
-                not machine_type_parts[2].strip('t').isdigit()):
+        pattern = r'ct[a-z0-9]+-[a-z]+-([0-9]+)t'
+        search = re.search(pattern, machine_type)
+        if search is None:
             logger.debug(f'machine type {machine_type} is not a '
                          'valid TPU machine type format.')
             return 0
-        num_tpu_chips = int(machine_type_parts[2].strip('t'))
+        num_tpu_chips = search.group(1)
         logger.debug(
             f'machine type {machine_type} has {num_tpu_chips} TPU chips.')
-        return num_tpu_chips
+        return int(num_tpu_chips)
 
     @classmethod
     def _is_node_multi_host_tpu(cls, resource_labels: dict) -> bool:
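
The regex generalizes the old hightpu-only parsing to any ct<version>-<type>-<chip-count>t machine name, which covers v6e machine types such as ct6e-standard-8t. A runnable check (the first two names are real GKE TPU machine types; the last is a non-TPU control):

    import re

    pattern = r'ct[a-z0-9]+-[a-z]+-([0-9]+)t'
    for machine_type in ('ct5lp-hightpu-4t', 'ct6e-standard-8t', 'n2-standard-8'):
        search = re.search(pattern, machine_type)
        print(machine_type, '->', int(search.group(1)) if search else 0)
    # ct5lp-hightpu-4t -> 4
    # ct6e-standard-8t -> 8
    # n2-standard-8 -> 0
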
@@ -1205,7 +1205,16 @@ class SkyPilotReplicaManager(ReplicaManager):
             for key in ['service']:
                 old_config.pop(key)
             # Bump replica version if all fields except for service are
-            # the same. File mounts should both be empty, as update always
+            # the same.
+            # Convert the any_of field to a set so that a difference only
+            # in the random ordering of the any_of entries is ignored.
+            old_config_any_of = old_config.get('resources',
+                                               {}).pop('any_of', [])
+            new_config_any_of = new_config.get('resources',
+                                               {}).pop('any_of', [])
+            if set(old_config_any_of) != set(new_config_any_of):
+                continue
+            # File mounts should both be empty, as update always
             # create new buckets if they are not empty.
             if (old_config == new_config and
                     old_config.get('file_mounts', None) == {}):
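
The intent is an order-insensitive comparison of the two any_of lists. A sketch of the idea, assuming hashable entries; entries that are mappings would first need a canonical encoding, for example:

    import json

    def canonical(entries):
        # Encode each entry deterministically so that neither list order
        # nor dict key order affects the comparison.
        return {json.dumps(e, sort_keys=True) for e in entries}

    old_any_of = [{'cloud': 'aws'}, {'cloud': 'gcp'}]
    new_any_of = [{'cloud': 'gcp'}, {'cloud': 'aws'}]
    assert canonical(old_any_of) == canonical(new_any_of)
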
@@ -49,7 +49,6 @@ from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import types
@@ -221,6 +220,10 @@ def _restore_output(original_stdout: int, original_stderr: int) -> None:
     os.close(original_stderr)
 
 
+def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
+    raise KeyboardInterrupt
+
+
 def _request_execution_wrapper(request_id: str,
                                ignore_return_value: bool) -> None:
     """Wrapper for a request execution.
@@ -232,12 +235,8 @@ def _request_execution_wrapper(request_id: str,
     3. Redirect the stdout and stderr of the execution to log file;
     4. Handle the SIGTERM signal to abort the request gracefully.
     """
-
-    def sigterm_handler(signum: int,
-                        frame: Optional['types.FrameType']) -> None:
-        raise KeyboardInterrupt
-
-    signal.signal(signal.SIGTERM, sigterm_handler)
+    # Handle the SIGTERM signal to abort the request processing gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
 
     pid = multiprocessing.current_process().pid
     logger.info(f'Running request {request_id} with pid {pid}')
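
Hoisting _sigterm_handler to module level lets the request wrapper and the worker loop share the same behavior: SIGTERM is translated into KeyboardInterrupt so ordinary try/except/finally cleanup runs. A small POSIX demo of the pattern:

    import os
    import signal
    import time

    def _sigterm_handler(signum, frame):
        raise KeyboardInterrupt

    signal.signal(signal.SIGTERM, _sigterm_handler)
    try:
        os.kill(os.getpid(), signal.SIGTERM)  # simulate an external SIGTERM
        time.sleep(1)
    except KeyboardInterrupt:
        print('SIGTERM surfaced as KeyboardInterrupt; cleanup can run here')
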
@@ -355,6 +354,8 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     Args:
         max_parallel_size: Maximum number of parallel jobs this worker can run.
     """
+    # Handle the SIGTERM signal to abort the executor process gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
     proc_group = f'{worker.schedule_type.value}-{worker.id}'
     setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
     queue = _get_queue(worker.schedule_type)
388
389
  logger.info(f'[{worker}] Finished request: {request_id}')
389
390
  else:
390
391
  logger.info(f'[{worker}] Submitted request: {request_id}')
391
- except KeyboardInterrupt:
392
- # Interrupt the worker process will stop request execution, but
393
- # the SIGTERM request should be respected anyway since it might
394
- # be explicitly sent by user.
395
- # TODO(aylei): crash the API server or recreate the worker process
396
- # to avoid broken state.
397
- logger.error(f'[{worker}] Worker process interrupted')
398
- with ux_utils.print_exception_no_traceback():
399
- raise
400
392
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
401
393
  # Catch any other exceptions to avoid crashing the worker process.
402
394
  logger.error(
403
- f'[{worker}] Error processing request {request_id}: '
395
+ f'[{worker}] Error processing request: '
396
+ f'{request_id if "request_id" in locals() else ""} '
404
397
  f'{common_utils.format_exception(e, use_bracket=True)}')
405
398
 
406
399
  # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
@@ -409,12 +402,33 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
 # We use executor instead of individual multiprocessing.Process to avoid
 # the overhead of forking a new process for each request, which can be about
 # 1s delay.
-    with concurrent.futures.ProcessPoolExecutor(
+    try:
+        executor = concurrent.futures.ProcessPoolExecutor(
             max_workers=max_parallel_size,
             initializer=executor_initializer,
-            initargs=(proc_group,)) as executor:
+            initargs=(proc_group,))
         while True:
             process_request(executor)
+    # TODO(aylei): better to distinguish between KeyboardInterrupt and SIGTERM.
+    except KeyboardInterrupt:
+        pass
+    finally:
+        # In most cases, here we receive either ctrl-c in foreground execution
+        # or SIGTERM on server exiting. Gracefully exit the worker process and
+        # the executor.
+        # TODO(aylei): the worker may also be killed by system daemons like
+        # the OOM killer; crash the API server or recreate the worker process
+        # to avoid broken state in such cases.
+        logger.info(f'[{worker}] Worker process interrupted')
+        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        executor.shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
 
 
 def start(deploy: bool) -> List[multiprocessing.Process]:
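
Replacing the with-block avoids ProcessPoolExecutor's implicit shutdown(wait=True) on exit, which would block until every running request finishes. The non-blocking variant in isolation (a sketch; the grace-period kill defined in the subprocess_utils hunk below then reaps stragglers):

    import concurrent.futures
    import time

    if __name__ == '__main__':
        executor = concurrent.futures.ProcessPoolExecutor(max_workers=2)
        future = executor.submit(time.sleep, 0.1)
        # Stop accepting new work without blocking on the in-flight task.
        executor.shutdown(wait=False)
        print(future.result())  # the already-submitted task still completes
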
sky/server/server.py CHANGED
@@ -1140,6 +1140,9 @@ if __name__ == '__main__':
         # The process may not be started yet, close it anyway.
         proc.close()
 
+    # Terminate processes in reverse order in case of dependencies, especially
+    # the queue server: terminating the queue server first does not affect the
+    # correctness of cleanup, but it does produce redundant error messages.
     subprocess_utils.run_in_parallel(cleanup,
-                                     sub_procs,
+                                     list(reversed(sub_procs)),
                                      num_threads=len(sub_procs))
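
Reversing teardown order is the usual way to respect start-up dependencies: dependents go down before the services they rely on. Illustratively (process names are made up):

    # Started in dependency order: the queue server first, then its consumers.
    sub_procs = ['queue_server', 'worker', 'api_frontend']

    # Clean up in reverse so nothing outlives a service it depends on.
    for name in reversed(sub_procs):
        print(f'cleaning up {name}')
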
@@ -215,7 +215,13 @@ def _get_cloud_dependencies_installation_commands(
     commands.append(f'echo -en "\\r{step_prefix}uv{empty_str}" &&'
                     f'{constants.SKY_UV_INSTALL_CMD} >/dev/null 2>&1')
 
-    for cloud in sky_check.get_cached_enabled_clouds_or_refresh():
+    enabled_compute_clouds = set(
+        sky_check.get_cached_enabled_clouds_or_refresh())
+    enabled_storage_clouds = set(
+        sky_check.get_cached_enabled_storage_clouds_or_refresh())
+    enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
+
+    for cloud in enabled_clouds:
         cloud_python_dependencies: List[str] = copy.deepcopy(
             dependencies.extras_require[cloud.canonical_name()])
 
@@ -167,7 +167,9 @@ def deploy_local_cluster(gpus: bool):
                          f'\nError: {stderr}')
     # Run sky check
     with rich_utils.safe_status('[bold cyan]Running sky check...'):
-        sky_check.check(clouds=['kubernetes'], quiet=True)
+        sky_check.check(clouds=['kubernetes'],
+                        quiet=True,
+                        capability=sky_check.CloudCapability.COMPUTE)
     if cluster_created:
         # Prepare completion message which shows CPU and GPU count
         # Get number of CPUs
@@ -1,4 +1,5 @@
 """Utility functions for subprocesses."""
+import multiprocessing
 from multiprocessing import pool
 import os
 import random
@@ -181,29 +182,6 @@ def kill_children_processes(parent_pids: Optional[Union[
     if isinstance(parent_pids, int):
         parent_pids = [parent_pids]
 
-    def kill(proc: psutil.Process):
-        if not proc.is_running():
-            # Skip if the process is not running.
-            return
-        logger.debug(f'Killing process {proc.pid}')
-        try:
-            if force:
-                proc.kill()
-            else:
-                proc.terminate()
-                proc.wait(timeout=10)
-        except psutil.NoSuchProcess:
-            # The child process may have already been terminated.
-            pass
-        except psutil.TimeoutExpired:
-            logger.debug(
-                f'Process {proc.pid} did not terminate after 10 seconds')
-            # Attempt to force kill if the normal termination fails
-            if not force:
-                logger.debug(f'Force killing process {proc.pid}')
-                proc.kill()
-                proc.wait(timeout=5)  # Shorter timeout after force kill
-
     parent_processes = []
     if parent_pids is None:
         parent_processes = [psutil.Process()]
@@ -218,10 +196,54 @@ def kill_children_processes(parent_pids: Optional[Union[
     for parent_process in parent_processes:
         child_processes = parent_process.children(recursive=True)
         if parent_pids is not None:
-            kill(parent_process)
+            kill_process_with_grace_period(parent_process, force=force)
         logger.debug(f'Killing child processes: {child_processes}')
         for child in child_processes:
-            kill(child)
+            kill_process_with_grace_period(child, force=force)
+
+
+def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
+                                               psutil.Process],
+                                   force: bool = False,
+                                   grace_period: int = 10) -> None:
+    """Kill a process with SIGTERM and wait for it to exit.
+
+    Args:
+        proc: The process to kill, either a multiprocessing.Process or a
+            psutil.Process.
+        force: Whether to force kill the process.
+        grace_period: The grace period in seconds to wait for the process
+            to exit.
+    """
+    if isinstance(proc, psutil.Process):
+        alive = proc.is_running
+        wait = proc.wait
+    else:
+        alive = proc.is_alive
+        wait = proc.join
+    if not alive():
+        # Skip if the process is not running.
+        return
+    logger.debug(f'Killing process {proc.pid}')
+    try:
+        if force:
+            proc.kill()
+        else:
+            proc.terminate()
+        wait(timeout=grace_period)
+    except (psutil.NoSuchProcess, ValueError):
+        # The child process may have already been terminated.
+        return
+    except psutil.TimeoutExpired:
+        # Pass to finally to force kill the process.
+        pass
+    finally:
+        logger.debug(f'Process {proc.pid} did not terminate after '
+                     f'{grace_period} seconds')
+        # Attempt to force kill if the normal termination fails
+        if not force:
+            logger.debug(f'Force killing process {proc.pid}')
+            # Shorter timeout after force kill
+            kill_process_with_grace_period(proc, force=True, grace_period=5)
 
 
 def run_with_retries(
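
A usage sketch of the new helper, assuming it is imported from sky.utils.subprocess_utils:

    import multiprocessing
    import time

    from sky.utils.subprocess_utils import kill_process_with_grace_period

    def _worker():
        while True:
            time.sleep(1)

    if __name__ == '__main__':
        proc = multiprocessing.Process(target=_worker)
        proc.start()
        # SIGTERM first; force-kills if still alive after the grace period.
        kill_process_with_grace_period(proc, grace_period=5)
        print('worker exited:', not proc.is_alive())
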
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250318
+Version: 1.0.0.dev20250320
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -156,6 +156,7 @@ Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: license
+Dynamic: license-file
 Dynamic: project-url
 Dynamic: provides-extra
 Dynamic: requires-dist