skypilot-nightly 1.0.0.dev20250317__py3-none-any.whl → 1.0.0.dev20250319__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '5d5ab949f1f83301a2677989761c1eea06f0af00'
+_SKYPILOT_COMMIT_SHA = '246e69ba16705c31b69143bfe76efcee17b6407f'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250317'
+__version__ = '1.0.0.dev20250319'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/clouds/kubernetes.py CHANGED
@@ -229,32 +229,52 @@ class Kubernetes(clouds.Cloud):
         # Check if requested instance type will fit in the cluster.
         # TODO(zhwu,romilb): autoscaler type needs to be regional (per
         # kubernetes cluster/context).
-        regions_to_return = []
+        if instance_type is None:
+            return regions
+
         autoscaler_type = kubernetes_utils.get_autoscaler_type()
-        if autoscaler_type is None and instance_type is not None:
-            # If autoscaler is not set, check if the instance type fits in the
-            # cluster. Else, rely on the autoscaler to provision the right
-            # instance type without running checks. Worst case, if autoscaling
-            # fails, the pod will be stuck in pending state until
-            # provision_timeout, after which failover will be triggered.
-            for r in regions:
-                context = r.name
-                try:
-                    fits, reason = kubernetes_utils.check_instance_fits(
-                        context, instance_type)
-                except exceptions.KubeAPIUnreachableError as e:
-                    cls._log_unreachable_context(context, str(e))
-                    continue
-                if fits:
-                    regions_to_return.append(r)
-                else:
-                    logger.debug(
-                        f'Instance type {instance_type} does '
-                        'not fit in the Kubernetes cluster with context: '
-                        f'{context}. Reason: {reason}')
-        else:
-            regions_to_return = regions
+        if (autoscaler_type is not None and not kubernetes_utils.get_autoscaler(
+                autoscaler_type).can_query_backend):
+            # Unsupported autoscaler type. Rely on the autoscaler to
+            # provision the right instance type without running checks.
+            # Worst case, if autoscaling fails, the pod will be stuck in
+            # pending state until provision_timeout, after which failover
+            # will be triggered.
+            #
+            # Removing this if statement produces the same behavior,
+            # because can_create_new_instance_of_type() always returns True
+            # for unsupported autoscaler types.
+            # This check is here as a performance optimization to avoid
+            # further code execution that is known to return this result.
+            return regions
 
+        regions_to_return = []
+        for r in regions:
+            context = r.name
+            try:
+                fits, reason = kubernetes_utils.check_instance_fits(
+                    context, instance_type)
+            except exceptions.KubeAPIUnreachableError as e:
+                cls._log_unreachable_context(context, str(e))
+                continue
+            if fits:
+                regions_to_return.append(r)
+                continue
+            logger.debug(f'Instance type {instance_type} does '
+                         'not fit in the existing Kubernetes cluster '
+                         'with context: '
+                         f'{context}. Reason: {reason}')
+            if autoscaler_type is None:
+                continue
+            autoscaler = kubernetes_utils.get_autoscaler(autoscaler_type)
+            logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
+            if autoscaler.can_create_new_instance_of_type(
+                    context, instance_type):
+                logger.debug(f'Kubernetes cluster {context} can be '
+                             'autoscaled to create instance type '
+                             f'{instance_type}. Including {context} '
+                             'in the list of regions to return.')
+                regions_to_return.append(r)
         return regions_to_return
 
     def instance_type_to_hourly_cost(self,
@@ -618,7 +638,6 @@ class Kubernetes(clouds.Cloud):
         chosen_instance_type = (
             kubernetes_utils.KubernetesInstanceType.from_resources(
                 gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name)
-
        # Check the availability of the specified instance type in all contexts.
        available_regions = self.regions_with_offering(
            chosen_instance_type,
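
Note: the refactor above changes regions_with_offering from "skip all checks whenever an autoscaler is configured" to "check every context, and keep contexts whose autoscaler reports it can scale up". A minimal sketch of the new decision flow, assuming the names from the diff (the instance type handling is simplified for illustration):

    # Hedged sketch of the per-context filtering introduced above.
    def filter_contexts(regions, instance_type, autoscaler_type):
        if instance_type is None:
            return regions  # nothing to check
        kept = []
        for r in regions:
            fits, _ = kubernetes_utils.check_instance_fits(r.name, instance_type)
            if fits:
                kept.append(r)  # existing nodes can already host the pod
            elif (autoscaler_type is not None and
                  kubernetes_utils.get_autoscaler(autoscaler_type)
                  .can_create_new_instance_of_type(r.name, instance_type)):
                kept.append(r)  # the autoscaler can add a suitable node
        return kept
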
sky/execution.py CHANGED
@@ -529,6 +529,11 @@ def launch(
         ]
         skip_unnecessary_provisioning = True
 
+    # Attach to setup if the cluster is a controller, so that the user can
+    # see the setup logs when inspecting the launch process to know
+    # exactly what the job is waiting for.
+    detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
+
     return _execute(
         entrypoint=entrypoint,
         dryrun=dryrun,
@@ -540,7 +545,7 @@ def launch(
         optimize_target=optimize_target,
         stages=stages,
         cluster_name=cluster_name,
-        detach_setup=True,
+        detach_setup=detach_setup,
         detach_run=True,
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
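
Note: the net effect of the hunk above, in miniature (names taken from the diff; Controllers.from_name returns a non-None entry only for controller clusters such as the jobs controller):

    # Regular clusters keep detach_setup=True; controllers attach to setup
    # so their logs stream while the launch is waiting.
    is_controller = controller_utils.Controllers.from_name(cluster_name) is not None
    detach_setup = not is_controller
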
sky/optimizer.py CHANGED
@@ -1328,13 +1328,17 @@ def _fill_in_launchable_resources(
                          f'{colorama.Style.RESET_ALL}')
         else:
             if resources.cpus is not None:
-                logger.info('Try specifying a different CPU count, '
+                logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                            '- Try specifying a different CPU count, '
                             'or add "+" to the end of the CPU count '
-                            'to allow for larger instances.')
+                            'to allow for larger instances.'
+                            f'{colorama.Style.RESET_ALL}')
             if resources.memory is not None:
-                logger.info('Try specifying a different memory size, '
+                logger.info(f'{colorama.Fore.LIGHTBLACK_EX}'
+                            '- Try specifying a different memory size, '
                             'or add "+" to the end of the memory size '
-                            'to allow for larger instances.')
+                            'to allow for larger instances.'
+                            f'{colorama.Style.RESET_ALL}')
         for cloud, hint in hints.items():
             logger.info(f'{repr(cloud)}: {hint}')
 
sky/provision/kubernetes/utils.py CHANGED
@@ -21,6 +21,7 @@ from sky import exceptions
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.adaptors import gcp
 from sky.adaptors import kubernetes
 from sky.provision import constants as provision_constants
 from sky.provision.kubernetes import constants as kubernetes_constants
@@ -96,6 +97,7 @@ GKE_TPU_ACCELERATOR_TO_GENERATION = {
     # Multi-host compatible v5e TPU configurations allowed.
     'tpu-v5-lite-podslice': 'v5e',
     'tpu-v5p-slice': 'v5p',
+    'tpu-v6e-slice': 'v6e',
 }
 
 POD_STATUSES = {
@@ -358,7 +360,8 @@ class GKELabelFormatter(GPULabelFormatter):
     # label to use in an autoscaling environment. For list of topologies, see:
     # tpu v5e: https://cloud.google.com/tpu/docs/tpus-in-gke
     # tpu v5p: https://cloud.google.com/tpu/docs/v5p
-    # TODO(romilb): Add support for TPU v4 and v6.
+    # tpu v6e: https://cloud.google.com/tpu/docs/v6e
+    # TODO(romilb): Add support for TPU v4.
     GKE_TPU_TOPOLOGIES = {
         'tpu-v5-lite-podslice': {
             1: '1x1',
@@ -373,6 +376,11 @@ class GKELabelFormatter(GPULabelFormatter):
         'tpu-v5p-slice': {
             4: '2x2x1'
         },
+        'tpu-v6e-slice': {
+            1: '1x1',
+            4: '2x2',
+            8: '2x4'
+        }
     }
 
     @classmethod
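
For illustration, the new entries make a 4-chip v6e request resolve to the 2x2 topology label:

    # Worked lookup against the table above.
    topology = GKELabelFormatter.GKE_TPU_TOPOLOGIES['tpu-v6e-slice'][4]  # '2x2'
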
@@ -517,13 +525,6 @@ LABEL_FORMATTER_REGISTRY = [
     GFDLabelFormatter, CoreWeaveLabelFormatter
 ]
 
-# Mapping of autoscaler type to label formatter
-AUTOSCALER_TO_LABEL_FORMATTER = {
-    kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter,
-    kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterLabelFormatter,  # pylint: disable=line-too-long
-    kubernetes_enums.KubernetesAutoscalerType.GENERIC: SkyPilotLabelFormatter,
-}
-
 
 @annotations.lru_cache(scope='request')
 def detect_gpu_label_formatter(
@@ -557,6 +558,314 @@ def detect_gpu_label_formatter(
     return label_formatter, node_labels
 
 
+class Autoscaler:
+    """Base class to define an autoscaler for a Kubernetes cluster.
+    An autoscaler is a class that defines how to detect if a Kubernetes
+    context can autoscale to meet the resource requirements of a task.
+    """
+
+    label_formatter: Any = None
+
+    # Whether the autoscaler backend can be queried for information.
+    # If True, SkyPilot will query the autoscaler backend to check if
+    # the Kubernetes context can autoscale to meet the resource requirements
+    # of a task.
+    can_query_backend: bool = False
+
+    @classmethod
+    # pylint: disable=unused-argument
+    def can_create_new_instance_of_type(cls, context: str,
+                                        instance_type: str) -> bool:
+        """Returns whether the Kubernetes context has an autoscaler
+        that can create a new node that satisfies the instance type.
+        Args:
+            context: The Kubernetes context to check.
+            instance_type: The instance type to check.
+        Returns:
+            bool: True if the Kubernetes context has an autoscaler that can
+                create a new node satisfying the instance type,
+                or if such determination is not possible.
+                False if the Kubernetes context autoscaler cannot create a new
+                node satisfying the instance type.
+        """
+        # For autoscalers that SkyPilot does not know how to interface with,
+        # assume the autoscaler can create a new node that satisfies
+        # the instance type.
+        # If this is not the case, the autoscaler will fail to provision the
+        # node and the pod will be stuck in pending state until
+        # provision_timeout, after which failover will be triggered.
+        return True
+
+
+class GKEAutoscaler(Autoscaler):
+    """GKE autoscaler
+    """
+
+    label_formatter: Any = GKELabelFormatter
+    can_query_backend: bool = True
+
+    # This variable is stored in memory in the server.
+    # The variable will reset if the server restarts.
+    _pip_install_gcp_hint_last_sent = 0.0
+
+    @classmethod
+    @annotations.lru_cache(scope='request', maxsize=10)
+    def can_create_new_instance_of_type(cls, context: str,
+                                        instance_type: str) -> bool:
+        """Looks at each node pool in the cluster and checks if
+        it can create a new node that satisfies the instance type.
+        If the context does not match standard GKE context naming convention,
+        or GKE credential is not set, this function returns True
+        for optimistic pod scheduling.
+        """
+        # assume context naming convention of
+        # gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+        valid, project_id, location, cluster_name = cls._validate_context_name(
+            context)
+        if not valid:
+            # Context name is not in the format of
+            # gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
+            # Cannot determine if the context can autoscale;
+            # return True for optimistic pod scheduling.
+            logger.debug(f'context {context} is not in the format of '
+                         f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
+                         'reporting context as potentially capable of '
+                         'provisioning resources without further check')
+            return True
+        try:
+            logger.debug(
+                f'attempting to get information about cluster {cluster_name}')
+            container_service = gcp.build('container',
+                                          'v1',
+                                          credentials=None,
+                                          cache_discovery=False)
+            cluster = container_service.projects().locations().clusters().get(
+                name=f'projects/{project_id}'
+                f'/locations/{location}'
+                f'/clusters/{cluster_name}').execute()
+        except ImportError:
+            # If the gcp module is not installed, return True for
+            # optimistic pod scheduling.
+            # Remind the user once per day to install the gcp module for better
+            # pod scheduling with GKE autoscaler.
+            if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
+                logger.info(
+                    'Could not fetch autoscaler information from GKE. '
+                    'Run pip install "skypilot[gcp]" for more intelligent pod '
+                    'scheduling with GKE autoscaler.')
+                cls._pip_install_gcp_hint_last_sent = time.time()
+            return True
+        except gcp.http_error_exception() as e:
+            # Cluster information is not available.
+            # return True for optimistic pod scheduling.
+            logger.debug(f'{e.message}', exc_info=True)
+            return True
+
+        # Check if any node pool with autoscaling enabled can
+        # fit the instance type.
+        for node_pool in cluster['nodePools']:
+            logger.debug(f'checking if node pool {node_pool["name"]} '
+                         'has autoscaling enabled.')
+            if (node_pool['autoscaling'] is not None and
+                    'enabled' in node_pool['autoscaling'] and
+                    node_pool['autoscaling']['enabled']):
+                logger.debug(
+                    f'node pool {node_pool["name"]} has autoscaling enabled. '
+                    'Checking if it can create a node '
+                    f'satisfying {instance_type}')
+                if cls._check_instance_fits_gke_autoscaler_node_pool(
+                        instance_type, node_pool):
+                    return True
+        return False
+
+    @classmethod
+    def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
+        """Validates the context name is in the format of
+        gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+        Returns:
+            bool: True if the context name is in the format of
+                gke_PROJECT-ID_LOCATION_CLUSTER-NAME
+            str: project id
+            str: location
+            str: cluster name
+        """
+        context_components = context.split('_')
+        if len(context_components) != 4 or context_components[0] != 'gke':
+            logger.debug(
+                f'context {context} is not in valid GKE context format.')
+            return False, '', '', ''
+
+        logger.debug(f'context {context} is in valid GKE context format.')
+        return True, context_components[1], context_components[
+            2], context_components[3]
+
+    @classmethod
+    def _check_instance_fits_gke_autoscaler_node_pool(
+        cls, instance_type: str, node_pool: dict
+    ) -> bool:  # check if there is any spare capacity in the autoscaler.
+        node_pool_name = node_pool['name']
+        logger.debug(
+            f'checking if autoscale-enabled node pool {node_pool_name} '
+            f'can create a node satisfying {instance_type}')
+        k8s_instance_type = KubernetesInstanceType.\
+            from_instance_type(instance_type)
+        node_config = node_pool['config']
+        machine_type = node_config['machineType']
+
+        # Accelerator check
+        requested_acc_type = k8s_instance_type.accelerator_type
+        requested_acc_count = k8s_instance_type.accelerator_count
+        acc_is_tpu = (requested_acc_type is not None and
+                      is_tpu_on_gke(requested_acc_type))
+        if requested_acc_type is not None:
+            assert requested_acc_count is not None, (requested_acc_type,
+                                                     requested_acc_count)
+            accelerator_exists = False
+            if acc_is_tpu:
+                # Accelerator type is a TPU.
+                logger.debug(
+                    f'checking {node_pool_name} for TPU {requested_acc_type}:'
+                    f'{requested_acc_count}')
+                if 'resourceLabels' in node_config:
+                    accelerator_exists = cls._node_pool_has_tpu_capacity(
+                        node_config['resourceLabels'], machine_type,
+                        requested_acc_type, requested_acc_count)
+            else:
+                # Accelerator type is a GPU.
+                logger.debug(
+                    f'checking {node_pool_name} for GPU {requested_acc_type}:'
+                    f'{requested_acc_count}')
+                if 'accelerators' in node_config:
+                    accelerator_exists = cls._node_pool_has_gpu_capacity(
+                        node_config['accelerators'], requested_acc_type,
+                        requested_acc_count)
+
+            if not accelerator_exists:
+                logger.debug(f'{node_pool_name} does not have accelerators '
+                             f'{requested_acc_type}:{requested_acc_count}')
+                return False
+
+        # vcpu and memory check is not supported for TPU instances.
+        # TODO(seungjin): Correctly account for vcpu/memory for TPUs.
+        if acc_is_tpu:
+            # vcpu and memory check
+            logger.debug(f'vcpu and memory check is not supported for TPUs. '
+                         'Skipping vcpu and memory check for node pool '
+                         f'{node_pool_name}.')
+            return True
+
+        vcpus, mem = clouds.GCP.get_vcpus_mem_from_instance_type(machine_type)
+        if vcpus is not None and vcpus < k8s_instance_type.cpus:
+            logger.debug(f'vcpu check failed for {machine_type} '
+                         f'on node pool {node_pool_name}')
+            return False
+        if mem is not None and mem < k8s_instance_type.memory:
+            logger.debug(f'memory check failed for {machine_type} '
+                         f'on node pool {node_pool_name}')
+            return False
+
+        logger.debug(f'node pool {node_pool_name} can create a node '
+                     f'satisfying {instance_type}')
+        return True
+
+    @classmethod
+    def _node_pool_has_gpu_capacity(cls, node_pool_accelerators: List[dict],
+                                    requested_gpu_type: str,
+                                    requested_gpu_count: int) -> bool:
+        """Check if the node pool has enough GPU capacity
+        to fit the instance type.
+        """
+        for accelerator in node_pool_accelerators:
+            node_accelerator_type = GKELabelFormatter. \
+                get_accelerator_from_label_value(
+                    accelerator['acceleratorType'])
+            node_accelerator_count = accelerator['acceleratorCount']
+            if node_accelerator_type == requested_gpu_type and int(
+                    node_accelerator_count) >= requested_gpu_count:
+                return True
+        return False
+
+    @classmethod
+    def _node_pool_has_tpu_capacity(cls, node_pool_resource_labels: dict,
+                                    machine_type: str, requested_tpu_type: str,
+                                    requested_tpu_count: int) -> bool:
+        """Check if the node pool has enough TPU capacity
+        to fit the instance type.
+        """
+
+        if 'goog-gke-tpu-node-pool-type' not in node_pool_resource_labels:
+            # This node does not have TPUs.
+            return False
+        if cls._is_node_multi_host_tpu(node_pool_resource_labels):
+            # This node is a multi-host TPU.
+            # multi-host TPUs are not supported in SkyPilot yet.
+            return False
+        node_tpu_type = node_pool_resource_labels['goog-gke-accelerator-type']
+        # infer chip count from instance type
+        tpu_chip_count = cls._tpu_chip_count_from_instance_type(machine_type)
+
+        # For TPUs, the requested TPU count
+        # must exactly match the TPU count in the instance.
+        return (node_tpu_type == requested_tpu_type and
+                tpu_chip_count == requested_tpu_count)
+
+    @classmethod
+    def _tpu_chip_count_from_instance_type(cls, machine_type: str) -> int:
+        """Infer the number of TPU chips from the instance type."""
+        machine_type_parts = machine_type.split('-')
+        # according to
+        # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#machine_type
+        # GKE TPU machine types have the format of
+        # ct<version>-<type>-<node-chip-count>t
+        logger.debug(
+            f'inferring TPU chip count from machine type: {machine_type}')
+        if (len(machine_type_parts) != 3 or
+                not machine_type_parts[0].startswith('ct') or
+                not machine_type_parts[2].endswith('t') or
+                not machine_type_parts[2].strip('t').isdigit()):
+            logger.debug(f'machine type {machine_type} is not a '
+                         'valid TPU machine type format.')
+            return 0
+        num_tpu_chips = int(machine_type_parts[2].strip('t'))
+        logger.debug(
+            f'machine type {machine_type} has {num_tpu_chips} TPU chips.')
+        return num_tpu_chips
+
+    @classmethod
+    def _is_node_multi_host_tpu(cls, resource_labels: dict) -> bool:
+        """Check if the node pool is a multi-host TPU."""
+        return ('goog-gke-tpu-node-pool-type' in resource_labels and
+                resource_labels['goog-gke-tpu-node-pool-type'] == 'multi-host')
+
+
+class KarpenterAutoscaler(Autoscaler):
+    """Karpenter autoscaler
+    """
+
+    label_formatter: Any = KarpenterLabelFormatter
+    can_query_backend: bool = False
+
+
+class GenericAutoscaler(Autoscaler):
+    """Generic autoscaler
+    """
+
+    label_formatter: Any = SkyPilotLabelFormatter
+    can_query_backend: bool = False
+
+
+# Mapping of autoscaler type to autoscaler
+AUTOSCALER_TYPE_TO_AUTOSCALER = {
+    kubernetes_enums.KubernetesAutoscalerType.GKE: GKEAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.GENERIC: GenericAutoscaler,
+}
+
+
+def get_autoscaler(autoscaler_type: kubernetes_enums.KubernetesAutoscalerType):
+    return AUTOSCALER_TYPE_TO_AUTOSCALER.get(autoscaler_type, Autoscaler)
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 def detect_accelerator_resource(
         context: Optional[str]) -> Tuple[bool, Set[str]]:
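
Taken together, the pieces above can be exercised end to end. A hedged sketch, assuming the names from the diff; the context name, instance type, and machine type strings are illustrative (ct5lp-hightpu-4t follows the ct<version>-<type>-<node-chip-count>t format documented for GKE):

    # 1. Context-name convention parsed by GKEAutoscaler._validate_context_name.
    context = 'gke_my-project_us-central1-c_my-cluster'
    parts = context.split('_')
    # parts == ['gke', 'my-project', 'us-central1-c', 'my-cluster']
    assert len(parts) == 4 and parts[0] == 'gke'
    project_id, location, cluster_name = parts[1], parts[2], parts[3]

    # 2. Chip-count inference from _tpu_chip_count_from_instance_type.
    machine_type = 'ct5lp-hightpu-4t'
    num_tpu_chips = int(machine_type.split('-')[2].strip('t'))  # 4

    # 3. Registry lookup: unknown autoscaler types fall back to the base
    #    Autoscaler, whose can_query_backend is False.
    autoscaler = get_autoscaler(kubernetes_enums.KubernetesAutoscalerType.GKE)
    if autoscaler.can_query_backend:
        ok = autoscaler.can_create_new_instance_of_type(context, '8CPU--32GB')
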
@@ -710,7 +1019,8 @@ def check_instance_fits(context: Optional[str],
         node for node in nodes if gpu_label_key in node.metadata.labels and
         node.metadata.labels[gpu_label_key] == gpu_label_val
     ]
-    assert gpu_nodes, 'GPU nodes not found'
+    if not gpu_nodes:
+        return False, f'No GPU nodes found with {acc_type} on the cluster'
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
         # has sufficient TPU resource to meet the requirement.
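
With this change, a missing GPU node is a soft failure instead of an AssertionError: callers of check_instance_fits receive a (fits, reason) pair, e.g. (the instance type string is illustrative):

    fits, reason = kubernetes_utils.check_instance_fits(context, '8CPU--32GB')
    # fits is False and reason explains the miss; no AssertionError is raised.
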
@@ -795,9 +1105,10 @@ def get_accelerator_label_key_value(
         # early since we assume the cluster autoscaler will handle GPU
         # node provisioning.
         return None, None, None, None
-    formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
-    assert formatter is not None, ('Unsupported autoscaler type:'
-                                   f' {autoscaler_type}')
+    autoscaler = AUTOSCALER_TYPE_TO_AUTOSCALER.get(autoscaler_type)
+    assert autoscaler is not None, ('Unsupported autoscaler type:'
+                                    f' {autoscaler_type}')
+    formatter = autoscaler.label_formatter
     tpu_topology_label_key = None
     tpu_topology_label_value = None
     if is_tpu_on_gke(acc_type):
sky/server/requests/executor.py CHANGED
@@ -49,7 +49,6 @@ from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import types
@@ -221,6 +220,10 @@ def _restore_output(original_stdout: int, original_stderr: int) -> None:
     os.close(original_stderr)
 
 
+def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
+    raise KeyboardInterrupt
+
+
 def _request_execution_wrapper(request_id: str,
                                ignore_return_value: bool) -> None:
     """Wrapper for a request execution.
@@ -232,12 +235,8 @@ def _request_execution_wrapper(request_id: str,
     3. Redirect the stdout and stderr of the execution to log file;
     4. Handle the SIGTERM signal to abort the request gracefully.
     """
-
-    def sigterm_handler(signum: int,
-                        frame: Optional['types.FrameType']) -> None:
-        raise KeyboardInterrupt
-
-    signal.signal(signal.SIGTERM, sigterm_handler)
+    # Handle the SIGTERM signal to abort the request processing gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
 
     pid = multiprocessing.current_process().pid
     logger.info(f'Running request {request_id} with pid {pid}')
@@ -355,6 +354,8 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     Args:
         max_parallel_size: Maximum number of parallel jobs this worker can run.
     """
+    # Handle the SIGTERM signal to abort the executor process gracefully.
+    signal.signal(signal.SIGTERM, _sigterm_handler)
     proc_group = f'{worker.schedule_type.value}-{worker.id}'
     setproctitle.setproctitle(f'SkyPilot:worker:{proc_group}')
     queue = _get_queue(worker.schedule_type)
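
Both the per-request wrapper and the worker loop now share the module-level handler, which folds SIGTERM into the existing KeyboardInterrupt path. A minimal sketch of the pattern:

    import signal

    def _sigterm_handler(signum, frame):
        raise KeyboardInterrupt  # reuse the Ctrl-C cleanup path

    # Registered once per process entrypoint.
    signal.signal(signal.SIGTERM, _sigterm_handler)
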
@@ -388,19 +389,11 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
                 logger.info(f'[{worker}] Finished request: {request_id}')
             else:
                 logger.info(f'[{worker}] Submitted request: {request_id}')
-        except KeyboardInterrupt:
-            # Interrupt the worker process will stop request execution, but
-            # the SIGTERM request should be respected anyway since it might
-            # be explicitly sent by user.
-            # TODO(aylei): crash the API server or recreate the worker process
-            # to avoid broken state.
-            logger.error(f'[{worker}] Worker process interrupted')
-            with ux_utils.print_exception_no_traceback():
-                raise
         except (Exception, SystemExit) as e:  # pylint: disable=broad-except
             # Catch any other exceptions to avoid crashing the worker process.
             logger.error(
-                f'[{worker}] Error processing request {request_id}: '
+                f'[{worker}] Error processing request: '
+                f'{request_id if "request_id" in locals() else ""} '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
 
 # Use concurrent.futures.ProcessPoolExecutor instead of multiprocessing.Pool
@@ -409,12 +402,33 @@ def request_worker(worker: RequestWorker, max_parallel_size: int) -> None:
     # We use executor instead of individual multiprocessing.Process to avoid
     # the overhead of forking a new process for each request, which can be about
     # 1s delay.
-    with concurrent.futures.ProcessPoolExecutor(
+    try:
+        executor = concurrent.futures.ProcessPoolExecutor(
             max_workers=max_parallel_size,
             initializer=executor_initializer,
-            initargs=(proc_group,)) as executor:
+            initargs=(proc_group,))
         while True:
             process_request(executor)
+    # TODO(aylei): better to distinguish between KeyboardInterrupt and SIGTERM.
+    except KeyboardInterrupt:
+        pass
+    finally:
+        # In most cases, here we receive either ctrl-c in foreground execution
+        # or SIGTERM on server exiting. Gracefully exit the worker process and
+        # the executor.
+        # TODO(aylei): worker may also be killed by system daemons like OOM
+        # killer; crash the API server or recreate the worker process to avoid
+        # broken state in such cases.
+        logger.info(f'[{worker}] Worker process interrupted')
+        executor_processes = list(executor._processes.values())  # pylint: disable=protected-access,line-too-long
+        # Shutdown the executor so that executor process can exit once the
+        # running task is finished or interrupted.
+        executor.shutdown(wait=False)
+        # Proactively interrupt the running task to avoid indefinite waiting.
+        subprocess_utils.run_in_parallel(
+            subprocess_utils.kill_process_with_grace_period,
+            executor_processes,
+            num_threads=len(executor_processes))
 
 
 def start(deploy: bool) -> List[multiprocessing.Process]:
sky/server/server.py CHANGED
@@ -1140,6 +1140,9 @@ if __name__ == '__main__':
             # The process may not be started yet, close it anyway.
             proc.close()
 
+        # Terminate processes in reverse order in case of dependency, notably
+        # the queue server. Terminating the queue server first does not affect
+        # the correctness of cleanup but introduces redundant error messages.
         subprocess_utils.run_in_parallel(cleanup,
-                                         sub_procs,
+                                         list(reversed(sub_procs)),
                                          num_threads=len(sub_procs))
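
Illustration of the ordering fix, assuming sub_procs was built with the queue server first and the API workers after (hypothetical names):

    sub_procs = [queue_server_proc, worker_a, worker_b]
    list(reversed(sub_procs))  # [worker_b, worker_a, queue_server_proc]
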
sky/utils/subprocess_utils.py CHANGED
@@ -1,4 +1,5 @@
 """Utility functions for subprocesses."""
+import multiprocessing
 from multiprocessing import pool
 import os
 import random
@@ -181,29 +182,6 @@ def kill_children_processes(parent_pids: Optional[Union[
     if isinstance(parent_pids, int):
         parent_pids = [parent_pids]
 
-    def kill(proc: psutil.Process):
-        if not proc.is_running():
-            # Skip if the process is not running.
-            return
-        logger.debug(f'Killing process {proc.pid}')
-        try:
-            if force:
-                proc.kill()
-            else:
-                proc.terminate()
-                proc.wait(timeout=10)
-        except psutil.NoSuchProcess:
-            # The child process may have already been terminated.
-            pass
-        except psutil.TimeoutExpired:
-            logger.debug(
-                f'Process {proc.pid} did not terminate after 10 seconds')
-            # Attempt to force kill if the normal termination fails
-            if not force:
-                logger.debug(f'Force killing process {proc.pid}')
-                proc.kill()
-                proc.wait(timeout=5)  # Shorter timeout after force kill
-
     parent_processes = []
     if parent_pids is None:
         parent_processes = [psutil.Process()]
@@ -218,10 +196,54 @@ def kill_children_processes(parent_pids: Optional[Union[
     for parent_process in parent_processes:
         child_processes = parent_process.children(recursive=True)
         if parent_pids is not None:
-            kill(parent_process)
+            kill_process_with_grace_period(parent_process, force=force)
         logger.debug(f'Killing child processes: {child_processes}')
         for child in child_processes:
-            kill(child)
+            kill_process_with_grace_period(child, force=force)
+
+
+def kill_process_with_grace_period(proc: Union[multiprocessing.Process,
+                                               psutil.Process],
+                                   force: bool = False,
+                                   grace_period: int = 10) -> None:
+    """Kill a process with SIGTERM and wait for it to exit.
+
+    Args:
+        proc: The process to kill, either a multiprocessing.Process or a
+            psutil.Process.
+        force: Whether to force kill the process.
+        grace_period: The grace period seconds to wait for the process to exit.
+    """
+    if isinstance(proc, psutil.Process):
+        alive = proc.is_running
+        wait = proc.wait
+    else:
+        alive = proc.is_alive
+        wait = proc.join
+    if not alive():
+        # Skip if the process is not running.
+        return
+    logger.debug(f'Killing process {proc.pid}')
+    try:
+        if force:
+            proc.kill()
+        else:
+            proc.terminate()
+        wait(timeout=grace_period)
+    except (psutil.NoSuchProcess, ValueError):
+        # The child process may have already been terminated.
+        return
+    except psutil.TimeoutExpired:
+        # Pass to finally to force kill the process.
+        pass
+    finally:
+        logger.debug(f'Process {proc.pid} did not terminate after '
+                     f'{grace_period} seconds')
+        # Attempt to force kill if the normal termination fails
+        if not force:
+            logger.debug(f'Force killing process {proc.pid}')
+            # Shorter timeout after force kill
+            kill_process_with_grace_period(proc, force=True, grace_period=5)
 
 
 def run_with_retries(
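
A hedged usage sketch of the new helper with a multiprocessing.Process; the 60-second sleep is just a stand-in for a worker:

    import multiprocessing
    import time

    proc = multiprocessing.Process(target=time.sleep, args=(60,))
    proc.start()
    # SIGTERM first; if the process survives the 10s grace period, the
    # helper recurses with force=True and a 5s timeout (SIGKILL path).
    kill_process_with_grace_period(proc)
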
skypilot_nightly-1.0.0.dev20250317.dist-info/METADATA → skypilot_nightly-1.0.0.dev20250319.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: skypilot-nightly
-Version: 1.0.0.dev20250317
+Version: 1.0.0.dev20250319
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
skypilot_nightly-1.0.0.dev20250317.dist-info/RECORD → skypilot_nightly-1.0.0.dev20250319.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=rrcorJA9XwEtr4hzoKv0Vw-COYQaUddD9wrZOVjTeIw,6428
+sky/__init__.py,sha256=3eIvmaqr9j7Q14zbXB6K1AYrtAYYBeSZaufG8cPHilk,6428
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=hCEqi77nprQEg3ktfRL51xiiw16zwZOmFEDB_Z7fWVU,22384
 sky/check.py,sha256=NDKx_Zm7YRxPjMv82wz3ESLnGIPljaACyqVdVNM0PzY,11258
@@ -7,10 +7,10 @@ sky/cloud_stores.py,sha256=kEHXd2divyra-1c3EusHxKyM5yTQlTXc6cKVXofsefA,23978
 sky/core.py,sha256=MU9hcTdh8baMGrr2ZXmbxx12vNlhajrkeyg5QtV717c,47609
 sky/dag.py,sha256=Yl7Ry26Vql5cv4YMz8g9kOUgtoCihJnw7c8NgZYakMY,3242
 sky/exceptions.py,sha256=cEZ5nm7RhTW22Npw-oYS5Wp9rtxoHxdPQHfkNa92wOo,16641
-sky/execution.py,sha256=0M4RTEzWn-B9oz221XdZOIGH12XOACmNq0j-WGUT_No,28023
+sky/execution.py,sha256=9L8NFOXNphtabnsL7mHGPJeGdw4n6gIIUEOzjW7CEHw,28294
 sky/global_user_state.py,sha256=sUDdSsJeiJkbgmZNwy8YGFK0XeNh-RBr1VDUvbmjf0g,33246
 sky/models.py,sha256=4xSW05BdDPEjW8Ubvj3VlVOVnzv0TbrolsFvR5R5v1U,638
-sky/optimizer.py,sha256=C82l9N3umdrJ2AaM-pSg0aK5rpOAX3lEAfFU7r6hqPo,60183
+sky/optimizer.py,sha256=7FeTo0Bk4M7OnXugv-YdCj50PTL2R7NVGHMsr7DWBJ0,60457
 sky/resources.py,sha256=f2Qo_Wt0kFruKmYm6cgYbICH_wn0Zkb8uIv6LA82SRs,72153
 sky/sky_logging.py,sha256=pID2RINjH62n7SZpv70DuN8BSFYdCfTJ2ScGQpVmugg,5725
 sky/skypilot_config.py,sha256=bt1vSis2aKKdQfPz80-KcjM9vNIg_qYKLNXur782Poo,8693
@@ -55,7 +55,7 @@ sky/clouds/do.py,sha256=hmksx0XML0dVHUZBMV2Wr3a5VilOsYfxX2dSBV_XK5o,11487
 sky/clouds/fluidstack.py,sha256=Eb0nlfU_EwTtGtV0nPKS2ueBlB0nYiDAN9swA-jjQV0,12446
 sky/clouds/gcp.py,sha256=cvFSeX8RcyhX5HJb57YposUr9p1RaUPmpxvg_AI_D3c,55978
 sky/clouds/ibm.py,sha256=R4JR96YfXstZ2B_IgFNVEX2SBAq3q0lSWz4y7FoFoeE,21474
-sky/clouds/kubernetes.py,sha256=xsYX8HhdcRzsdx6Gd_3kumNqjMjpo_l4cinhs3ZMwZM,35067
+sky/clouds/kubernetes.py,sha256=u8mRd75a0NS7-uHdGXk_cqqLc4Z2vU0CedwmLJpzmZ0,36081
 sky/clouds/lambda_cloud.py,sha256=ejqA_Wj5-325Y_QjQ__FY4HMO8sv_2tSRsufmaldcmI,12699
 sky/clouds/nebius.py,sha256=G3v73NZjLzGoCi0ZfHj6VkOt-fs1i6DDxCpNiE88BdA,12676
 sky/clouds/oci.py,sha256=irINbQsQ6YxRxGTMaCNsms3mZkIun2oJMMA1fMCRJyA,27072
@@ -165,7 +165,7 @@ sky/provision/kubernetes/constants.py,sha256=dZCUV8FOO9Gct80sdqeubKnxeW3CGl-u5mx
 sky/provision/kubernetes/instance.py,sha256=oag17OtuiqU-1RjkgW9NvEpxSGUFIYdI7M61S-YmPu8,50503
 sky/provision/kubernetes/network.py,sha256=AtcOM8wPs_-UlQJhGEQGP6Lh4HIgdx63Y0iWEhP5jyc,12673
 sky/provision/kubernetes/network_utils.py,sha256=Bwy5ZQb62ejC7ZHM4htjzhs86UNACK7AXN-NfQ9IJrE,11454
-sky/provision/kubernetes/utils.py,sha256=A2nzKUCFqmq5KveyagE5u4_p0b6frg6256lwvAlwPEA,110155
+sky/provision/kubernetes/utils.py,sha256=puwjlWM4EMExa1jO0cxluzg8ZSF-QX4rgZGksZdxKiQ,124015
 sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml,sha256=AMzYzlY0JIlfBWj5eX054Rc1XDW2thUcLSOGMJVhIdA,229
 sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml,sha256=RtTq4F1QUmR2Uunb6zuuRaPhV7hpesz4saHjn3Ncsb4,2010
 sky/provision/lambda_cloud/__init__.py,sha256=6EEvSgtUeEiup9ivIFevHmgv0GqleroO2X0K7TRa2nE,612
@@ -230,13 +230,13 @@ sky/serve/server/server.py,sha256=gQGVU9nHYdGbaLhGjIUNIYn4xwKjRASRJkiiTL5AI1Y,32
 sky/server/__init__.py,sha256=MPPBqFzXz6Jv5QSk6td_IcvnfXfNErDZVcizu4MLRow,27
 sky/server/common.py,sha256=PMPaKoPtoUGolbdSW78VetUW5H0X7YKBT-z6Hbu3BJM,18471
 sky/server/constants.py,sha256=_ZNrxYh8vmgbf3DmkGDduxjvO2y43ZSPTkH5rCNsVjU,770
-sky/server/server.py,sha256=kEjwRjA7PJDZzx6KqD_NAFxryVLkzwCnuPfbmY_p30A,44232
+sky/server/server.py,sha256=62IysoY5jCbGi99xIsYrINFIuRgo-cKKIR8fXsKMuW0,44472
 sky/server/stream_utils.py,sha256=4JMHgtoXPpCT8JwtqyUcDQ9IdZFir9om0JaCRr8rvbQ,5849
 sky/server/uvicorn.py,sha256=wajwPHJ3IEEP3GMNOCc0S81-1v2qT5F-ejUkLFVhUzk,2953
 sky/server/html/log.html,sha256=TSGZktua9Ysl_ysg3w60rjxAxhH61AJnsYDHdtqrjmI,6929
 sky/server/requests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/server/requests/event_loop.py,sha256=OhpPbuce65bbjpGRlcJa78AVnYSm08SzFKt70ypCUuQ,1211
-sky/server/requests/executor.py,sha256=SuSr-cVrRnMzf-1SEz6O8HpcLzGM3mrbNc8re7QduYk,20862
+sky/server/requests/executor.py,sha256=BNJqkTQ3swYeRO5YVW-dTmobL2CYnDDf_m-kY7__n40,21684
 sky/server/requests/payloads.py,sha256=nVb7vr1SNAq6ay2dNe9301zLHp7NrM79M7nsWAECBms,16340
 sky/server/requests/preconditions.py,sha256=ipxIb_3JXG6S3-ymcOdqQNb7VDvoPqADxu9ZK7-nQWc,7179
 sky/server/requests/requests.py,sha256=Sys2rg22rIXn7SrHfKzDVuTjBdRlm5oZk58u1UmS6JA,21231
@@ -328,7 +328,7 @@ sky/utils/resources_utils.py,sha256=URp6OS9B9nc9tIB5ibZCgGK4XSABmI4kRG0wOM6qgvs,
 sky/utils/rich_utils.py,sha256=3xdDzmn-TQXAE83EevAtOf9N4aak3Bl4ZeD33xIxjOo,11931
 sky/utils/schemas.py,sha256=KJCHrn1nMZ3XqzddWuu_nFQoRQw01cZh9qh19OrRtps,30145
 sky/utils/status_lib.py,sha256=zn_MSuRYQdNKF8pnFOGQ54X_s_R7dyqWS6Q3a9zENw8,1512
-sky/utils/subprocess_utils.py,sha256=Q42CyjDNICXze2WCGuGxgpEjtjlka43_2ihRqKhSnQw,14916
+sky/utils/subprocess_utils.py,sha256=Ee4WajTJ6YLAjC8CgN5l1K7m6hsnpGqDa26MXkDifvw,15776
 sky/utils/timeline.py,sha256=ob6s3bc7nwAuSI76yLKBrSR5bzOHnOhbozz1avwoet4,4070
 sky/utils/ux_utils.py,sha256=ngcOCg1K44p-SOk6XfwxJGXwjoP__PRvNuEzj7t05Yc,10185
 sky/utils/validator.py,sha256=cAFERCoC7jH0DFKepcU4x9SYmdrYL1iVmW9tXA18hvo,701
@@ -347,9 +347,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488
 sky/utils/kubernetes/kubernetes_deploy_utils.py,sha256=otzHzpliHDCpzYT-nU9Q0ZExbiFpDPWvhxwkvchZj7k,10073
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.dev20250317.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
-skypilot_nightly-1.0.0.dev20250317.dist-info/METADATA,sha256=QQAbZSEDZeyfbiMEHyn0Fvvb-dGb4B6lFHhJJTFe510,17919
-skypilot_nightly-1.0.0.dev20250317.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
-skypilot_nightly-1.0.0.dev20250317.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
-skypilot_nightly-1.0.0.dev20250317.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
-skypilot_nightly-1.0.0.dev20250317.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20250319.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250319.dist-info/METADATA,sha256=Iys5Rb5saDPHcYoCslzL2WR1YyxDv2fSA-knwQQb6jc,17919
+skypilot_nightly-1.0.0.dev20250319.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+skypilot_nightly-1.0.0.dev20250319.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250319.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250319.dist-info/RECORD,,
skypilot_nightly-1.0.0.dev20250317.dist-info/WHEEL → skypilot_nightly-1.0.0.dev20250319.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (76.0.0)
+Generator: setuptools (76.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any