skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +6 -2
- sky/backends/cloud_vm_ray_backend.py +13 -4
- sky/client/cli/command.py +22 -8
- sky/client/sdk.py +50 -0
- sky/clouds/kubernetes.py +2 -6
- sky/clouds/nebius.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +13 -10
- sky/global_user_state.py +128 -1
- sky/jobs/constants.py +1 -1
- sky/jobs/scheduler.py +14 -21
- sky/jobs/server/core.py +64 -10
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +1 -3
- sky/jobs/utils.py +159 -8
- sky/provision/aws/config.py +19 -3
- sky/provision/aws/instance.py +2 -1
- sky/provision/nebius/utils.py +101 -86
- sky/provision/provisioner.py +13 -8
- sky/resources.py +5 -5
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/serve/replica_managers.py +123 -101
- sky/serve/serve_state.py +32 -0
- sky/serve/serve_utils.py +37 -16
- sky/serve/service.py +51 -17
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +44 -2
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/common_utils.py +20 -0
- sky/utils/controller_utils.py +17 -4
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +36 -5
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +99 -98
- sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
- /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/schemas/db/global_user_state/006_provision_log.py
ADDED
@@ -0,0 +1,41 @@
+"""Add provision_log_path to clusters and cluster_history.
+
+Revision ID: 006
+Revises: 005
+Create Date: 2025-08-12
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '006'
+down_revision: Union[str, Sequence[str], None] = '005'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add provision_log_path columns."""
+    with op.get_context().autocommit_block():
+        # clusters.provision_log_path
+        db_utils.add_column_to_table_alembic('clusters',
+                                             'provision_log_path',
+                                             sa.Text(),
+                                             server_default=None)
+
+        # cluster_history.provision_log_path
+        db_utils.add_column_to_table_alembic('cluster_history',
+                                             'provision_log_path',
+                                             sa.Text(),
+                                             server_default=None)
+
+
+def downgrade():
+    """No-op for backward compatibility."""
+    pass
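Note: for readers unfamiliar with SkyPilot's db_utils.add_column_to_table_alembic helper, a rough plain-Alembic equivalent of the upgrade above is sketched below. This is an assumption-laden illustration, not the released code; the real helper may add existence checks that are omitted here.

# Hypothetical plain-Alembic sketch of the same migration; the released
# package uses db_utils.add_column_to_table_alembic instead.
from alembic import op
import sqlalchemy as sa


def upgrade():
    for table in ('clusters', 'cluster_history'):
        op.add_column(
            table,
            sa.Column('provision_log_path', sa.Text(), server_default=None))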
sky/serve/replica_managers.py
CHANGED
@@ -1,7 +1,5 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
-import collections
 import dataclasses
-import enum
 import functools
 import multiprocessing
 from multiprocessing import pool as mp_pool
@@ -199,6 +197,12 @@ def _should_use_spot(service_task_yaml_path: str,
     return len(spot_use_resources) == len(task.resources)


+# Every function that calls serve_state.add_or_update_replica should acquire
+# this lock. It is to prevent race condition when the replica status is updated
+# by multiple threads at the same time. The modification of replica info is
+# 2 database calls: read the whole replica info object, unpickle it, and modify
+# corresponding fields. Then it is write back to the database. We need to ensure
+# the read-modify-write operation is atomic.
 def with_lock(func):

     @functools.wraps(func)
@@ -209,22 +213,6 @@ def with_lock(func):
     return wrapper


-class ProcessStatus(enum.Enum):
-    """Process status."""
-
-    # The process is running
-    RUNNING = 'RUNNING'
-
-    # The process is finished and succeeded
-    SUCCEEDED = 'SUCCEEDED'
-
-    # The process is interrupted
-    INTERRUPTED = 'INTERRUPTED'
-
-    # The process failed
-    FAILED = 'FAILED'
-
-
 @dataclasses.dataclass
 class ReplicaStatusProperty:
     """Some properties that determine replica status.
@@ -236,15 +224,16 @@ class ReplicaStatusProperty:
         first_ready_time: The first time the service is ready.
         sky_down_status: Process status of sky.down.
     """
-    #
-    sky_launch_status:
+    # sky.launch will always be scheduled on creation of ReplicaStatusProperty.
+    sky_launch_status: common_utils.ProcessStatus = (
+        common_utils.ProcessStatus.SCHEDULED)
     user_app_failed: bool = False
     service_ready_now: bool = False
     # None means readiness probe is not succeeded yet;
     # -1 means the initial delay seconds is exceeded.
     first_ready_time: Optional[float] = None
     # None means sky.down is not called yet.
-    sky_down_status: Optional[ProcessStatus] = None
+    sky_down_status: Optional[common_utils.ProcessStatus] = None
     # Whether the termination is caused by autoscaler's decision
     is_scale_down: bool = False
     # The replica's spot instance was preempted.
@@ -299,7 +288,7 @@ class ReplicaStatusProperty:
        (1) Job status;
        (2) Readiness probe.
        """
-        if self.sky_launch_status != ProcessStatus.SUCCEEDED:
+        if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
            return False
        if self.sky_down_status is not None:
            return False
@@ -313,37 +302,43 @@ class ReplicaStatusProperty:

    def to_replica_status(self) -> serve_state.ReplicaStatus:
        """Convert status property to human-readable replica status."""
-
+        # Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
+        # we use None to represent sky.launch is not called yet.
+        if (self.sky_launch_status is None or
+                self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
            # Pending to launch
            return serve_state.ReplicaStatus.PENDING
-        if self.sky_launch_status == ProcessStatus.RUNNING:
-            if self.sky_down_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                return serve_state.ReplicaStatus.FAILED_CLEANUP
-            if self.sky_down_status == ProcessStatus.SUCCEEDED:
+            if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
                # This indicate it is a scale_down with correct teardown.
                # Should have been cleaned from the replica table.
                return serve_state.ReplicaStatus.UNKNOWN
            # Still launching
            return serve_state.ReplicaStatus.PROVISIONING
-        if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+        if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
            # sky.down is running and a scale down interrupted sky.launch
            return serve_state.ReplicaStatus.SHUTTING_DOWN
        if self.sky_down_status is not None:
            if self.preempted:
                # Replica (spot) is preempted
                return serve_state.ReplicaStatus.PREEMPTED
-            if self.sky_down_status == ProcessStatus.
+            if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
+                # sky.down is scheduled to run, but not started yet.
+                return serve_state.ReplicaStatus.SHUTTING_DOWN
+            if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
                # sky.down is running
                return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+            if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
                return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_down_status == ProcessStatus.FAILED:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                # sky.down failed
                return serve_state.ReplicaStatus.FAILED_CLEANUP
        if self.user_app_failed:
            # Failed on user setup/run
            return serve_state.ReplicaStatus.FAILED
-        if self.sky_launch_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
            # sky.launch failed
            return serve_state.ReplicaStatus.FAILED_PROVISION
        if self.first_ready_time is None:
@@ -359,7 +354,7 @@ class ReplicaStatusProperty:
            # This indicate it is a scale_down with correct teardown.
            # Should have been cleaned from the replica table.
            return serve_state.ReplicaStatus.UNKNOWN
-        if self.sky_launch_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
            # sky.launch failed
            # The down process has not been started if it reaches here,
            # due to the `if self.sky_down_status is not None`` check above.
@@ -703,6 +698,7 @@ class SkyPilotReplicaManager(ReplicaManager):

        self._recover_replica_operations()

+    @with_lock
    def _recover_replica_operations(self):
        """Let's see are there something to do for ReplicaManager in a
        recovery run"""
@@ -743,9 +739,8 @@ class SkyPilotReplicaManager(ReplicaManager):
    # Replica management functions #
    ################################

-    #
-    #
-    @with_lock
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
    def _launch_replica(
        self,
        replica_id: int,
@@ -801,11 +796,61 @@ class SkyPilotReplicaManager(ReplicaManager):
        # to avoid too many sky.launch running at the same time.
        self._launch_process_pool[replica_id] = p

+    @with_lock
    def scale_up(self,
                 resources_override: Optional[Dict[str, Any]] = None) -> None:
        self._launch_replica(self._next_replica_id, resources_override)
        self._next_replica_id += 1

+    def _handle_sky_down_finish(self, info: ReplicaInfo, exitcode: int) -> None:
+        if exitcode != 0:
+            logger.error(f'Down process for replica {info.replica_id} '
+                         f'exited abnormally with code {exitcode}.')
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.FAILED)
+        else:
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.SUCCEEDED)
+        # Failed replica still count as a replica. In our current design, we
+        # want to fail early if user code have any error. This will prevent
+        # infinite loop of teardown and re-provision. However, there is a
+        # special case that if the replica is UP for longer than
+        # initial_delay_seconds, we assume it is just some random failure and
+        # we should restart the replica. Please refer to the implementation of
+        # `is_scale_down_succeeded` for more details.
+        # TODO(tian): Currently, restart replicas that failed within
+        # initial_delay_seconds is not supported. We should add it
+        # later when we support `sky serve update`.
+        removal_reason = None
+        if info.status_property.is_scale_down:
+            # This means the cluster is deleted due to an autoscaler
+            # decision or the cluster is recovering from preemption.
+            # Delete the replica info so it won't count as a replica.
+            if info.status_property.preempted:
+                removal_reason = 'for preemption recovery'
+            else:
+                removal_reason = 'normally'
+        # Don't keep failed record for version mismatch replicas,
+        # since user should fixed the error before update.
+        elif info.version != self.latest_version:
+            removal_reason = 'for version outdated'
+        elif info.status_property.purged:
+            removal_reason = 'for purge'
+        elif info.status_property.failed_spot_availability:
+            removal_reason = 'for spot availability failure'
+        else:
+            logger.info(f'Termination of replica {info.replica_id} '
+                        'finished. Replica info is kept since some '
+                        'failure detected.')
+        serve_state.add_or_update_replica(self._service_name,
+                                          info.replica_id, info)
+        if removal_reason is not None:
+            serve_state.remove_replica(self._service_name, info.replica_id)
+            logger.info(f'Replica {info.replica_id} removed from the '
+                        f'replica table {removal_reason}.')
+
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
    def _terminate_replica(self,
                           replica_id: int,
                           sync_down_logs: bool,
@@ -823,7 +868,8 @@ class SkyPilotReplicaManager(ReplicaManager):
            info = serve_state.get_replica_info_from_id(self._service_name,
                                                        replica_id)
            assert info is not None
-            info.status_property.sky_launch_status =
+            info.status_property.sky_launch_status = (
+                common_utils.ProcessStatus.INTERRUPTED)
            serve_state.add_or_update_replica(self._service_name, replica_id,
                                              info)
            launch_process = self._launch_process_pool[replica_id]
@@ -895,18 +941,30 @@ class SkyPilotReplicaManager(ReplicaManager):

        logger.info(f'preempted: {info.status_property.preempted}, '
                    f'replica_id: {replica_id}')
+        info.status_property.is_scale_down = is_scale_down
+        info.status_property.purged = purge
+
+        # If the cluster does not exist, it means either the cluster never
+        # exists (e.g., the cluster is scaled down before it gets a chance to
+        # provision) or the cluster is preempted and cleaned up by the status
+        # refresh. In this case, we skip spawning a new down process to save
+        # controller resources.
+        if global_user_state.get_cluster_from_name(info.cluster_name) is None:
+            self._handle_sky_down_finish(info, exitcode=0)
+            return
+
+        # Otherwise, start the process to terminate the cluster.
        p = multiprocessing.Process(
            target=ux_utils.RedirectOutputForProcess(terminate_cluster,
                                                     log_file_name, 'a').run,
            args=(info.cluster_name, replica_drain_delay_seconds),
        )
-        info.status_property.sky_down_status =
-
-        info.status_property.purged = purge
+        info.status_property.sky_down_status = (
+            common_utils.ProcessStatus.SCHEDULED)
        serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        p.start()
        self._down_process_pool[replica_id] = p

+    @with_lock
    def scale_down(self, replica_id: int, purge: bool = False) -> None:
        self._terminate_replica(
            replica_id,
@@ -915,6 +973,8 @@ class SkyPilotReplicaManager(ReplicaManager):
            is_scale_down=True,
            purge=purge)

+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
    def _handle_preemption(self, info: ReplicaInfo) -> bool:
        """Handle preemption of the replica if any error happened.

@@ -990,7 +1050,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                if controller_utils.can_provision():
                    p.start()
                    info.status_property.sky_launch_status = (
-                        ProcessStatus.RUNNING)
+                        common_utils.ProcessStatus.RUNNING)
            else:
                # sky.launch finished
                # TODO(tian): Try-catch in process, and have an enum return
@@ -1007,11 +1067,11 @@ class SkyPilotReplicaManager(ReplicaManager):
                        f'exited abnormally with code {p.exitcode}.'
                        ' Terminating...')
                    info.status_property.sky_launch_status = (
-                        ProcessStatus.FAILED)
+                        common_utils.ProcessStatus.FAILED)
                    error_in_sky_launch = True
                else:
                    info.status_property.sky_launch_status = (
-                        ProcessStatus.SUCCEEDED)
+                        common_utils.ProcessStatus.SUCCEEDED)
                    schedule_next_jobs = True
                    if self._spot_placer is not None and info.is_spot:
                        # TODO(tian): Currently, we set the location to
@@ -1033,8 +1093,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                serve_state.add_or_update_replica(self._service_name,
                                                  replica_id, info)
                if schedule_next_jobs and self._is_pool:
-                    jobs_scheduler.maybe_schedule_next_jobs(
-                        pool=self._service_name)
+                    jobs_scheduler.maybe_schedule_next_jobs()
                if error_in_sky_launch:
                    # Teardown after update replica info since
                    # _terminate_replica will update the replica info too.
@@ -1045,59 +1104,25 @@ class SkyPilotReplicaManager(ReplicaManager):
            jobs_scheduler.maybe_schedule_next_jobs()
        down_process_pool_snapshot = list(self._down_process_pool.items())
        for replica_id, p in down_process_pool_snapshot:
-            if
-
-
-
-
-
-
-
-
-
-                info.status_property.sky_down_status = (
-                    ProcessStatus.FAILED)
-            else:
+            if p.is_alive():
+                continue
+            info = serve_state.get_replica_info_from_id(self._service_name,
+                                                        replica_id)
+            assert info is not None, replica_id
+            if (info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                # sky.down not started yet
+                if controller_utils.can_terminate():
+                    p.start()
                    info.status_property.sky_down_status = (
-                        ProcessStatus.
-                # Failed replica still count as a replica. In our current
-                # design, we want to fail early if user code have any error.
-                # This will prevent infinite loop of teardown and
-                # re-provision. However, there is a special case that if the
-                # replica is UP for longer than initial_delay_seconds, we
-                # assume it is just some random failure and we should restart
-                # the replica. Please refer to the implementation of
-                # `is_scale_down_succeeded` for more details.
-                # TODO(tian): Currently, restart replicas that failed within
-                # initial_delay_seconds is not supported. We should add it
-                # later when we support `sky serve update`.
-                removal_reason = None
-                if info.status_property.is_scale_down:
-                    # This means the cluster is deleted due to an autoscaler
-                    # decision or the cluster is recovering from preemption.
-                    # Delete the replica info so it won't count as a replica.
-                    if info.status_property.preempted:
-                        removal_reason = 'for preemption recovery'
-                    else:
-                        removal_reason = 'normally'
-                # Don't keep failed record for version mismatch replicas,
-                # since user should fixed the error before update.
-                elif info.version != self.latest_version:
-                    removal_reason = 'for version outdated'
-                elif info.status_property.purged:
-                    removal_reason = 'for purge'
-                elif info.status_property.failed_spot_availability:
-                    removal_reason = 'for spot availability failure'
-                else:
-                    logger.info(f'Termination of replica {replica_id} '
-                                'finished. Replica info is kept since some '
-                                'failure detected.')
+                        common_utils.ProcessStatus.RUNNING)
                serve_state.add_or_update_replica(self._service_name,
                                                  replica_id, info)
-
-
-
-
+            else:
+                logger.info(
+                    f'Terminate process for replica {replica_id} finished.')
+                del self._down_process_pool[replica_id]
+                self._handle_sky_down_finish(info, exitcode=p.exitcode)

        # Clean old version
        replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1393,12 +1418,9 @@ class SkyPilotReplicaManager(ReplicaManager):
            old_config_any_of = old_config.get('resources',
                                               {}).pop('any_of', [])

-
-
-
-
-            if (normalize_dict_list(old_config_any_of) !=
-                    normalize_dict_list(new_config_any_of)):
+            if (resources_utils.normalize_any_of_resources_config(
+                    old_config_any_of) != resources_utils.
+                    normalize_any_of_resources_config(new_config_any_of)):
                logger.info('Replica config changed (any_of), skipping. '
                            f'old: {old_config_any_of}, '
                            f'new: {new_config_any_of}')
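Note: the comment added above with_lock states the invariant this diff enforces: any read-unpickle-modify-write of a replica record must happen under a single lock. The standalone sketch below only illustrates that pattern; the storage dict and update_status are hypothetical stand-ins, not SkyPilot's actual implementation.

# Illustrative sketch of the read-modify-write locking described above.
# '_db' and 'update_status' are hypothetical, not SkyPilot APIs.
import functools
import pickle
import threading

_replica_lock = threading.Lock()
_db = {1: pickle.dumps({'status': 'PENDING'})}  # replica_id -> pickled info


def with_lock(func):

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        with _replica_lock:
            return func(*args, **kwargs)

    return wrapper


@with_lock
def update_status(replica_id: int, new_status: str) -> None:
    # Read the whole pickled object, change one field, write it back.
    # Without the lock, two concurrent updates could drop each other's write.
    info = pickle.loads(_db[replica_id])
    info['status'] = new_status
    _db[replica_id] = pickle.dumps(info)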
sky/serve/serve_state.py
CHANGED
@@ -670,6 +670,38 @@ def total_number_provisioning_replicas() -> int:
    return provisioning_count


+@init_db
+def total_number_terminating_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.RUNNING):
+                terminating_count += 1
+        return terminating_count
+
+
+@init_db
+def total_number_scheduled_to_terminate_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                terminating_count += 1
+        return terminating_count
+
+
 def get_replicas_at_status(
        service_name: str,
        status: ReplicaStatus,
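Note: both new counters follow the same scan-and-unpickle pattern and differ only in the ProcessStatus they compare against. A hypothetical shared helper (not part of the package; replicas_table is assumed to be the module-level table defined in serve_state.py) would make the pattern explicit:

# Hypothetical refactor of the two counters above into one parameterized scan.
import pickle

import sqlalchemy


def _count_replicas_with_down_status(session, target_status) -> int:
    rows = session.execute(
        sqlalchemy.select(replicas_table.c.replica_info)).fetchall()
    count = 0
    for row in rows:
        replica_info = pickle.loads(row[0])
        if replica_info.status_property.sky_down_status == target_status:
            count += 1
    return count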
sky/serve/serve_utils.py
CHANGED
@@ -63,7 +63,10 @@ _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 # when changing UX as this assumption is used to expand some log files while
 # ignoring others.
 _SKYPILOT_LOG_HINT = r'.*sky api logs -l'
-
+_SKYPILOT_PROVISION_API_LOG_PATTERN = (
+    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+# New hint pattern for provision logs
+_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
 _SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'

 # TODO(tian): Find all existing replica id and print here.
@@ -1114,31 +1117,49 @@ def _process_line(line: str,
            return False
        return cluster_record['status'] == status_lib.ClusterStatus.UP

-
+    provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
+                                        line)
+    provision_log_cmd_prompt = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN,
+                                        line)
    log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)

-
-        log_path = provision_log_prompt.group(1)
-        nested_log_path = pathlib.Path(
-            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
-                log_path).resolve()
-
+    def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
        try:
-            with open(
-                #
-                # to avoid any internal bug that causes the launch to fail
-                # while cluster status remains INIT.
+            with open(p, 'r', newline='', encoding='utf-8') as f:
+                # Exit if >10s without new content to avoid hanging when INIT
                yield from log_utils.follow_logs(f,
                                                 should_stop=cluster_is_up,
                                                 stop_on_eof=stop_on_eof,
                                                 idle_timeout_seconds=10)
        except FileNotFoundError:
+            # Fall back cleanly if the hinted path doesn't exist
            yield line
-
            yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                   f'Try to expand log file {
-                   f'
-
+                   f'Try to expand log file {p} but not found. Skipping...'
+                   f'{colorama.Style.RESET_ALL}')
+            return
+
+    if provision_api_log_prompt is not None:
+        rel_path = provision_api_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                rel_path).resolve()
+        yield from _stream_provision_path(nested_log_path)
+        return
+
+    if provision_log_cmd_prompt is not None:
+        # Resolve provision log via cluster table first, then history.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            yield line
+            return
+        yield from _stream_provision_path(
+            pathlib.Path(log_path_str).expanduser().resolve())
        return

    if log_prompt is not None:
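Note: a quick illustration of what the two new hint patterns match. The regexes are copied from the diff above; the log lines and cluster name are invented for the example.

# Small demo of the two hint regexes added above; the input lines are made up.
import re

_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
_SKYPILOT_PROVISION_API_LOG_PATTERN = (
    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'

api_line = 'To see details: sky api logs -l sky-2025-08-15/provision.log'
cmd_line = 'To see provision logs: sky logs --provision my-cluster'

m = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN, api_line)
print(m.group(1))  # sky-2025-08-15/provision.log
m = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN, cmd_line)
print(m.group(1))  # my-cluster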
sky/serve/service.py
CHANGED
@@ -113,6 +113,9 @@ def cleanup_storage(task_yaml: str) -> bool:
    return not failed


+# NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
+# because we killed all the processes (controller & replica manager) before
+# calling this function.
 def _cleanup(service_name: str) -> bool:
    """Clean up all service related resources, i.e. replicas and storage."""
    # Cleanup the HA recovery script first as it is possible that some error
@@ -135,28 +138,59 @@ def _cleanup(service_name: str) -> bool:
            continue
        p = multiprocessing.Process(target=replica_managers.terminate_cluster,
                                    args=(info.cluster_name,))
-        p.start()
        info2proc[info] = p
        # Set replica status to `SHUTTING_DOWN`
        info.status_property.sky_launch_status = (
-            replica_managers.ProcessStatus.SUCCEEDED)
+            replica_managers.common_utils.ProcessStatus.SUCCEEDED)
        info.status_property.sky_down_status = (
-            replica_managers.ProcessStatus.
+            replica_managers.common_utils.ProcessStatus.SCHEDULED)
        serve_state.add_or_update_replica(service_name, info.replica_id, info)
-        logger.info(f'
-
-
-
-
-
-
-
-
-
-
-
-
-
+        logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
+
+    def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
+        nonlocal failed
+        # Set replica status to `FAILED_CLEANUP`
+        info.status_property.sky_down_status = (
+            replica_managers.common_utils.ProcessStatus.FAILED)
+        serve_state.add_or_update_replica(service_name, info.replica_id, info)
+        failed = True
+        logger.error(f'Replica {info.replica_id} failed to terminate.')
+
+    # Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
+    # TODO(tian): Refactor to use the same logic and code.
+    while info2proc:
+        snapshot = list(info2proc.items())
+        for info, p in snapshot:
+            if p.is_alive():
+                continue
+            if (info.status_property.sky_down_status ==
+                    replica_managers.common_utils.ProcessStatus.SCHEDULED):
+                if controller_utils.can_terminate():
+                    try:
+                        p.start()
+                    except Exception as e:  # pylint: disable=broad-except
+                        _set_to_failed_cleanup(info)
+                        logger.error(f'Failed to start process for replica '
+                                     f'{info.replica_id}: {e}')
+                        del info2proc[info]
+                    else:
+                        info.status_property.sky_down_status = (
+                            common_utils.ProcessStatus.RUNNING)
+                        serve_state.add_or_update_replica(
+                            service_name, info.replica_id, info)
+            else:
+                logger.info('Terminate process for replica '
+                            f'{info.replica_id} finished.')
+                p.join()
+                del info2proc[info]
+                if p.exitcode == 0:
+                    serve_state.remove_replica(service_name, info.replica_id)
+                    logger.info(
+                        f'Replica {info.replica_id} terminated successfully.')
+                else:
+                    _set_to_failed_cleanup(info)
+        time.sleep(3)
+
    versions = serve_state.get_service_versions(service_name)
    serve_state.remove_service_versions(service_name)

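Note: the new cleanup loop is essentially a small polling state machine over worker processes: start a process only when capacity allows, then reap it once it exits. The stripped-down sketch below only shows that shape; can_start() and run_all() are placeholders, not SkyPilot APIs (the real code uses controller_utils.can_terminate and per-replica state updates).

# Minimal sketch of the scheduled -> running -> finished polling loop above.
import multiprocessing
import time

MAX_CONCURRENT = 4


def can_start(running):
    # Stand-in for a capacity check such as controller_utils.can_terminate().
    return len(running) < MAX_CONCURRENT


def run_all(targets):
    scheduled = [multiprocessing.Process(target=t) for t in targets]
    running = []
    while scheduled or running:
        # Reap finished processes first.
        for p in list(running):
            if not p.is_alive():
                p.join()
                running.remove(p)
        # Start more work only while there is capacity.
        while scheduled and can_start(running):
            p = scheduled.pop()
            p.start()
            running.append(p)
        time.sleep(3)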
sky/server/constants.py
CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 17

 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/requests/payloads.py
CHANGED
@@ -497,6 +497,12 @@ class JobsQueueBody(RequestBody):
    skip_finished: bool = False
    all_users: bool = False
    job_ids: Optional[List[int]] = None
+    user_match: Optional[str] = None
+    workspace_match: Optional[str] = None
+    name_match: Optional[str] = None
+    pool_match: Optional[str] = None
+    page: Optional[int] = None
+    limit: Optional[int] = None


 class JobsCancelBody(RequestBody):